acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (84)
  1. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2629 -2543
  2. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
  3. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +46 -6
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  11. datahub/ingestion/source/common/subtypes.py +3 -0
  12. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  13. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  14. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  15. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  16. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  17. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  18. datahub/ingestion/source/excel/__init__.py +0 -0
  19. datahub/ingestion/source/excel/config.py +92 -0
  20. datahub/ingestion/source/excel/excel_file.py +539 -0
  21. datahub/ingestion/source/excel/profiling.py +308 -0
  22. datahub/ingestion/source/excel/report.py +49 -0
  23. datahub/ingestion/source/excel/source.py +662 -0
  24. datahub/ingestion/source/excel/util.py +18 -0
  25. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  26. datahub/ingestion/source/openapi.py +1 -1
  27. datahub/ingestion/source/powerbi/config.py +33 -0
  28. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  29. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  30. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  31. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  32. datahub/ingestion/source/redshift/config.py +9 -6
  33. datahub/ingestion/source/redshift/lineage.py +386 -687
  34. datahub/ingestion/source/redshift/redshift.py +19 -106
  35. datahub/ingestion/source/s3/source.py +65 -59
  36. datahub/ingestion/source/snowflake/constants.py +2 -0
  37. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  38. datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
  39. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  40. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  41. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  42. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
  43. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  44. datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
  46. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  47. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  48. datahub/ingestion/source/sql/mssql/source.py +62 -3
  49. datahub/ingestion/source/sql_queries.py +24 -2
  50. datahub/ingestion/source/state/checkpoint.py +3 -28
  51. datahub/ingestion/source/unity/config.py +74 -9
  52. datahub/ingestion/source/unity/proxy.py +167 -5
  53. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  54. datahub/ingestion/source/unity/proxy_types.py +24 -0
  55. datahub/ingestion/source/unity/report.py +5 -0
  56. datahub/ingestion/source/unity/source.py +111 -1
  57. datahub/ingestion/source/usage/usage_common.py +1 -0
  58. datahub/metadata/_internal_schema_classes.py +573 -517
  59. datahub/metadata/_urns/urn_defs.py +1748 -1748
  60. datahub/metadata/schema.avsc +18564 -18484
  61. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  62. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  63. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  64. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  65. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  66. datahub/metadata/schemas/Ownership.avsc +69 -0
  67. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  68. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  69. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  70. datahub/metadata/schemas/__init__.py +3 -3
  71. datahub/sdk/chart.py +36 -22
  72. datahub/sdk/dashboard.py +38 -62
  73. datahub/sdk/lineage_client.py +6 -26
  74. datahub/sdk/main_client.py +7 -3
  75. datahub/sdk/search_filters.py +16 -0
  76. datahub/specific/aspect_helpers/siblings.py +73 -0
  77. datahub/specific/dataset.py +2 -0
  78. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  79. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  80. datahub/upgrade/upgrade.py +14 -2
  81. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  82. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
  83. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
  84. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
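
Entries 18-24 above add a new Excel ingestion source (datahub/ingestion/source/excel/...), and the one-line entry_points.txt change suggests it is registered as a new source plugin. As a minimal sketch of how such a source would be wired up through the existing programmatic Pipeline API — assuming the plugin is registered under the name "excel"; the config keys shown are hypothetical and not taken from this diff:

# Sketch only: the "excel" source name and its config keys are assumptions,
# not confirmed by this diff. Pipeline.create/run are existing DataHub APIs.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "excel",  # assumed plugin name from the new entry point
            "config": {
                # hypothetical key: where to find .xlsx files to ingest
                "path_list": ["s3://example-bucket/reports/*.xlsx"],
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
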
datahub/upgrade/upgrade.py
@@ -45,6 +45,7 @@ class ServerVersionStats(BaseModel):
     latest: Optional[VersionStats] = None
     current_server_type: Optional[str] = None
     current_server_default_cli_version: Optional[VersionStats] = None
+    is_cloud_server: Optional[bool] = None


 class ClientVersionStats(BaseModel):
@@ -145,7 +146,9 @@ async def get_server_config(gms_url: str, token: Optional[str]) -> RestServiceCo

 async def get_server_version_stats(
     server: Optional[DataHubGraph] = None,
-) -> Tuple[Optional[str], Optional[Version], Optional[str], Optional[datetime]]:
+) -> Tuple[
+    Optional[str], Optional[Version], Optional[str], Optional[datetime], Optional[bool]
+]:
     import aiohttp

     server_config: Optional[RestServiceConfig] = None
@@ -167,11 +170,13 @@ async def get_server_version_stats(
     server_version: Optional[Version] = None
     current_server_default_cli_version = None
     current_server_release_date = None
+    is_cloud_server: Optional[bool] = None
     if server_config:
         server_version_string = server_config.service_version
         commit_hash = server_config.commit_hash
         server_type = server_config.server_type
         current_server_default_cli_version = server_config.default_cli_version
+        is_cloud_server = server_config.is_datahub_cloud
         if server_type == "quickstart" and commit_hash:
             async with aiohttp.ClientSession(
                 headers={"Accept": "application/vnd.github.v3+json"}
@@ -191,6 +196,7 @@ async def get_server_version_stats(
         server_version,
         current_server_default_cli_version,
         current_server_release_date,
+        is_cloud_server,
     )


@@ -236,6 +242,7 @@ async def _retrieve_version_stats(
         current_server_version,
         current_server_default_cli_version,
         current_server_release_date,
+        is_cloud_server,
     ) = results[2]

     server_version_stats = None
@@ -255,6 +262,7 @@ async def _retrieve_version_stats(
             else None
         ),
         current_server_type=current_server_type,
+        is_cloud_server=is_cloud_server,
     )

     if client_version_stats and server_version_stats:
@@ -353,7 +361,11 @@ def _maybe_print_upgrade_message(
         else None
     )
     client_server_compat = 0
-    if version_stats.server.current_server_type != "cloud":
+    # Skip version compatibility checks for cloud servers (serverEnv="cloud")
+    # Cloud servers use different versioning schemes between server and CLI
+    is_cloud = version_stats.server.is_cloud_server
+
+    if not is_cloud:
         client_server_compat = is_client_server_compatible(
             version_stats.client.current, version_stats.server.current
         )
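
The gate above now relies on server_config.is_datahub_cloud rather than string-matching the server type. As a minimal sketch of what such a property could look like — assuming the GMS /config payload exposes a serverEnv field, as the diff comment indicates; the wrapper class and raw-payload attribute below are hypothetical, only the serverEnv == "cloud" condition comes from the diff:

# Hypothetical sketch: how a RestServiceConfig-style wrapper might derive
# a "cloud" flag from the GMS /config payload's serverEnv field.
from typing import Any, Dict


class ServiceConfigSketch:
    def __init__(self, raw: Dict[str, Any]) -> None:
        self.raw = raw  # parsed JSON from the /config endpoint

    @property
    def is_datahub_cloud(self) -> bool:
        # serverEnv == "cloud" is the condition referenced in the diff comment above.
        return self.raw.get("serverEnv") == "cloud"

With a flag like this, the CLI's upgrade hint skips is_client_server_compatible entirely for DataHub Cloud deployments, matching the gated block above.
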
datahub/ingestion/source/redshift/lineage_v2.py (removed)
@@ -1,466 +0,0 @@
-import collections
-import logging
-from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
-
-import redshift_connector
-
-from datahub.emitter import mce_builder
-from datahub.ingestion.api.closeable import Closeable
-from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig
-from datahub.ingestion.source.redshift.lineage import (
-    LineageCollectorType,
-    RedshiftLineageExtractor,
-)
-from datahub.ingestion.source.redshift.query import (
-    RedshiftCommonQuery,
-    RedshiftProvisionedQuery,
-    RedshiftServerlessQuery,
-)
-from datahub.ingestion.source.redshift.redshift_schema import (
-    LineageRow,
-    RedshiftDataDictionary,
-    RedshiftSchema,
-    RedshiftTable,
-    RedshiftView,
-)
-from datahub.ingestion.source.redshift.report import RedshiftReport
-from datahub.ingestion.source.state.redundant_run_skip_handler import (
-    RedundantLineageRunSkipHandler,
-)
-from datahub.metadata.urns import DatasetUrn
-from datahub.sql_parsing.sql_parsing_aggregator import (
-    KnownQueryLineageInfo,
-    ObservedQuery,
-    SqlParsingAggregator,
-)
-from datahub.utilities.perf_timer import PerfTimer
-
-logger = logging.getLogger(__name__)
-
-
-class RedshiftSqlLineageV2(Closeable):
-    # does lineage and usage based on SQL parsing.
-
-    def __init__(
-        self,
-        config: RedshiftConfig,
-        report: RedshiftReport,
-        context: PipelineContext,
-        database: str,
-        redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None,
-    ):
-        self.platform = "redshift"
-        self.config = config
-        self.report = report
-        self.context = context
-
-        self.database = database
-        self.known_urns: Set[str] = set()  # will be set later
-
-        self.aggregator = SqlParsingAggregator(
-            platform=self.platform,
-            platform_instance=self.config.platform_instance,
-            env=self.config.env,
-            generate_lineage=True,
-            generate_queries=self.config.lineage_v2_generate_queries,
-            generate_usage_statistics=False,
-            generate_operations=False,
-            usage_config=self.config,
-            graph=self.context.graph,
-            is_temp_table=self._is_temp_table,
-        )
-        self.report.sql_aggregator = self.aggregator.report
-
-        self.queries: RedshiftCommonQuery = RedshiftProvisionedQuery()
-        if self.config.is_serverless:
-            self.queries = RedshiftServerlessQuery()
-
-        self._lineage_v1 = RedshiftLineageExtractor(
-            config=config,
-            report=report,
-            context=context,
-            redundant_run_skip_handler=redundant_run_skip_handler,
-        )
-
-        self.start_time, self.end_time = (
-            self.report.lineage_start_time,
-            self.report.lineage_end_time,
-        ) = self._lineage_v1.get_time_window()
-
-    def _is_temp_table(self, name: str) -> bool:
-        return (
-            DatasetUrn.create_from_ids(
-                self.platform,
-                name,
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
-            ).urn()
-            not in self.known_urns
-        )
-
-    def build(
-        self,
-        connection: redshift_connector.Connection,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-        db_schemas: Dict[str, Dict[str, RedshiftSchema]],
-    ) -> None:
-        # Assume things not in `all_tables` as temp tables.
-        self.known_urns = {
-            DatasetUrn.create_from_ids(
-                self.platform,
-                f"{db}.{schema}.{table.name}",
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
-            ).urn()
-            for db, schemas in all_tables.items()
-            for schema, tables in schemas.items()
-            for table in tables
-        }
-
-        # Handle all the temp tables up front.
-        if self.config.resolve_temp_table_in_lineage:
-            for temp_row in self._lineage_v1.get_temp_tables(connection=connection):
-                self.aggregator.add_observed_query(
-                    ObservedQuery(
-                        query=temp_row.query_text,
-                        default_db=self.database,
-                        default_schema=self.config.default_schema,
-                        session_id=temp_row.session_id,
-                        timestamp=temp_row.start_time,
-                    ),
-                    # The "temp table" query actually returns all CREATE TABLE statements, even if they
-                    # aren't explicitly a temp table. As such, setting is_known_temp_table=True
-                    # would not be correct. We already have mechanisms to autodetect temp tables,
-                    # so we won't lose anything by not setting it.
-                    is_known_temp_table=False,
-                )
-
-        populate_calls: List[Tuple[LineageCollectorType, str, Callable]] = []
-
-        if self.config.include_table_rename_lineage:
-            # Process all the ALTER TABLE RENAME statements
-            table_renames, _ = self._lineage_v1._process_table_renames(
-                database=self.database,
-                connection=connection,
-                all_tables=collections.defaultdict(
-                    lambda: collections.defaultdict(set)
-                ),
-            )
-            for entry in table_renames.values():
-                self.aggregator.add_table_rename(entry)
-
-        if self.config.table_lineage_mode in {
-            LineageMode.SQL_BASED,
-            LineageMode.MIXED,
-        }:
-            # Populate lineage by parsing table creating sqls
-            query = self.queries.list_insert_create_queries_sql(
-                db_name=self.database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append(
-                (
-                    LineageCollectorType.QUERY_SQL_PARSER,
-                    query,
-                    self._process_sql_parser_lineage,
-                )
-            )
-        if self.config.table_lineage_mode in {
-            LineageMode.STL_SCAN_BASED,
-            LineageMode.MIXED,
-        }:
-            # Populate lineage by getting upstream tables from stl_scan redshift table
-            query = self.queries.stl_scan_based_lineage_query(
-                self.database,
-                self.start_time,
-                self.end_time,
-            )
-            populate_calls.append(
-                (LineageCollectorType.QUERY_SCAN, query, self._process_stl_scan_lineage)
-            )
-
-        if self.config.include_views and self.config.include_view_lineage:
-            # Populate lineage for views
-            query = self.queries.view_lineage_query()
-            populate_calls.append(
-                (LineageCollectorType.VIEW, query, self._process_view_lineage)
-            )
-
-            # Populate lineage for late binding views
-            query = self.queries.list_late_view_ddls_query()
-            populate_calls.append(
-                (
-                    LineageCollectorType.VIEW_DDL_SQL_PARSING,
-                    query,
-                    self._process_view_lineage,
-                )
-            )
-
-        if self.config.include_copy_lineage:
-            # Populate lineage for copy commands.
-            query = self.queries.list_copy_commands_sql(
-                db_name=self.database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append(
-                (LineageCollectorType.COPY, query, self._process_copy_command)
-            )
-
-        if self.config.include_unload_lineage:
-            # Populate lineage for unload commands.
-            query = self.queries.list_unload_commands_sql(
-                db_name=self.database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append(
-                (LineageCollectorType.UNLOAD, query, self._process_unload_command)
-            )
-
-        for lineage_type, query, processor in populate_calls:
-            self._populate_lineage_agg(
-                query=query,
-                lineage_type=lineage_type,
-                processor=processor,
-                connection=connection,
-            )
-
-        # Populate lineage for external tables.
-        if not self.config.skip_external_tables:
-            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
-
-    def _populate_lineage_agg(
-        self,
-        query: str,
-        lineage_type: LineageCollectorType,
-        processor: Callable[[LineageRow], None],
-        connection: redshift_connector.Connection,
-    ) -> None:
-        logger.info(f"Extracting {lineage_type.name} lineage for db {self.database}")
-        try:
-            logger.debug(f"Processing {lineage_type.name} lineage query: {query}")
-
-            timer = self.report.lineage_phases_timer.setdefault(
-                lineage_type.name, PerfTimer()
-            )
-            with timer:
-                for lineage_row in RedshiftDataDictionary.get_lineage_rows(
-                    conn=connection, query=query
-                ):
-                    processor(lineage_row)
-        except Exception as e:
-            self.report.warning(
-                title="Failed to extract some lineage",
-                message=f"Failed to extract lineage of type {lineage_type.name}",
-                context=f"Query: '{query}'",
-                exc=e,
-            )
-            self._lineage_v1.report_status(f"extract-{lineage_type.name}", False)
-
-    def _process_sql_parser_lineage(self, lineage_row: LineageRow) -> None:
-        ddl = lineage_row.ddl
-        if ddl is None:
-            return
-
-        # TODO actor
-
-        self.aggregator.add_observed_query(
-            ObservedQuery(
-                query=ddl,
-                default_db=self.database,
-                default_schema=self.config.default_schema,
-                timestamp=lineage_row.timestamp,
-                session_id=lineage_row.session_id,
-            )
-        )
-
-    def _make_filtered_target(self, lineage_row: LineageRow) -> Optional[DatasetUrn]:
-        target = DatasetUrn.create_from_ids(
-            self.platform,
-            f"{self.database}.{lineage_row.target_schema}.{lineage_row.target_table}",
-            env=self.config.env,
-            platform_instance=self.config.platform_instance,
-        )
-        if target.urn() not in self.known_urns:
-            logger.debug(
-                f"Skipping lineage for {target.urn()} as it is not in known_urns"
-            )
-            return None
-
-        return target
-
-    def _process_stl_scan_lineage(self, lineage_row: LineageRow) -> None:
-        target = self._make_filtered_target(lineage_row)
-        if not target:
-            return
-
-        source = DatasetUrn.create_from_ids(
-            self.platform,
-            f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
-            env=self.config.env,
-            platform_instance=self.config.platform_instance,
-        )
-
-        if lineage_row.ddl is None:
-            logger.warning(
-                f"stl scan entry is missing query text for {lineage_row.source_schema}.{lineage_row.source_table}"
-            )
-            return
-        self.aggregator.add_known_query_lineage(
-            KnownQueryLineageInfo(
-                query_text=lineage_row.ddl,
-                downstream=target.urn(),
-                upstreams=[source.urn()],
-                timestamp=lineage_row.timestamp,
-            ),
-            merge_lineage=True,
-        )
-
-    def _process_view_lineage(self, lineage_row: LineageRow) -> None:
-        ddl = lineage_row.ddl
-        if ddl is None:
-            return
-
-        target = self._make_filtered_target(lineage_row)
-        if not target:
-            return
-
-        self.aggregator.add_view_definition(
-            view_urn=target,
-            view_definition=ddl,
-            default_db=self.database,
-            default_schema=self.config.default_schema,
-        )
-
-    def _process_copy_command(self, lineage_row: LineageRow) -> None:
-        logger.debug(f"Processing COPY command for lineage row: {lineage_row}")
-        sources = self._lineage_v1._get_sources(
-            lineage_type=LineageCollectorType.COPY,
-            db_name=self.database,
-            source_schema=None,
-            source_table=None,
-            ddl=None,
-            filename=lineage_row.filename,
-        )
-        logger.debug(f"Recognized sources: {sources}")
-        source = sources[0]
-        if not source:
-            logger.debug("Ignoring command since couldn't recognize proper source")
-            return
-        s3_urn = source[0].urn
-        logger.debug(f"Recognized s3 dataset urn: {s3_urn}")
-        if not lineage_row.target_schema or not lineage_row.target_table:
-            logger.debug(
-                f"Didn't find target schema (found: {lineage_row.target_schema}) or target table (found: {lineage_row.target_table})"
-            )
-            return
-        target = self._make_filtered_target(lineage_row)
-        if not target:
-            return
-
-        self.aggregator.add_known_lineage_mapping(
-            upstream_urn=s3_urn, downstream_urn=target.urn()
-        )
-
-    def _process_unload_command(self, lineage_row: LineageRow) -> None:
-        lineage_entry = self._lineage_v1._get_target_lineage(
-            alias_db_name=self.database,
-            lineage_row=lineage_row,
-            lineage_type=LineageCollectorType.UNLOAD,
-            all_tables_set={},
-        )
-        if not lineage_entry:
-            return
-        output_urn = lineage_entry.dataset.urn
-
-        if not lineage_row.source_schema or not lineage_row.source_table:
-            return
-        source = DatasetUrn.create_from_ids(
-            self.platform,
-            f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
-            env=self.config.env,
-            platform_instance=self.config.platform_instance,
-        )
-        if source.urn() not in self.known_urns:
-            logger.debug(
-                f"Skipping unload lineage for {source.urn()} as it is not in known_urns"
-            )
-            return
-
-        self.aggregator.add_known_lineage_mapping(
-            upstream_urn=source.urn(), downstream_urn=output_urn
-        )
-
-    def _process_external_tables(
-        self,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-        db_schemas: Dict[str, Dict[str, RedshiftSchema]],
-    ) -> None:
-        for schema_name, tables in all_tables[self.database].items():
-            logger.info(f"External table lineage: checking schema {schema_name}")
-            if not db_schemas[self.database].get(schema_name):
-                logger.warning(f"Schema {schema_name} not found")
-                continue
-            for table in tables:
-                schema = db_schemas[self.database][schema_name]
-                if (
-                    table.is_external_table()
-                    and schema.is_external_schema()
-                    and schema.external_platform
-                ):
-                    logger.info(
-                        f"External table lineage: processing table {schema_name}.{table.name}"
-                    )
-                    # external_db_params = schema.option
-                    upstream_platform = schema.external_platform.lower()
-
-                    table_urn = mce_builder.make_dataset_urn_with_platform_instance(
-                        self.platform,
-                        f"{self.database}.{schema_name}.{table.name}",
-                        platform_instance=self.config.platform_instance,
-                        env=self.config.env,
-                    )
-                    if upstream_platform == self.platform:
-                        upstream_schema = schema.get_upstream_schema_name() or "public"
-                        upstream_dataset_name = (
-                            f"{schema.external_database}.{upstream_schema}.{table.name}"
-                        )
-                        upstream_platform_instance = self.config.platform_instance
-                    else:
-                        upstream_dataset_name = (
-                            f"{schema.external_database}.{table.name}"
-                        )
-                        upstream_platform_instance = (
-                            self.config.platform_instance_map.get(upstream_platform)
-                            if self.config.platform_instance_map
-                            else None
-                        )
-
-                    upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
-                        upstream_platform,
-                        upstream_dataset_name,
-                        platform_instance=upstream_platform_instance,
-                        env=self.config.env,
-                    )
-
-                    self.aggregator.add_known_lineage_mapping(
-                        upstream_urn=upstream_urn,
-                        downstream_urn=table_urn,
-                    )
-
-    def generate(self) -> Iterable[MetadataWorkUnit]:
-        for mcp in self.aggregator.gen_metadata():
-            yield mcp.as_workunit()
-        if len(self.aggregator.report.observed_query_parse_failures) > 0:
-            self.report.report_warning(
-                title="Failed to extract some SQL lineage",
-                message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.",
-                context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}",
-            )
-
-    def close(self) -> None:
-        self.aggregator.close()