acryl-datahub 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. See the package registry's advisory page for more details.

@@ -265,64 +265,17 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
265
265
  with PerfTimer() as timer:
266
266
  self.report.num_external_table_edges_scanned = 0
267
267
 
268
- for (
269
- known_lineage_mapping
270
- ) in self._populate_external_lineage_from_copy_history(discovered_tables):
271
- self.sql_aggregator.add(known_lineage_mapping)
272
- logger.info(
273
- "Done populating external lineage from copy history. "
274
- f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
275
- )
276
-
277
- for (
278
- known_lineage_mapping
279
- ) in self._populate_external_lineage_from_show_query(discovered_tables):
280
- self.sql_aggregator.add(known_lineage_mapping)
281
-
282
- logger.info(
283
- "Done populating external lineage from show external tables. "
284
- f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
285
- )
268
+ for entry in self._get_copy_history_lineage(discovered_tables):
269
+ self.sql_aggregator.add(entry)
270
+ logger.info("Done populating external lineage from copy history. ")
286
271
 
287
272
  self.report.external_lineage_queries_secs = timer.elapsed_seconds()
288
273
 
289
- # Handles the case for explicitly created external tables.
290
- # NOTE: Snowflake does not log this information to the access_history table.
291
- def _populate_external_lineage_from_show_query(
292
- self, discovered_tables: List[str]
293
- ) -> Iterable[KnownLineageMapping]:
294
- external_tables_query: str = SnowflakeQuery.show_external_tables()
295
- try:
296
- for db_row in self.connection.query(external_tables_query):
297
- key = self.identifiers.get_dataset_identifier(
298
- db_row["name"], db_row["schema_name"], db_row["database_name"]
299
- )
300
-
301
- if key not in discovered_tables:
302
- continue
303
- if db_row["location"].startswith("s3://"):
304
- yield KnownLineageMapping(
305
- upstream_urn=make_s3_urn_for_lineage(
306
- db_row["location"], self.config.env
307
- ),
308
- downstream_urn=self.identifiers.gen_dataset_urn(key),
309
- )
310
- self.report.num_external_table_edges_scanned += 1
311
-
312
- self.report.num_external_table_edges_scanned += 1
313
- except Exception as e:
314
- logger.debug(e, exc_info=e)
315
- self.structured_reporter.warning(
316
- "Error populating external table lineage from Snowflake",
317
- exc=e,
318
- )
319
- self.report_status(EXTERNAL_LINEAGE, False)
320
-
321
274
  # Handles the case where a table is populated from an external stage/s3 location via copy.
322
275
  # Eg: copy into category_english from @external_s3_stage;
323
276
  # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
324
277
  # NOTE: Snowflake does not log this information to the access_history table.
325
- def _populate_external_lineage_from_copy_history(
278
+ def _get_copy_history_lineage(
326
279
  self, discovered_tables: List[str]
327
280
  ) -> Iterable[KnownLineageMapping]:
328
281
  query: str = SnowflakeQuery.copy_lineage_history(
@@ -247,9 +247,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
247
247
  for entry in self.fetch_copy_history():
248
248
  queries.append(entry)
249
249
 
250
- # TODO: Add "show external tables" lineage to the main schema extractor.
251
- # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor.
252
-
253
250
  with self.report.query_log_fetch_timer:
254
251
  for entry in self.fetch_query_log():
255
252
  queries.append(entry)
@@ -16,6 +16,7 @@ from datahub.ingestion.glossary.classification_mixin import (
16
16
  ClassificationHandler,
17
17
  classification_workunit_processor,
18
18
  )
19
+ from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
19
20
  from datahub.ingestion.source.common.subtypes import (
20
21
  DatasetContainerSubTypes,
21
22
  DatasetSubTypes,
@@ -35,6 +36,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import (
35
36
  )
36
37
  from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
37
38
  from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
39
+ from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
38
40
  from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
39
41
  from datahub.ingestion.source.snowflake.snowflake_schema import (
40
42
  SCHEMA_PARALLELISM,
@@ -65,6 +67,7 @@ from datahub.ingestion.source.sql.sql_utils import (
65
67
  get_domain_wu,
66
68
  )
67
69
  from datahub.ingestion.source_report.ingestion_stage import (
70
+ EXTERNAL_TABLE_DDL_LINEAGE,
68
71
  METADATA_EXTRACTION,
69
72
  PROFILING,
70
73
  )
@@ -96,7 +99,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
96
99
  TimeType,
97
100
  )
98
101
  from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
99
- from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
102
+ from datahub.sql_parsing.sql_parsing_aggregator import (
103
+ KnownLineageMapping,
104
+ SqlParsingAggregator,
105
+ )
100
106
  from datahub.utilities.registries.domain_registry import DomainRegistry
101
107
  from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
102
108
 
@@ -180,7 +186,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
180
186
 
181
187
  # These are populated as side-effects of get_workunits_internal.
182
188
  self.databases: List[SnowflakeDatabase] = []
183
- self.aggregator: Optional[SqlParsingAggregator] = aggregator
189
+
190
+ self.aggregator = aggregator
184
191
 
185
192
  def get_connection(self) -> SnowflakeConnection:
186
193
  return self.connection
@@ -212,6 +219,19 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
212
219
  self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
213
220
  yield from self._process_database(snowflake_db)
214
221
 
222
+ self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
223
+ discovered_tables: List[str] = [
224
+ self.identifiers.get_dataset_identifier(
225
+ table_name, schema.name, db.name
226
+ )
227
+ for db in self.databases
228
+ for schema in db.schemas
229
+ for table_name in schema.tables
230
+ ]
231
+ if self.aggregator:
232
+ for entry in self._external_tables_ddl_lineage(discovered_tables):
233
+ self.aggregator.add(entry)
234
+
215
235
  except SnowflakePermissionError as e:
216
236
  self.structured_reporter.failure(
217
237
  GENERIC_PERMISSION_ERROR_KEY,
@@ -1082,3 +1102,33 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1082
1102
 
1083
1103
  # Access to table but none of its constraints - is this possible ?
1084
1104
  return constraints.get(table_name, [])
1105
+
1106
+ # Handles the case for explicitly created external tables.
1107
+ # NOTE: Snowflake does not log this information to the access_history table.
1108
+ def _external_tables_ddl_lineage(
1109
+ self, discovered_tables: List[str]
1110
+ ) -> Iterable[KnownLineageMapping]:
1111
+ external_tables_query: str = SnowflakeQuery.show_external_tables()
1112
+ try:
1113
+ for db_row in self.connection.query(external_tables_query):
1114
+ key = self.identifiers.get_dataset_identifier(
1115
+ db_row["name"], db_row["schema_name"], db_row["database_name"]
1116
+ )
1117
+
1118
+ if key not in discovered_tables:
1119
+ continue
1120
+ if db_row["location"].startswith("s3://"):
1121
+ yield KnownLineageMapping(
1122
+ upstream_urn=make_s3_urn_for_lineage(
1123
+ db_row["location"], self.config.env
1124
+ ),
1125
+ downstream_urn=self.identifiers.gen_dataset_urn(key),
1126
+ )
1127
+ self.report.num_external_table_edges_scanned += 1
1128
+
1129
+ self.report.num_external_table_edges_scanned += 1
1130
+ except Exception as e:
1131
+ self.structured_reporter.warning(
1132
+ "External table ddl lineage extraction failed",
1133
+ exc=e,
1134
+ )
@@ -161,35 +161,32 @@ class SnowflakeV2Source(
161
161
  # For database, schema, tables, views, etc
162
162
  self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
163
163
  self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
164
- self.aggregator: Optional[SqlParsingAggregator] = None
165
-
166
- if self.config.use_queries_v2 or self.config.include_table_lineage:
167
- self.aggregator = self._exit_stack.enter_context(
168
- SqlParsingAggregator(
169
- platform=self.identifiers.platform,
170
- platform_instance=self.config.platform_instance,
171
- env=self.config.env,
172
- graph=self.ctx.graph,
173
- eager_graph_load=(
174
- # If we're ingestion schema metadata for tables/views, then we will populate
175
- # schemas into the resolver as we go. We only need to do a bulk fetch
176
- # if we're not ingesting schema metadata as part of ingestion.
177
- not (
178
- self.config.include_technical_schema
179
- and self.config.include_tables
180
- and self.config.include_views
181
- )
182
- and not self.config.lazy_schema_resolver
183
- ),
184
- generate_usage_statistics=False,
185
- generate_operations=False,
186
- format_queries=self.config.format_sql_queries,
187
- )
164
+
165
+ self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
166
+ SqlParsingAggregator(
167
+ platform=self.identifiers.platform,
168
+ platform_instance=self.config.platform_instance,
169
+ env=self.config.env,
170
+ graph=self.ctx.graph,
171
+ eager_graph_load=(
172
+ # If we're ingestion schema metadata for tables/views, then we will populate
173
+ # schemas into the resolver as we go. We only need to do a bulk fetch
174
+ # if we're not ingesting schema metadata as part of ingestion.
175
+ not (
176
+ self.config.include_technical_schema
177
+ and self.config.include_tables
178
+ and self.config.include_views
179
+ )
180
+ and not self.config.lazy_schema_resolver
181
+ ),
182
+ generate_usage_statistics=False,
183
+ generate_operations=False,
184
+ format_queries=self.config.format_sql_queries,
188
185
  )
189
- self.report.sql_aggregator = self.aggregator.report
186
+ )
187
+ self.report.sql_aggregator = self.aggregator.report
190
188
 
191
189
  if self.config.include_table_lineage:
192
- assert self.aggregator is not None
193
190
  redundant_lineage_run_skip_handler: Optional[
194
191
  RedundantLineageRunSkipHandler
195
192
  ] = None
@@ -487,8 +484,6 @@ class SnowflakeV2Source(
487
484
 
488
485
  databases = schema_extractor.databases
489
486
 
490
- # TODO: The checkpoint state for stale entity detection can be committed here.
491
-
492
487
  if self.config.shares:
493
488
  yield from SnowflakeSharesHandler(
494
489
  self.config, self.report