acryl-datahub 1.1.0.5rc5__py3-none-any.whl → 1.1.0.5rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (43)
  1. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/METADATA +2603 -2603
  2. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/RECORD +43 -39
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/api/report.py +183 -35
  5. datahub/ingestion/autogenerated/capability_summary.json +3366 -0
  6. datahub/ingestion/autogenerated/lineage.json +401 -0
  7. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  8. datahub/ingestion/run/pipeline.py +4 -1
  9. datahub/ingestion/source/bigquery_v2/bigquery.py +23 -22
  10. datahub/ingestion/source/bigquery_v2/queries.py +2 -2
  11. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  12. datahub/ingestion/source/common/subtypes.py +1 -1
  13. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  14. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  15. datahub/ingestion/source/dremio/dremio_source.py +6 -3
  16. datahub/ingestion/source/gcs/gcs_source.py +4 -1
  17. datahub/ingestion/source/ge_data_profiler.py +28 -20
  18. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  19. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  20. datahub/ingestion/source/redshift/usage.py +4 -3
  21. datahub/ingestion/source/s3/report.py +4 -2
  22. datahub/ingestion/source/s3/source.py +367 -115
  23. datahub/ingestion/source/snowflake/snowflake_queries.py +47 -3
  24. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  25. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  26. datahub/ingestion/source/sql/athena.py +95 -10
  27. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  28. datahub/ingestion/source/unity/proxy.py +4 -3
  29. datahub/ingestion/source/unity/source.py +10 -8
  30. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  31. datahub/metadata/_internal_schema_classes.py +85 -4
  32. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  33. datahub/metadata/schema.avsc +54 -1
  34. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  35. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  36. datahub/sdk/lineage_client.py +2 -0
  37. datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
  38. datahub/sql_parsing/sqlglot_lineage.py +2 -0
  39. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  40. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/WHEEL +0 -0
  41. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/entry_points.txt +0 -0
  42. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/licenses/LICENSE +0 -0
  43. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -44,6 +44,11 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
 )
+from datahub.ingestion.source.snowflake.stored_proc_lineage import (
+    StoredProcCall,
+    StoredProcLineageReport,
+    StoredProcLineageTracker,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -130,6 +135,7 @@ class SnowflakeQueriesExtractorReport(Report):
     aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     sql_aggregator: Optional[SqlAggregatorReport] = None
+    stored_proc_lineage: Optional[StoredProcLineageReport] = None
 
     num_ddl_queries_dropped: int = 0
     num_stream_queries_observed: int = 0
@@ -261,6 +267,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 TableRename,
                 TableSwap,
                 ObservedQuery,
+                StoredProcCall,
             ]
         ] = self._exit_stack.enter_context(FileBackedList(shared_connection))
 
@@ -277,12 +284,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         for entry in self.fetch_query_log(users):
             queries.append(entry)
 
+        stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
+            StoredProcLineageTracker(
+                platform=self.identifiers.platform,
+                shared_connection=shared_connection,
+            )
+        )
+        self.report.stored_proc_lineage = stored_proc_tracker.report
+
         with self.report.audit_log_load_timer:
             for i, query in enumerate(queries):
                 if i % 1000 == 0:
                     logger.info(f"Added {i} query log entries to SQL aggregator")
 
-                self.aggregator.add(query)
+                if isinstance(query, StoredProcCall):
+                    stored_proc_tracker.add_stored_proc_call(query)
+                    continue
+
+                if not (
+                    isinstance(query, PreparsedQuery)
+                    and stored_proc_tracker.add_related_query(query)
+                ):
+                    # Only add to aggregator if it's not part of a stored procedure.
+                    self.aggregator.add(query)
+
+            # Generate and add stored procedure lineage entries.
+            for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
+                # TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
+                self.aggregator.add(lineage_entry)
 
         with self.report.aggregator_generate_timer:
             yield from auto_workunit(self.aggregator.gen_metadata())
@@ -342,7 +371,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
+    ) -> Iterable[
+        Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
+    ]:
         query_log_query = _build_enriched_query_log_query(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
@@ -382,7 +413,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
+    ) -> Optional[
+        Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
+    ]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -482,6 +515,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 extra_info=extra_info,
             )
 
+        if snowflake_query_type == "CALL" and res["root_query_id"] is None:
+            return StoredProcCall(
+                # This is the top-level query ID that other entries will reference.
+                snowflake_root_query_id=res["query_id"],
+                query_text=query_text,
+                timestamp=timestamp,
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
+            )
+
         upstreams = []
         column_usage = {}
 
datahub/ingestion/source/snowflake/snowflake_usage_v2.py

@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
 
         with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
             for row in results:
-                with fetch_timer.pause(), self.report.usage_aggregation.result_skip_timer as skip_timer:
+                with (
+                    fetch_timer.pause(),
+                    self.report.usage_aggregation.result_skip_timer as skip_timer,
+                ):
                     if results.rownumber is not None and results.rownumber % 1000 == 0:
                         logger.debug(f"Processing usage row number {results.rownumber}")
                         logger.debug(self.report.usage_aggregation.as_string())
@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
                             f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
                         )
                         continue
-                    with skip_timer.pause(), self.report.usage_aggregation.result_map_timer as map_timer:
+                    with (
+                        skip_timer.pause(),
+                        self.report.usage_aggregation.result_map_timer as map_timer,
+                    ):
                         wu = self.build_usage_statistics_for_dataset(
                             dataset_identifier, row
                         )
datahub/ingestion/source/snowflake/stored_proc_lineage.py (new file)

@@ -0,0 +1,143 @@
+import dataclasses
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Iterable, List, Optional
+
+from datahub.ingestion.api.closeable import Closeable
+from datahub.metadata.urns import CorpUserUrn
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    PreparsedQuery,
+    UrnStr,
+)
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
+from datahub.utilities.file_backed_collections import FileBackedDict
+
+
+@dataclasses.dataclass
+class StoredProcCall:
+    snowflake_root_query_id: str
+
+    # Query text will typically be something like:
+    # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
+    query_text: str
+
+    timestamp: datetime
+    user: CorpUserUrn
+    default_db: str
+    default_schema: str
+
+
+@dataclass
+class StoredProcExecutionLineage:
+    call: StoredProcCall
+
+    inputs: List[UrnStr]
+    outputs: List[UrnStr]
+
+
+@dataclass
+class StoredProcLineageReport:
+    num_stored_proc_calls: int = 0
+    num_related_queries: int = 0
+    num_related_queries_without_proc_call: int = 0
+
+    # Incremented at generation/build time.
+    num_stored_proc_lineage_entries: int = 0
+    num_stored_proc_calls_with_no_inputs: int = 0
+    num_stored_proc_calls_with_no_outputs: int = 0
+
+
+class StoredProcLineageTracker(Closeable):
+    """
+    Tracks table-level lineage for Snowflake stored procedures.
+
+    Stored procedures in Snowflake trigger multiple SQL queries during execution.
+    Snowflake assigns each stored procedure call a unique query_id and uses this as the
+    root_query_id for all subsequent queries executed within that procedure. This allows
+    us to trace which queries belong to a specific stored procedure execution and build
+    table-level lineage by aggregating inputs/outputs from all related queries.
+    """
+
+    def __init__(self, platform: str, shared_connection: Optional[Any] = None):
+        self.platform = platform
+        self.report = StoredProcLineageReport()
+
+        # { root_query_id -> StoredProcExecutionLineage }
+        self._stored_proc_execution_lineage: FileBackedDict[
+            StoredProcExecutionLineage
+        ] = FileBackedDict(shared_connection)
+
+    def add_stored_proc_call(self, call: StoredProcCall) -> None:
+        """Add a stored procedure call to track."""
+        self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
+            StoredProcExecutionLineage(
+                call=call,
+                # Will be populated by subsequent queries.
+                inputs=[],
+                outputs=[],
+            )
+        )
+        self.report.num_stored_proc_calls += 1
+
+    def add_related_query(self, query: PreparsedQuery) -> bool:
+        """Add a query that might be related to a stored procedure execution.
+
+        Returns True if the query was added to a stored procedure execution, False otherwise.
+        """
+        snowflake_root_query_id = (query.extra_info or {}).get(
+            "snowflake_root_query_id"
+        )
+
+        if snowflake_root_query_id:
+            if snowflake_root_query_id not in self._stored_proc_execution_lineage:
+                self.report.num_related_queries_without_proc_call += 1
+                return False
+
+            stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
+                snowflake_root_query_id
+            )
+            stored_proc_execution.inputs.extend(query.upstreams)
+            if query.downstream is not None:
+                stored_proc_execution.outputs.append(query.downstream)
+            self.report.num_related_queries += 1
+            return True
+
+        return False
+
+    def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
+        # For stored procedures, we can only get table-level lineage from the audit log.
+        # We represent these as PreparsedQuery objects for now. Eventually we'll want to
+        # create dataJobInputOutput lineage instead.
+
+        for stored_proc_execution in self._stored_proc_execution_lineage.values():
+            if not stored_proc_execution.inputs:
+                self.report.num_stored_proc_calls_with_no_inputs += 1
+                continue
+
+            if not stored_proc_execution.outputs:
+                self.report.num_stored_proc_calls_with_no_outputs += 1
+                # Still continue to generate lineage for cases where we have inputs but no outputs
+
+            for downstream in stored_proc_execution.outputs:
+                stored_proc_query_id = get_query_fingerprint(
+                    stored_proc_execution.call.query_text,
+                    self.platform,
+                    fast=True,
+                    secondary_id=downstream,
+                )
+
+                lineage_entry = PreparsedQuery(
+                    query_id=stored_proc_query_id,
+                    query_text=stored_proc_execution.call.query_text,
+                    upstreams=stored_proc_execution.inputs,
+                    downstream=downstream,
+                    query_count=0,
+                    user=stored_proc_execution.call.user,
+                    timestamp=stored_proc_execution.call.timestamp,
+                )
+
+                self.report.num_stored_proc_lineage_entries += 1
+                yield lineage_entry
+
+    def close(self) -> None:
+        self._stored_proc_execution_lineage.close()
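As a rough illustration of how this tracker is meant to be driven (mirroring the wiring in snowflake_queries.py above), the sketch below feeds it one stored procedure CALL plus one child query that carries the same snowflake_root_query_id in its extra_info, then asks for the merged lineage. The query IDs, URNs, SQL text, and user are made up for the example, and the PreparsedQuery constructor defaults are assumed from this release.

from datetime import datetime, timezone

from datahub.ingestion.source.snowflake.stored_proc_lineage import (
    StoredProcCall,
    StoredProcLineageTracker,
)
from datahub.metadata.urns import CorpUserUrn
from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery

tracker = StoredProcLineageTracker(platform="snowflake")

# The top-level CALL, keyed by the query_id that Snowflake reuses as root_query_id
# for every statement executed inside the procedure.
tracker.add_stored_proc_call(
    StoredProcCall(
        snowflake_root_query_id="01aa-example-root",  # hypothetical query id
        query_text="CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();",
        timestamp=datetime(2025, 1, 1, tzinfo=timezone.utc),
        user=CorpUserUrn("analyst"),
        default_db="SALES",
        default_schema="PUBLIC",
    )
)

# A query executed inside the procedure; it is absorbed by the tracker instead of
# going straight to the SQL aggregator.
absorbed = tracker.add_related_query(
    PreparsedQuery(
        query_id="01aa-example-child",  # hypothetical
        query_text="INSERT INTO forecast SELECT * FROM orders",  # illustrative
        upstreams=["urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.public.orders,PROD)"],
        downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.public.forecast,PROD)",
        extra_info={"snowflake_root_query_id": "01aa-example-root"},
    )
)
assert absorbed

# One merged PreparsedQuery per downstream table, attributed to the CALL statement itself.
for lineage_entry in tracker.build_merged_lineage_entries():
    print(lineage_entry.upstreams, "->", lineage_entry.downstream)

tracker.close()

In the extractor above, entries that the tracker absorbs are skipped by the aggregator, and the merged entries are added back at the end, so the CALL statement (not each internal query) becomes the lineage-bearing query.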
datahub/ingestion/source/sql/athena.py

@@ -34,6 +34,9 @@ from datahub.ingestion.source.common.subtypes import (
     SourceCapabilityModifier,
 )
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.athena_properties_extractor import (
+    AthenaPropertiesExtractor,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
@@ -47,12 +50,17 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
-from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
+from datahub.metadata.schema_classes import (
+    ArrayTypeClass,
+    MapTypeClass,
+    RecordTypeClass,
+)
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
 from datahub.utilities.sqlalchemy_type_converter import (
     MapType,
     get_schema_fields_for_sqlalchemy_column,
 )
+from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
 
 try:
     from typing_extensions import override
@@ -284,6 +292,11 @@ class AthenaConfig(SQLCommonConfig):
         description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
     )
 
+    extract_partitions_using_create_statements: bool = pydantic.Field(
+        default=False,
+        description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
+    )
+
     _s3_staging_dir_population = pydantic_renamed_field(
         old_name="s3_staging_dir",
         new_name="query_result_location",
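The two partition flags interact in the get_partitions hunk below: the new experimental flag takes precedence when set, and partition extraction is skipped only when both are disabled. A hedged sketch of an Athena source config fragment with both enabled; the aws_region key and all values are illustrative, while query_result_location and the two flags come from the config shown in this diff.

# Illustrative Athena source config fragment (values are hypothetical).
athena_config = {
    "aws_region": "us-east-1",
    "query_result_location": "s3://example-athena-results/",
    "extract_partitions": True,  # existing path: queries `table$partitions`
    "extract_partitions_using_create_statements": True,  # new, experimental; needed for Iceberg partitions
}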
@@ -496,23 +509,38 @@ class AthenaSource(SQLAlchemySource):
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
     ) -> Optional[List[str]]:
-        if not self.config.extract_partitions:
+        if (
+            not self.config.extract_partitions
+            and not self.config.extract_partitions_using_create_statements
+        ):
             return None
 
         if not self.cursor:
             return None
 
-        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
-            table_name=table, schema_name=schema
-        )
+        if self.config.extract_partitions_using_create_statements:
+            try:
+                partitions = self._get_partitions_create_table(schema, table)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
+                    exc_info=True,
+                )
+
+                # If we can't get create table statement, we fall back to SQLAlchemy
+                partitions = self._get_partitions_sqlalchemy(schema, table)
+        else:
+            partitions = self._get_partitions_sqlalchemy(schema, table)
 
-        partitions = []
-        for key in metadata.partition_keys:
-            if key.name:
-                partitions.append(key.name)
         if not partitions:
             return []
 
+        if (
+            not self.config.profiling.enabled
+            or not self.config.profiling.partition_profiling_enabled
+        ):
+            return partitions
+
         with self.report.report_exc(
             message="Failed to extract partition details",
             context=f"{schema}.{table}",
@@ -538,6 +566,56 @@ class AthenaSource(SQLAlchemySource):
 
         return partitions
 
+    def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        try:
+            res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
+        except Exception as e:
+            # Athena does not support SHOW CREATE TABLE for views
+            # and will throw an error. We need to handle this case
+            # and caller needs to fallback to sqlalchemy's get partitions call.
+            logger.debug(
+                f"Failed to get table properties for {schema}.{table}: {e}",
+                exc_info=True,
+            )
+            raise e
+        rows = res.fetchall()
+
+        # Concatenate all rows into a single string with newlines
+        create_table_statement = "\n".join(row[0] for row in rows)
+
+        try:
+            athena_table_info = AthenaPropertiesExtractor.get_table_properties(
+                create_table_statement
+            )
+        except Exception as e:
+            logger.debug(
+                f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
+                exc_info=True,
+            )
+            raise e
+
+        partitions = []
+        if (
+            athena_table_info.partition_info
+            and athena_table_info.partition_info.simple_columns
+        ):
+            partitions = [
+                ci.name for ci in athena_table_info.partition_info.simple_columns
+            ]
+        return partitions
+
+    def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
+            table_name=table, schema_name=schema
+        )
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        return partitions
+
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
         self,
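A rough sketch of the new SHOW CREATE TABLE path in isolation. It assumes only what this diff shows about AthenaPropertiesExtractor (a get_table_properties method called on the class, returning an object with partition_info.simple_columns whose items have a name); the DDL string is made up and parsing it successfully is not guaranteed.

from datahub.ingestion.source.sql.athena_properties_extractor import (
    AthenaPropertiesExtractor,
)

# Hypothetical output of `SHOW CREATE TABLE example_db.orders` in Athena.
create_table_statement = """
CREATE TABLE example_db.orders (
  order_id bigint,
  order_date date,
  region string)
PARTITIONED BY (order_date, region)
LOCATION 's3://example-bucket/orders/'
"""

info = AthenaPropertiesExtractor.get_table_properties(create_table_statement)
if info.partition_info and info.partition_info.simple_columns:
    # Expected to mirror the partition columns declared in the DDL, e.g. ["order_date", "region"].
    print([ci.name for ci in info.partition_info.simple_columns])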
@@ -563,7 +641,14 @@ class AthenaSource(SQLAlchemySource):
                 partition_keys is not None and column["name"] in partition_keys
             ),
         )
-
+        if isinstance(
+            fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
+        ):
+            return fields
+        else:
+            fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
+                fields[0].fieldPath
+            )
         return fields
 
     def generate_partition_profiler_query(
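The fieldPath rewrite above only applies to simple columns; record, map, and array columns keep their v2-encoded paths, while everything else is collapsed back to the plain column name. A small sketch of the helper's effect, using a made-up v2 path:

from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

v2_path = "[version=2.0].[type=string].customer_name"  # hypothetical v2 fieldPath
print(get_simple_field_path_from_v2_field_path(v2_path))  # expected: "customer_name"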