acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.

This version of acryl-datahub was flagged as potentially problematic.

Files changed (214)
  1. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
  2. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
  3. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +141 -93
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
  30. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  31. datahub/ingestion/api/report.py +1 -2
  32. datahub/ingestion/api/source.py +8 -2
  33. datahub/ingestion/api/source_helpers.py +1 -1
  34. datahub/ingestion/extractor/json_schema_util.py +3 -3
  35. datahub/ingestion/extractor/schema_util.py +3 -5
  36. datahub/ingestion/fs/s3_fs.py +3 -3
  37. datahub/ingestion/glossary/classifier.py +2 -3
  38. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  39. datahub/ingestion/graph/client.py +22 -19
  40. datahub/ingestion/graph/config.py +1 -1
  41. datahub/ingestion/run/pipeline.py +8 -7
  42. datahub/ingestion/run/pipeline_config.py +3 -3
  43. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  44. datahub/ingestion/source/abs/source.py +19 -8
  45. datahub/ingestion/source/aws/glue.py +77 -47
  46. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  47. datahub/ingestion/source/aws/s3_util.py +24 -1
  48. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  49. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  50. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  51. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  53. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  54. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  55. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  56. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  57. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  58. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  59. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  60. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  61. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  62. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  63. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  64. datahub/ingestion/source/csv_enricher.py +29 -29
  65. datahub/ingestion/source/datahub/config.py +20 -0
  66. datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
  67. datahub/ingestion/source/datahub/datahub_source.py +13 -3
  68. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  69. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  70. datahub/ingestion/source/delta_lake/source.py +0 -5
  71. datahub/ingestion/source/demo_data.py +1 -1
  72. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  73. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  74. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  75. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  76. datahub/ingestion/source/elastic_search.py +4 -4
  77. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  78. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  79. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  80. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  81. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  82. datahub/ingestion/source/ge_data_profiler.py +2 -5
  83. datahub/ingestion/source/ge_profiling_config.py +3 -3
  84. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  85. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  86. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +3 -3
  88. datahub/ingestion/source/identity/okta.py +3 -3
  89. datahub/ingestion/source/kafka/kafka.py +11 -9
  90. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  93. datahub/ingestion/source/looker/looker_common.py +19 -19
  94. datahub/ingestion/source/looker/looker_config.py +11 -6
  95. datahub/ingestion/source/looker/looker_source.py +25 -25
  96. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  97. datahub/ingestion/source/looker/looker_usage.py +5 -7
  98. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  99. datahub/ingestion/source/looker/lookml_source.py +13 -15
  100. datahub/ingestion/source/looker/view_upstream.py +5 -5
  101. datahub/ingestion/source/metabase.py +1 -6
  102. datahub/ingestion/source/mlflow.py +4 -9
  103. datahub/ingestion/source/mode.py +5 -5
  104. datahub/ingestion/source/mongodb.py +6 -4
  105. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  106. datahub/ingestion/source/nifi.py +24 -31
  107. datahub/ingestion/source/openapi.py +9 -9
  108. datahub/ingestion/source/powerbi/config.py +12 -12
  109. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  110. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  111. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  112. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  113. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  114. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  115. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  116. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  117. datahub/ingestion/source/redash.py +0 -5
  118. datahub/ingestion/source/redshift/config.py +3 -3
  119. datahub/ingestion/source/redshift/redshift.py +45 -46
  120. datahub/ingestion/source/redshift/usage.py +33 -33
  121. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  122. datahub/ingestion/source/s3/source.py +11 -15
  123. datahub/ingestion/source/salesforce.py +26 -25
  124. datahub/ingestion/source/schema/json_schema.py +1 -1
  125. datahub/ingestion/source/sigma/sigma.py +3 -3
  126. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  127. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  128. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  129. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
  130. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  131. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  132. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  133. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  134. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  135. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  136. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  137. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  138. datahub/ingestion/source/sql/athena.py +1 -3
  139. datahub/ingestion/source/sql/clickhouse.py +8 -14
  140. datahub/ingestion/source/sql/oracle.py +1 -3
  141. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  142. datahub/ingestion/source/sql/sql_types.py +1 -2
  143. datahub/ingestion/source/sql/sql_utils.py +5 -0
  144. datahub/ingestion/source/sql/teradata.py +18 -5
  145. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  146. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  147. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  148. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  149. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/superset.py +1 -6
  151. datahub/ingestion/source/tableau/tableau.py +343 -117
  152. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  153. datahub/ingestion/source/unity/config.py +3 -1
  154. datahub/ingestion/source/unity/proxy.py +1 -1
  155. datahub/ingestion/source/unity/source.py +74 -78
  156. datahub/ingestion/source/unity/usage.py +3 -1
  157. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  159. datahub/ingestion/source/usage/usage_common.py +1 -1
  160. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  161. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  162. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  163. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  164. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  165. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  166. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  167. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  168. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  169. datahub/lite/duckdb_lite.py +12 -10
  170. datahub/metadata/_schema_classes.py +317 -44
  171. datahub/metadata/_urns/urn_defs.py +69 -15
  172. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  173. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  174. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  175. datahub/metadata/schema.avsc +302 -89
  176. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  177. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  179. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  180. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  181. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  182. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  183. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  184. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  185. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  186. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  187. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  188. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  189. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  190. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  191. datahub/secret/datahub_secrets_client.py +12 -21
  192. datahub/secret/secret_common.py +14 -8
  193. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  194. datahub/sql_parsing/schema_resolver.py +5 -10
  195. datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
  196. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  197. datahub/sql_parsing/sqlglot_utils.py +1 -1
  198. datahub/telemetry/stats.py +1 -2
  199. datahub/testing/mcp_diff.py +1 -1
  200. datahub/utilities/file_backed_collections.py +11 -11
  201. datahub/utilities/hive_schema_to_avro.py +2 -2
  202. datahub/utilities/logging_manager.py +2 -2
  203. datahub/utilities/lossy_collections.py +3 -3
  204. datahub/utilities/mapping.py +3 -3
  205. datahub/utilities/memory_footprint.py +3 -2
  206. datahub/utilities/perf_timer.py +11 -6
  207. datahub/utilities/serialized_lru_cache.py +3 -1
  208. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  209. datahub/utilities/sqllineage_patch.py +1 -1
  210. datahub/utilities/stats_collections.py +3 -1
  211. datahub/utilities/urns/_urn_base.py +28 -5
  212. datahub/utilities/urns/urn_iter.py +2 -2
  213. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  214. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -276,6 +276,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         "HLLSKETCH": NullType,
         "TIMETZ": TimeType,
         "VARBYTE": StringType,
+        "SUPER": NullType,
     }

     def get_platform_instance_id(self) -> str:
@@ -304,13 +305,13 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             test_report.capability_report = {}
             try:
                 RedshiftDataDictionary.get_schemas(connection, database=config.database)
-                test_report.capability_report[
-                    SourceCapability.SCHEMA_METADATA
-                ] = CapabilityReport(capable=True)
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=True)
+                )
             except Exception as e:
-                test_report.capability_report[
-                    SourceCapability.SCHEMA_METADATA
-                ] = CapabilityReport(capable=False, failure_reason=str(e))
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=False, failure_reason=str(e))
+                )

         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -423,10 +424,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

         database = self.config.database
         logger.info(f"Processing db {database}")
-        self.report.report_ingestion_stage_start(METADATA_EXTRACTION)
-        self.db_tables[database] = defaultdict()
-        self.db_views[database] = defaultdict()
-        self.db_schemas.setdefault(database, {})
+        with self.report.new_stage(METADATA_EXTRACTION):
+            self.db_tables[database] = defaultdict()
+            self.db_views[database] = defaultdict()
+            self.db_schemas.setdefault(database, {})

         # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping
         # this fallback. For now, this gets us broad coverage quickly.
@@ -462,12 +463,12 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 self.process_schemas(connection, database)
             )

-            self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
-            yield from self.extract_lineage_v2(
-                connection=connection,
-                database=database,
-                lineage_extractor=lineage_extractor,
-            )
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                yield from self.extract_lineage_v2(
+                    connection=connection,
+                    database=database,
+                    lineage_extractor=lineage_extractor,
+                )

             all_tables = self.get_all_tables()
         else:
@@ -480,25 +481,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 or self.config.include_view_lineage
                 or self.config.include_copy_lineage
             ):
-                self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION)
-                yield from self.extract_lineage(
-                    connection=connection, all_tables=all_tables, database=database
-                )
+                with self.report.new_stage(LINEAGE_EXTRACTION):
+                    yield from self.extract_lineage(
+                        connection=connection, all_tables=all_tables, database=database
+                    )

-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_INGESTION)
         if self.config.include_usage_statistics:
-            yield from self.extract_usage(
-                connection=connection, all_tables=all_tables, database=database
-            )
+            with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
+                yield from self.extract_usage(
+                    connection=connection, all_tables=all_tables, database=database
+                )

         if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start(PROFILING)
-            profiler = RedshiftProfiler(
-                config=self.config,
-                report=self.report,
-                state_handler=self.profiling_state_handler,
-            )
-            yield from profiler.get_workunits(self.db_tables)
+            with self.report.new_stage(PROFILING):
+                profiler = RedshiftProfiler(
+                    config=self.config,
+                    report=self.report,
+                    state_handler=self.profiling_state_handler,
+                )
+                yield from profiler.get_workunits(self.db_tables)

     def process_schemas(self, connection, database):
         for schema in self.data_dictionary.get_schemas(
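Note: the RedshiftSource hunks above replace self.report.report_ingestion_stage_start(...) calls with a report.new_stage(...) context manager, so a stage now ends when its block exits rather than when the next stage starts (see datahub/ingestion/source_report/ingestion_stage.py in the file list). A minimal, self-contained sketch of the pattern, using a hypothetical DemoReport stand-in rather than the actual DataHub report classes:

    import time
    from contextlib import contextmanager

    class DemoReport:
        """Hypothetical stand-in for a source report; not the DataHub class."""

        def __init__(self) -> None:
            self.stage_timings: dict = {}

        @contextmanager
        def new_stage(self, stage: str):
            start = time.perf_counter()
            try:
                yield
            finally:
                # The stage is closed automatically when the block exits.
                self.stage_timings[stage] = round(time.perf_counter() - start, 2)

    report = DemoReport()
    with report.new_stage("Metadata Extraction"):
        pass  # schema/table extraction would run here
    with report.new_stage("Lineage Extraction"):
        pass  # lineage extraction would run here
    print(report.stage_timings)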
@@ -633,8 +634,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 logger.info("View processing disabled, skipping")

-            self.report.metadata_extraction_sec[report_key] = round(
-                timer.elapsed_seconds(), 2
+            self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds(
+                digits=2
             )

     def _process_table(
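Note: the timing hunks in this release switch from round(timer.elapsed_seconds(), 2) to timer.elapsed_seconds(digits=2), matching the datahub/utilities/perf_timer.py change in the file list. A small usage sketch, assuming acryl-datahub 0.15.0.2 is installed:

    from datahub.utilities.perf_timer import PerfTimer

    with PerfTimer() as timer:
        sum(range(1_000_000))  # placeholder workload

    # Rounding is now handled by elapsed_seconds() itself via the digits kwarg.
    print(timer.elapsed_seconds(digits=2))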
@@ -946,9 +947,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     def get_all_tables(
         self,
     ) -> Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]]:
-        all_tables: Dict[
-            str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]
-        ] = defaultdict(dict)
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] = (
+            defaultdict(dict)
+        )
         for db in set().union(self.db_tables, self.db_views):
             tables = self.db_tables.get(db, {})
             views = self.db_views.get(db, {})
@@ -966,9 +967,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
     ) -> Iterable[MetadataWorkUnit]:
         with PerfTimer() as timer:
-            redundant_usage_run_skip_handler: Optional[
-                RedundantUsageRunSkipHandler
-            ] = None
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
             if self.config.enable_stateful_usage_ingestion:
                 redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                     source=self,
@@ -986,9 +987,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

             yield from usage_extractor.get_usage_workunits(all_tables=all_tables)

-            self.report.usage_extraction_sec[database] = round(
-                timer.elapsed_seconds(), 2
-            )
+            self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)

     def extract_lineage(
         self,
@@ -1011,8 +1010,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
                 database=database, connection=connection, all_tables=all_tables
            )

-            self.report.lineage_extraction_sec[f"{database}"] = round(
-                timer.elapsed_seconds(), 2
+            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+                digits=2
             )
             yield from self.generate_lineage(
                 database, lineage_extractor=lineage_extractor
@@ -1042,8 +1041,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

             yield from lineage_extractor.generate()

-            self.report.lineage_extraction_sec[f"{database}"] = round(
-                timer.elapsed_seconds(), 2
+            self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
+                digits=2
             )

         if self.redundant_lineage_run_skip_handler:
@@ -182,38 +182,38 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0

         if self.config.include_operational_stats:
-            self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS)
-            with PerfTimer() as timer:
-                # Generate operation aspect workunits
-                yield from self._gen_operation_aspect_workunits(
-                    self.connection, all_tables
-                )
-                self.report.operational_metadata_extraction_sec[
-                    self.config.database
-                ] = round(timer.elapsed_seconds(), 2)
+            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
+                with PerfTimer() as timer:
+                    # Generate operation aspect workunits
+                    yield from self._gen_operation_aspect_workunits(
+                        self.connection, all_tables
+                    )
+                    self.report.operational_metadata_extraction_sec[
+                        self.config.database
+                    ] = timer.elapsed_seconds(digits=2)

         # Generate aggregate events
-        self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION)
-        query: str = self.queries.usage_query(
-            start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
-            end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
-            database=self.config.database,
-        )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection=self.connection, all_tables=all_tables
-        )
+        with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
+            query: str = self.queries.usage_query(
+                start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
+                database=self.config.database,
+            )
+            access_events_iterable: Iterable[RedshiftAccessEvent] = (
+                self._gen_access_events_from_history_query(
+                    query, connection=self.connection, all_tables=all_tables
+                )
+            )

-        aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
-            access_events_iterable
-        )
-        # Generate usage workunits from aggregated events.
-        for time_bucket in aggregated_events.values():
-            for aggregate in time_bucket.values():
-                wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
-                self.report.num_usage_workunits_emitted += 1
-                yield wu
+            aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
+                access_events_iterable
+            )
+            # Generate usage workunits from aggregated events.
+            for time_bucket in aggregated_events.values():
+                for aggregate in time_bucket.values():
+                    wu: MetadataWorkUnit = self._make_usage_stat(aggregate)
+                    self.report.num_usage_workunits_emitted += 1
+                    yield wu

     def _gen_operation_aspect_workunits(
         self,
@@ -225,10 +225,10 @@ class RedshiftUsageExtractor:
             start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
             end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
         )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection, all_tables=all_tables
+        access_events_iterable: Iterable[RedshiftAccessEvent] = (
+            self._gen_access_events_from_history_query(
+                query, connection, all_tables=all_tables
+            )
         )

         # Generate operation aspect work units from the access events
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
             if field_level_metric.startswith("include_field_"):
                 values.setdefault(field_level_metric, False)

-        assert (
-            max_num_fields_to_profile is None
-        ), f"{max_num_fields_to_profile_key} should be set to None"
+        assert max_num_fields_to_profile is None, (
+            f"{max_num_fields_to_profile_key} should be set to None"
+        )

         return values
@@ -6,9 +6,8 @@ import pathlib
 import re
 import time
 from datetime import datetime
-from itertools import groupby
 from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse

 import smart_open.compression as so_compression
@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
+    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer

+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import Bucket
+
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
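Note: the `if TYPE_CHECKING:` guard added above means mypy_boto3_s3 is only imported by type checkers, so the stub package stays an optional dev dependency; the annotation is then written as the string "Bucket". A short illustration of the same pattern (the helper function here is illustrative only, not part of the source):

    from typing import TYPE_CHECKING, List

    if TYPE_CHECKING:
        # Resolved only during static type checking; never imported at runtime.
        from mypy_boto3_s3.service_resource import Bucket

    def list_keys(bucket: "Bucket", prefix: str) -> List[str]:
        # The forward-reference string keeps this valid even though Bucket is
        # not defined at runtime.
        return [obj.key for obj in bucket.objects.filter(Prefix=prefix)]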
@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket: Any,  # Todo: proper type
+        bucket: "Bucket",
         prefix: str,
     ) -> List[Folder]:
         """
@@ -857,22 +860,15 @@ class S3Source(StatefulIngestionSourceBase):

         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        bucket (Any): The S3 bucket object.
+        bucket (Bucket): The S3 bucket object.
         prefix (str): The prefix path in the S3 bucket to list objects from.

         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """
-
-        prefix_to_list = prefix
-        files = list(
-            bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-        )
-        files = sorted(files, key=lambda a: a.last_modified)
-        grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
         partitions: List[Folder] = []
-        for key, group in grouped_files:
+        s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+        for key, group in group_s3_objects_by_dirname(s3_objects).items():
             file_size = 0
             creation_time = None
             modification_time = None
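Note: the groupby-based grouping removed above is replaced by a group_s3_objects_by_dirname helper (see datahub/ingestion/source/aws/s3_util.py +24 -1 in the file list). Its implementation is not shown in this diff; based on the removed key.rsplit("/", 1)[0] logic, it presumably behaves roughly like the following sketch (name suffixed with _sketch to mark it as a guess, types simplified):

    from collections import defaultdict
    from typing import Any, Dict, Iterable, List

    def group_s3_objects_by_dirname_sketch(
        s3_objects: Iterable[Any],
    ) -> Dict[str, List[Any]]:
        """Group bucket objects by the directory portion of their key."""
        grouped: Dict[str, List[Any]] = defaultdict(list)
        for obj in s3_objects:
            grouped[obj.key.rsplit("/", 1)[0]].append(obj)
        return grouped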
@@ -904,7 +900,7 @@ class S3Source(StatefulIngestionSourceBase):
                 Folder(
                     partition_id=id,
                     is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,
+                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
                     modification_time=modification_time,
                     sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                     size=file_size,
@@ -1128,7 +1124,7 @@ class S3Source(StatefulIngestionSourceBase):
                     table_data.table_path
                 ].timestamp = table_data.timestamp

-        for guid, table_data in table_dict.items():
+        for _, table_data in table_dict.items():
             yield from self.ingest_table(table_data, path_spec)

         if not self.source_config.is_profiling_enabled():
@@ -236,12 +236,12 @@ class SalesforceSource(Source):
         try:
             if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
                 logger.debug("Access Token Provided in Config")
-                assert (
-                    self.config.access_token is not None
-                ), "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
-                assert (
-                    self.config.instance_url is not None
-                ), "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+                assert self.config.access_token is not None, (
+                    "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
+                )
+                assert self.config.instance_url is not None, (
+                    "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+                )

                 self.sf = Salesforce(
                     instance_url=self.config.instance_url,
@@ -250,15 +250,15 @@
                 )
             elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD:
                 logger.debug("Username/Password Provided in Config")
-                assert (
-                    self.config.username is not None
-                ), "Config username is required for USERNAME_PASSWORD auth"
-                assert (
-                    self.config.password is not None
-                ), "Config password is required for USERNAME_PASSWORD auth"
-                assert (
-                    self.config.security_token is not None
-                ), "Config security_token is required for USERNAME_PASSWORD auth"
+                assert self.config.username is not None, (
+                    "Config username is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.password is not None, (
+                    "Config password is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.security_token is not None, (
+                    "Config security_token is required for USERNAME_PASSWORD auth"
+                )

                 self.sf = Salesforce(
                     username=self.config.username,
@@ -269,15 +269,15 @@

             elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
                 logger.debug("Json Web Token provided in the config")
-                assert (
-                    self.config.username is not None
-                ), "Config username is required for JSON_WEB_TOKEN auth"
-                assert (
-                    self.config.consumer_key is not None
-                ), "Config consumer_key is required for JSON_WEB_TOKEN auth"
-                assert (
-                    self.config.private_key is not None
-                ), "Config private_key is required for JSON_WEB_TOKEN auth"
+                assert self.config.username is not None, (
+                    "Config username is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.consumer_key is not None, (
+                    "Config consumer_key is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.private_key is not None, (
+                    "Config private_key is required for JSON_WEB_TOKEN auth"
+                )

                 self.sf = Salesforce(
                     username=self.config.username,
@@ -439,7 +439,8 @@
         dataPlatformInstance = DataPlatformInstanceClass(
             builder.make_data_platform_urn(self.platform),
             instance=builder.make_dataplatform_instance_urn(
-                self.platform, self.config.platform_instance  # type:ignore
+                self.platform,
+                self.config.platform_instance,  # type:ignore
             ),
         )

@@ -354,7 +354,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
         browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"

         if os.path.isdir(self.config.path):
-            for root, dirs, files in os.walk(self.config.path, topdown=False):
+            for root, _, files in os.walk(self.config.path, topdown=False):
                 for file_name in [f for f in files if f.endswith(".json")]:
                     try:
                         yield from self._load_one_file(
@@ -477,9 +477,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                 upstream_dataset_urns
                 and dataset_urn not in self.dataset_upstream_urn_mapping
             ):
-                self.dataset_upstream_urn_mapping[
-                    dataset_urn
-                ] = upstream_dataset_urns
+                self.dataset_upstream_urn_mapping[dataset_urn] = (
+                    upstream_dataset_urns
+                )

             element_input_fields = [
                 InputFieldClass(
@@ -126,9 +126,9 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for workspace_dict in response_dict[Constant.ENTRIES]:
-                self.workspaces[
-                    workspace_dict[Constant.WORKSPACEID]
-                ] = Workspace.parse_obj(workspace_dict)
+                self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
+                    Workspace.parse_obj(workspace_dict)
+                )
             if response_dict[Constant.NEXTPAGE]:
                 url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
             else:
@@ -147,9 +147,9 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for user_dict in response_dict[Constant.ENTRIES]:
-                users[
-                    user_dict[Constant.MEMBERID]
-                ] = f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+                users[user_dict[Constant.MEMBERID]] = (
+                    f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+                )
             if response_dict[Constant.NEXTPAGE]:
                 url = f"{members_url}&page={response_dict[Constant.NEXTPAGE]}"
             else:
@@ -327,10 +327,12 @@ class SigmaAPI:
             response.raise_for_status()
             for i, element_dict in enumerate(response.json()[Constant.ENTRIES]):
                 if not element_dict.get(Constant.NAME):
-                    element_dict[Constant.NAME] = f"Element {i+1} of Page '{page.name}'"
-                element_dict[
-                    Constant.URL
-                ] = f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+                    element_dict[Constant.NAME] = (
+                        f"Element {i + 1} of Page '{page.name}'"
+                    )
+                element_dict[Constant.URL] = (
+                    f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+                )
                 element = Element.parse_obj(element_dict)
                 if (
                     self.config.extract_lineage
@@ -221,6 +221,14 @@ class SnowflakeV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from snowflake.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )

     lazy_schema_resolver: bool = Field(
         default=True,
@@ -236,6 +244,11 @@ class SnowflakeV2Config(
         description="""Optional. Allowed values are `without_lineage`, `with_lineage`, and `skip` (default). `without_lineage` only extracts tags that have been applied directly to the given entity. `with_lineage` extracts both directly applied and propagated tags, but will be significantly slower. See the [Snowflake documentation](https://docs.snowflake.com/en/user-guide/object-tagging.html#tag-lineage) for information about tag lineage/propagation. """,
     )

+    extract_tags_as_structured_properties: bool = Field(
+        default=False,
+        description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",
@@ -255,6 +268,14 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )

+    structured_property_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description=(
+            "List of regex patterns for structured properties to include in ingestion."
+            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
+        ),
+    )
+
     # This is required since access_history table does not capture whether the table was temporary table.
     temporary_tables_pattern: List[str] = Field(
         default=DEFAULT_TEMP_TABLES_PATTERNS,
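Note: the three SnowflakeV2Config hunks above add include_queries, include_query_usage_statistics, extract_tags_as_structured_properties, and structured_property_pattern. An illustrative set of overrides for a Snowflake recipe's config block exercising the new fields (values are made up; only the field names come from the diff):

    # Hypothetical recipe overrides; merge into an existing Snowflake source config.
    snowflake_config_overrides = {
        # Only meaningful when use_queries_v2 is enabled.
        "include_queries": True,
        "include_query_usage_statistics": True,
        # Emit Snowflake key-value tags as structured properties instead of tags.
        "extract_tags": "without_lineage",
        "extract_tags_as_structured_properties": True,
        "structured_property_pattern": {"allow": ["governance\\..*"]},
    }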
@@ -363,18 +384,20 @@ class SnowflakeV2Config(
             assert all(
                 consumer.platform_instance != share_details.platform_instance
                 for consumer in share_details.consumers
-            ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+            ), (
+                "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+            )

             databases_included_in_share.append(shared_db)
             databases_created_from_share.extend(share_details.consumers)

         for db_from_share in databases_created_from_share:
-            assert (
-                db_from_share not in databases_included_in_share
-            ), "Database included in a share can not be present as consumer in any share."
-            assert (
-                databases_created_from_share.count(db_from_share) == 1
-            ), "Same database can not be present as consumer in more than one share."
+            assert db_from_share not in databases_included_in_share, (
+                "Database included in a share can not be present as consumer in any share."
+            )
+            assert databases_created_from_share.count(db_from_share) == 1, (
+                "Same database can not be present as consumer in more than one share."
+            )

         return shares

@@ -250,9 +250,9 @@ class SnowflakeConnectionConfig(ConfigModel):
         if self.private_key is not None:
             pkey_bytes = self.private_key.replace("\\n", "\n").encode()
         else:
-            assert (
-                self.private_key_path
-            ), "missing required private key path to read key from"
+            assert self.private_key_path, (
+                "missing required private key path to read key from"
+            )
             with open(self.private_key_path, "rb") as key:
                 pkey_bytes = key.read()

@@ -284,9 +284,9 @@ class SnowflakeConnectionConfig(ConfigModel):
         return self.options

     def get_oauth_connection(self) -> NativeSnowflakeConnection:
-        assert (
-            self.oauth_config
-        ), "oauth_config should be provided if using oauth based authentication"
+        assert self.oauth_config, (
+            "oauth_config should be provided if using oauth based authentication"
+        )
         generator = OAuthTokenGenerator(
             client_id=self.oauth_config.client_id,
             authority_url=self.oauth_config.authority_url,
@@ -40,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime

@@ -239,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)

         known_lineage = KnownQueryLineageInfo(
+            query_id=get_query_fingerprint(
+                query.query_text, self.identifiers.platform, fast=True
+            ),
             query_text=query.query_text,
             downstream=downstream_table_urn,
             upstreams=self.map_query_result_upstreams(
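Note: the new query_id is derived with get_query_fingerprint from datahub.sql_parsing.sqlglot_utils, imported in the earlier hunk. A small sketch of that call in isolation, assuming acryl-datahub 0.15.0.2 is installed (the query text is made up):

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    query_text = "INSERT INTO db.schema.target SELECT * FROM db.schema.source"
    # fast=True mirrors the lineage extractor's call above; it trades full
    # parser-level normalization for speed when fingerprinting the query text.
    query_id = get_query_fingerprint(query_text, "snowflake", fast=True)
    print(query_id)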
@@ -623,7 +623,7 @@ fingerprinted_queries as (
         query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
         AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
         AND execution_status = 'SUCCESS'
-        AND {users_filter or 'TRUE'}
+        AND {users_filter or "TRUE"}
     )
 , deduplicated_queries as (
     SELECT
@@ -651,7 +651,7 @@ fingerprinted_queries as (
     WHERE
         query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
         AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND {users_filter or 'TRUE'}
+        AND {users_filter or "TRUE"}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
@@ -166,6 +166,3 @@ class SnowflakeV2Report(

     def report_tag_processed(self, tag_name: str) -> None:
         self._processed_tags.add(tag_name)
-
-    def set_ingestion_stage(self, database: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{database}: {stage}")