acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -642,8 +642,11 @@ class TableauUpstreamReference:
 
     @classmethod
     def create(
-        cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None
+        cls, d: Dict, default_schema_map: Optional[Dict[str, str]] = None
     ) -> "TableauUpstreamReference":
+        if d is None:
+            raise ValueError("TableauUpstreamReference.create: d is None")
+
         # Values directly from `table` object from Tableau
         database_dict = (
             d.get(c.DATABASE) or {}
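
The main behavioral change in this hunk is the explicit None guard. A minimal sketch of the effect, with a standalone function standing in for the classmethod (the simplified body and dictionary keys are illustrative, not taken from the module):

from typing import Dict, Optional


def create(d: Dict, default_schema_map: Optional[Dict[str, str]] = None) -> Dict:
    # Mirrors the added guard: reject a missing dict up front with a clear
    # error instead of failing later inside d.get(...).
    if d is None:
        raise ValueError("TableauUpstreamReference.create: d is None")
    database_dict = d.get("database") or {}
    return {"database": database_dict, "schema_map": default_schema_map or {}}


try:
    create(None)  # type: ignore[arg-type]
except ValueError as err:
    print(err)  # -> TableauUpstreamReference.create: d is None
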
@@ -717,7 +720,7 @@ class TableauUpstreamReference:
         # schema
 
         # TODO: Validate the startswith check. Currently required for our integration tests
-        if full_name is None or not full_name.startswith("["):
+        if full_name is None:
             return None
 
         return full_name.replace("[", "").replace("]", "").split(".")
@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
     )
 
     # TODO: Remove `type:ignore` by refactoring config
-    profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field(  # type: ignore
+    profiling: Union[
+        UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+    ] = Field(  # type: ignore
         default=UnityCatalogGEProfilerConfig(),
         description="Data profiling configuration",
         discriminator="method",
@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
 
     @staticmethod
     def _create_metastore(
-        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
     ) -> Optional[Metastore]:
         if not obj.name:
             return None
@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[
-            Tuple[TableReference, str]
-        ] = FileBackedDict()
+        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+            FileBackedDict()
+        )
 
         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
@@ -263,86 +263,86 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.report.report_ingestion_stage_start("Ingestion Setup")
-        wait_on_warehouse = None
-        if self.config.include_hive_metastore:
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Can take several minutes, so start now and wait later
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
-                )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
+        with self.report.new_stage("Ingestion Setup"):
+            wait_on_warehouse = None
+            if self.config.include_hive_metastore:
+                with self.report.new_stage("Start warehouse"):
+                    # Can take several minutes, so start now and wait later
+                    wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                    if wait_on_warehouse is None:
+                        self.report.report_failure(
+                            "initialization",
+                            f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                        )
+                        return
+                    else:
+                        # wait until warehouse is started
+                        wait_on_warehouse.result()
 
         if self.config.include_ownership:
-            self.report.report_ingestion_stage_start("Ingest service principals")
-            self.build_service_principal_map()
-            self.build_groups_map()
+            with self.report.new_stage("Ingest service principals"):
+                self.build_service_principal_map()
+                self.build_groups_map()
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Ingest notebooks")
-            yield from self.process_notebooks()
+            with self.report.new_stage("Ingest notebooks"):
+                yield from self.process_notebooks()
 
         yield from self.process_metastores()
 
         yield from self.get_view_lineage()
 
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Notebook lineage")
-            for notebook in self.notebooks.values():
-                wu = self._gen_notebook_lineage(notebook)
-                if wu:
-                    yield wu
+            with self.report.new_stage("Notebook lineage"):
+                for notebook in self.notebooks.values():
+                    wu = self._gen_notebook_lineage(notebook)
+                    if wu:
+                        yield wu
 
         if self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("Ingest usage")
-            usage_extractor = UnityCatalogUsageExtractor(
-                config=self.config,
-                report=self.report,
-                proxy=self.unity_catalog_api_proxy,
-                table_urn_builder=self.gen_dataset_urn,
-                user_urn_builder=self.gen_user_urn,
-            )
-            yield from usage_extractor.get_usage_workunits(
-                self.table_refs | self.view_refs
-            )
-
-        if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Need to start the warehouse again for profiling,
-            # as it may have been stopped after ingestion might take
-            # longer time to complete
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+            with self.report.new_stage("Ingest usage"):
+                usage_extractor = UnityCatalogUsageExtractor(
+                    config=self.config,
+                    report=self.report,
+                    proxy=self.unity_catalog_api_proxy,
+                    table_urn_builder=self.gen_dataset_urn,
+                    user_urn_builder=self.gen_user_urn,
+                )
+                yield from usage_extractor.get_usage_workunits(
+                    self.table_refs | self.view_refs
                 )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
 
-            self.report.report_ingestion_stage_start("Profiling")
-            if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
-                yield from UnityCatalogAnalyzeProfiler(
-                    self.config.profiling,
-                    self.report,
-                    self.unity_catalog_api_proxy,
-                    self.gen_dataset_urn,
-                ).get_workunits(self.table_refs)
-            elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
-                yield from UnityCatalogGEProfiler(
-                    sql_common_config=self.config,
-                    profiling_config=self.config.profiling,
-                    report=self.report,
-                ).get_workunits(list(self.tables.values()))
-            else:
-                raise ValueError("Unknown profiling config method")
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage("Start warehouse"):
+                # Need to start the warehouse again for profiling,
+                # as it may have been stopped after ingestion might take
+                # longer time to complete
+                wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                if wait_on_warehouse is None:
+                    self.report.report_failure(
+                        "initialization",
+                        f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                    )
+                    return
+                else:
+                    # wait until warehouse is started
+                    wait_on_warehouse.result()
+
+            with self.report.new_stage("Profiling"):
+                if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+                    yield from UnityCatalogAnalyzeProfiler(
+                        self.config.profiling,
+                        self.report,
+                        self.unity_catalog_api_proxy,
+                        self.gen_dataset_urn,
+                    ).get_workunits(self.table_refs)
+                elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+                    yield from UnityCatalogGEProfiler(
+                        sql_common_config=self.config,
+                        profiling_config=self.config.profiling,
+                        report=self.report,
+                    ).get_workunits(list(self.tables.values()))
+                else:
+                    raise ValueError("Unknown profiling config method")
 
     def build_service_principal_map(self) -> None:
         try:
@@ -462,11 +462,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.schemas.dropped(schema.id)
                 continue
 
-            self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}")
-            yield from self.gen_schema_containers(schema)
-            yield from self.process_tables(schema)
+            with self.report.new_stage(f"Ingest schema {schema.id}"):
+                yield from self.gen_schema_containers(schema)
+                yield from self.process_tables(schema)
 
-            self.report.schemas.processed(schema.id)
+                self.report.schemas.processed(schema.id)
 
     def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         for table in self.unity_catalog_api_proxy.tables(schema=schema):
@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
                         query, table_info
                     )
                     for source_table in table_info.source_tables:
-                        with self.report.usage_perf_report.aggregator_add_event_timer:
+                        with (
+                            self.report.usage_perf_report.aggregator_add_event_timer
+                        ):
                             self.usage_aggregator.aggregate_event(
                                 resource=source_table,
                                 start_time=query.start_time,
@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[ClickHouseJoinedAccessEvent]
     ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[ClickHouseTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
            floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
 
            resource = (
-                f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
+                f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                f"{event.database}.{event.table}"
            )
 
@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[TrinoJoinedAccessEvent]
     ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[TrinoTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
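
The defaultdict reflow in the two usage hunks above is purely cosmetic, and the ClickHouse hunk also flips the f-string quoting (double quotes outside, single quotes inside) without changing the resulting resource string. A small equivalence check, using made-up values rather than a real source config:

platform_instance = "prod"
database, table = "default", "events"

# Old style: single-quoted f-string with double quotes inside the expression.
old = f'{platform_instance + "." if platform_instance else ""}' f"{database}.{table}"
# New style: double-quoted f-string with single quotes inside the expression.
new = f"{platform_instance + '.' if platform_instance else ''}" f"{database}.{table}"

assert old == new == "prod.default.events"
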
@@ -89,7 +89,7 @@ def make_usage_workunit(
    top_sql_queries: Optional[List[str]] = None
    if query_freq is not None:
        if top_n_queries < len(query_freq):
-            logger.warn(
+            logger.warning(
                f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
            )
            query_freq = query_freq[0:top_n_queries]
@@ -1,7 +1,7 @@
 import logging
+from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Optional
 
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -22,25 +22,29 @@ PROFILING = "Profiling"
 
 @dataclass
 class IngestionStageReport:
-    ingestion_stage: Optional[str] = None
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
 
-    _timer: Optional[PerfTimer] = field(
-        default=None, init=False, repr=False, compare=False
-    )
-
-    def report_ingestion_stage_start(self, stage: str) -> None:
-        if self._timer:
-            elapsed = round(self._timer.elapsed_seconds(), 2)
-            logger.info(
-                f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds",
-                stacklevel=2,
-            )
-            if self.ingestion_stage:
-                self.ingestion_stage_durations[self.ingestion_stage] = elapsed
-        else:
-            self._timer = PerfTimer()
-
-        self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
-        logger.info(f"Stage started: {self.ingestion_stage}")
+    def new_stage(self, stage: str) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+    def __init__(self, stage: str, report: IngestionStageReport):
+        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        self._timer: PerfTimer = PerfTimer()
+        self._report = report
+
+    def __enter__(self) -> "IngestionStageContext":
+        logger.info(f"Stage started: {self._ingestion_stage}")
         self._timer.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        elapsed = self._timer.elapsed_seconds(digits=2)
+        logger.info(
+            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+            stacklevel=2,
+        )
+        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        return None
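
This hunk replaces the start-only report_ingestion_stage_start() call with a context manager, so a stage's elapsed time is always recorded when its block exits; the with-blocks added across the Unity Catalog hunks above are the call-site half of the same change. A brief usage sketch based on the new API (the stage names are illustrative; the durations dict is keyed by the stage label plus its start timestamp):

from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

report = IngestionStageReport()

with report.new_stage("Ingest schema demo.schema"):
    pass  # emit work units for this stage; elapsed time is recorded on exit

with report.new_stage("Profiling"):
    pass

# One entry per stage, e.g. {"Ingest schema demo.schema at 2024-01-01 00:00:00+00:00": 0.0, ...}
print(report.ingestion_stage_durations)
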
@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
                 ).add_asset(container_urn)
                 data_products_container[data_product_urn] = container_product
             else:
-                data_products_container[
-                    data_product_urn
-                ] = data_products_container[data_product_urn].add_asset(
-                    container_urn
+                data_products_container[data_product_urn] = (
+                    data_products_container[data_product_urn].add_asset(
+                        container_urn
+                    )
                 )
 
         mcps: List[
@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
    ) -> Optional[DatasetPropertiesClass]:
        assert dataset_properties_aspect
 
-        server_dataset_properties_aspect: Optional[
-            DatasetPropertiesClass
-        ] = graph.get_dataset_properties(entity_urn)
+        server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+            graph.get_dataset_properties(entity_urn)
+        )
        # No need to take any action if server properties is None or there is not customProperties in server properties
        if (
            server_dataset_properties_aspect is None
@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
        server_field_map: dict = {}
        if self.config.semantics == TransformerSemantics.PATCH:
            assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
            if server_schema_metadata_aspect is not None:
                if not schema_metadata_aspect:
                    schema_metadata_aspect = server_schema_metadata_aspect
@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
        ] = {}  # Map to cache server field objects, where fieldPath is key
        if self.config.semantics == TransformerSemantics.PATCH:
            assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
            if server_schema_metadata_aspect is not None:
                if not schema_metadata_aspect:
                    schema_metadata_aspect = server_schema_metadata_aspect
@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
            domain_aspect.domains.extend(mapped_domains.domains)
            if self.config.semantics == TransformerSemantics.PATCH:
                # Try merging with server-side domains
-                patch_domain_aspect: Optional[
-                    DomainsClass
-                ] = AddDatasetDomain._merge_with_server_domains(
-                    self.ctx.graph, entity_urn, domain_aspect
+                patch_domain_aspect: Optional[DomainsClass] = (
+                    AddDatasetDomain._merge_with_server_domains(
+                        self.ctx.graph, entity_urn, domain_aspect
+                    )
                )
                return cast(Optional[Aspect], patch_domain_aspect)
        return cast(Optional[Aspect], domain_aspect)
@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
            else:
                owner_type = get_owner_type(self.config.owner_type)
                if owner_type == OwnershipTypeClass.CUSTOM:
-                    assert (
-                        self.config.owner_type_urn is not None
-                    ), "owner_type_urn must be set if owner_type is CUSTOM"
+                    assert self.config.owner_type_urn is not None, (
+                        "owner_type_urn must be set if owner_type is CUSTOM"
+                    )
 
                owners.append(
                    OwnerClass(
@@ -92,9 +92,9 @@ class TagsToTermMapper(TagsToTermTransformer):
        in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags(
            entity_urn
        )
-        in_schema_metadata_aspect: Optional[
-            SchemaMetadataClass
-        ] = self.ctx.graph.get_schema_metadata(entity_urn)
+        in_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+            self.ctx.graph.get_schema_metadata(entity_urn)
+        )
 
        if in_global_tags_aspect is None and in_schema_metadata_aspect is None:
            return cast(Aspect, in_glossary_terms)
@@ -134,10 +134,10 @@ class TagsToTermMapper(TagsToTermTransformer):
        )
 
        if self.config.semantics == TransformerSemantics.PATCH:
-            patch_glossary_terms: Optional[
-                GlossaryTermsClass
-            ] = TagsToTermMapper._merge_with_server_glossary_terms(
-                self.ctx.graph, entity_urn, out_glossary_terms
+            patch_glossary_terms: Optional[GlossaryTermsClass] = (
+                TagsToTermMapper._merge_with_server_glossary_terms(
+                    self.ctx.graph, entity_urn, out_glossary_terms
+                )
            )
            return cast(Optional[Aspect], patch_glossary_terms)
        else:
@@ -61,17 +61,17 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
    def create(
        cls, output_dir: str, extras: Dict[str, str]
    ) -> "SnowflakeAssertionCompiler":
-        assert os.path.exists(
-            output_dir
-        ), f"Specified location {output_dir} does not exist."
+        assert os.path.exists(output_dir), (
+            f"Specified location {output_dir} does not exist."
+        )
 
-        assert os.path.isdir(
-            output_dir
-        ), f"Specified location {output_dir} is not a folder."
+        assert os.path.isdir(output_dir), (
+            f"Specified location {output_dir} is not a folder."
+        )
 
-        assert any(
-            x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras
-        ), "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        assert any(x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras), (
+            "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        )
 
        return SnowflakeAssertionCompiler(output_dir, extras)
 
@@ -232,6 +232,6 @@ def get_dmf_schedule(trigger: AssertionTrigger) -> str:
    elif isinstance(trigger.trigger, CronTrigger):
        return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}"
    elif isinstance(trigger.trigger, IntervalTrigger):
-        return f"{trigger.trigger.interval.seconds/60} MIN"
+        return f"{trigger.trigger.interval.seconds / 60} MIN"
    else:
        raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}")
@@ -163,9 +163,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
 
                if "properties" not in writeable_dict["systemMetadata"]:
                    writeable_dict["systemMetadata"]["properties"] = {}
-                writeable_dict["systemMetadata"]["properties"][
-                    "sysVersion"
-                ] = new_version
+                writeable_dict["systemMetadata"]["properties"]["sysVersion"] = (
+                    new_version
+                )
            if needs_write:
                self.duckdb_client.execute(
                    query="INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?, ?, ?)",
@@ -208,9 +208,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                    "lastObserved": writeable.systemMetadata.lastObserved
                }
            else:
-                system_metadata[
-                    "lastObserved"
-                ] = writeable.systemMetadata.lastObserved
+                system_metadata["lastObserved"] = (
+                    writeable.systemMetadata.lastObserved
+                )
            self.duckdb_client.execute(
                query="UPDATE metadata_aspect_v2 SET system_metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0",
                parameters=[
@@ -497,9 +497,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
            aspect_name = r[1]
            aspect_payload = json.loads(r[2])
            if typed:
-                assert (
-                    aspect_name in ASPECT_MAP
-                ), f"Missing aspect name {aspect_name} in the registry"
+                assert aspect_name in ASPECT_MAP, (
+                    f"Missing aspect name {aspect_name} in the registry"
+                )
                try:
                    aspect_payload = ASPECT_MAP[aspect_name].from_obj(
                        post_json_transform(aspect_payload)
@@ -531,7 +531,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
        for r in results.fetchall():
            urn = r[0]
            aspect_name = r[1]
-            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2])))  # type: ignore
+            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(
+                post_json_transform(json.loads(r[2]))
+            )  # type: ignore
            system_metadata = SystemMetadataClass.from_obj(json.loads(r[3]))
            mcp = MetadataChangeProposalWrapper(
                entityUrn=urn,