acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -248,9 +248,9 @@ class BigQuerySchemaGenerator:
     def get_project_workunits(
         self, project: BigqueryProject
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.set_ingestion_stage(project.id, METADATA_EXTRACTION)
-        logger.info(f"Processing project: {project.id}")
-        yield from self._process_project(project)
+        with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):
+            logger.info(f"Processing project: {project.id}")
+            yield from self._process_project(project)

     def get_dataplatform_instance_aspect(
         self, dataset_urn: str, project_id: str
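Across this release, report.set_ingestion_stage(name, stage) calls are replaced by a with report.new_stage(f"{name}: {stage}") context manager (the supporting change is in datahub/ingestion/source_report/ingestion_stage.py, listed above). A minimal sketch of a context-manager-based stage reporter, purely illustrative and not the actual datahub implementation:

# Illustrative sketch only: a report object whose new_stage() scopes a stage
# with a context manager and records how long it ran. The real
# IngestionStageReport in datahub may track different fields.
import contextlib
import time
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageReportSketch:
    ingestion_stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextlib.contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield  # the caller runs the stage's work inside the `with` block
        finally:
            # The stage ends when the block exits, even if an exception is raised.
            self.ingestion_stage_durations[stage] = round(time.perf_counter() - start, 2)


report = StageReportSketch()
with report.new_stage("project-1: Metadata Extraction"):
    pass  # ...emit workunits...
print(report.ingestion_stage_durations)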
@@ -311,8 +311,10 @@ class BigQuerySchemaGenerator:
                 platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                     label, tag_urn, managed_by_datahub=False
                 )
-                label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
-                    BigQueryLabelInfo
+                label_info: BigQueryLabelInfo = (
+                    platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                        BigQueryLabelInfo
+                    )
                 )
                 tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -405,11 +407,11 @@ class BigQuerySchemaGenerator:

         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            self.report.set_ingestion_stage(project_id, PROFILING)
-            yield from self.profiler.get_workunits(
-                project_id=project_id,
-                tables=db_tables,
-            )
+            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+                yield from self.profiler.get_workunits(
+                    project_id=project_id,
+                    tables=db_tables,
+                )

     def _process_project_datasets(
         self,
@@ -820,8 +822,10 @@ class BigQuerySchemaGenerator:
                 platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                     label, tag_urn, managed_by_datahub=False
                 )
-                label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
-                    BigQueryLabelInfo
+                label_info: BigQueryLabelInfo = (
+                    platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                        BigQueryLabelInfo
+                    )
                 )
                 tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -860,8 +864,10 @@ class BigQuerySchemaGenerator:
                 platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                     label, tag_urn, managed_by_datahub=False
                 )
-                label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
-                    BigQueryLabelInfo
+                label_info: BigQueryLabelInfo = (
+                    platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                        BigQueryLabelInfo
+                    )
                 )
                 tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -1203,8 +1209,8 @@ class BigQuerySchemaGenerator:
                 report=self.report,
             )

-            self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = round(
-                timer.elapsed_seconds(), 2
+            self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = (
+                timer.elapsed_seconds(digits=2)
             )

     def get_core_table_details(
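The timing hunks above also move rounding into the timer: round(timer.elapsed_seconds(), 2) becomes timer.elapsed_seconds(digits=2), matching the datahub/utilities/perf_timer.py change in the file list. A rough sketch of a timer with that signature, shown only to illustrate the call site; the real PerfTimer differs in detail:

# Sketch of a context-manager timer whose elapsed_seconds() rounds for the caller.
# Assumed behaviour for illustration; see datahub/utilities/perf_timer.py for the real class.
import time
from typing import Optional


class PerfTimerSketch:
    def __init__(self) -> None:
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None

    def __enter__(self) -> "PerfTimerSketch":
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, *exc) -> None:
        self.end_time = time.perf_counter()

    def elapsed_seconds(self, digits: int = 4) -> float:
        assert self.start_time is not None, "timer was never started"
        end = self.end_time if self.end_time is not None else time.perf_counter()
        return round(end - self.start_time, digits)


with PerfTimerSketch() as timer:
    sum(range(1_000_000))
print(timer.elapsed_seconds(digits=2))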
@@ -330,11 +330,11 @@ class BigqueryLineageExtractor:
             projects = ["*"]  # project_id not used when using exported metadata

         for project in projects:
-            self.report.set_ingestion_stage(project, LINEAGE_EXTRACTION)
-            yield from self.generate_lineage(
-                project,
-                table_refs,
-            )
+            with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"):
+                yield from self.generate_lineage(
+                    project,
+                    table_refs,
+                )

         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.
@@ -368,8 +368,8 @@ class BigqueryLineageExtractor:
             self.report.lineage_metadata_entries[project_id] = len(lineage)
             logger.info(f"Built lineage map containing {len(lineage)} entries.")
             logger.debug(f"lineage metadata is {lineage}")
-            self.report.lineage_extraction_sec[project_id] = round(
-                timer.elapsed_seconds(), 2
+            self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
             )
             self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
                 memory_footprint.total_size(lineage)
@@ -697,7 +697,7 @@ class BigqueryLineageExtractor:
             if parsed_queries[-1]:
                 query = f"""create table `{destination_table.get_sanitized_table_ref().table_identifier.get_table_name()}` AS
                 (
-                    {parsed_queries[-1].sql(dialect='bigquery')}
+                    {parsed_queries[-1].sql(dialect="bigquery")}
                 )"""
             else:
                 query = e.query
@@ -809,11 +809,11 @@ class BigqueryLineageExtractor:
                             upstream_lineage, temp_table_upstream
                         )

-                        upstreams[
-                            ref_temp_table_upstream
-                        ] = _merge_lineage_edge_columns(
-                            upstreams.get(ref_temp_table_upstream),
-                            collapsed_lineage,
+                        upstreams[ref_temp_table_upstream] = (
+                            _merge_lineage_edge_columns(
+                                upstreams.get(ref_temp_table_upstream),
+                                collapsed_lineage,
+                            )
                         )
                 else:
                     upstreams[upstream_table_ref] = _merge_lineage_edge_columns(
@@ -1004,9 +1004,9 @@ class BigqueryLineageExtractor:
                 dataset_urn
             )
             for gcs_dataset_urn in gcs_urns:
-                schema_metadata_for_gcs: Optional[
-                    SchemaMetadataClass
-                ] = graph.get_schema_metadata(gcs_dataset_urn)
+                schema_metadata_for_gcs: Optional[SchemaMetadataClass] = (
+                    graph.get_schema_metadata(gcs_dataset_urn)
+                )
                 if schema_metadata and schema_metadata_for_gcs:
                     fine_grained_lineage = self.get_fine_grained_lineages_with_gcs(
                         dataset_urn,
@@ -387,9 +387,7 @@ AND
     OR
     protoPayload.metadata.tableDataRead.reason = "JOB"
 )
-""".strip(
-    "\t \n"
-)
+""".strip("\t \n")


 def bigquery_audit_metadata_query_template_lineage(
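The change above is formatting only: str.strip(chars) treats its argument as a set of characters to remove from both ends, so collapsing the call onto one line leaves the rendered SQL template unchanged. A quick standard-library illustration:

# str.strip("\t \n") removes any mix of tabs, spaces, and newlines from both ends.
template = """
    SELECT 1
"""
assert template.strip("\t \n") == "SELECT 1"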
@@ -271,9 +271,9 @@ class BigQueryQueriesExtractor(Closeable):
         # Preprocessing stage that deduplicates the queries using query hash per usage bucket
         # Note: FileBackedDict is an ordered dictionary, so the order of execution of
         # queries is inherently maintained
-        queries_deduped: FileBackedDict[
-            Dict[int, ObservedQuery]
-        ] = self.deduplicate_queries(queries)
+        queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] = (
+            self.deduplicate_queries(queries)
+        )
         self.report.num_unique_queries = len(queries_deduped)
         logger.info(f"Found {self.report.num_unique_queries} unique queries")

@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS)
-        for audit_event in usage_state.standalone_events():
-            try:
-                operational_wu = self._create_operation_workunit(
-                    audit_event, table_refs
-                )
-                if operational_wu:
-                    yield operational_wu
-                    self.report.num_operational_stats_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate operation workunit",
-                    context=f"{audit_event}",
-                    exc=e,
-                )
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )

     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION)
-        top_n = (
-            self.config.usage.top_n_queries
-            if self.config.usage.include_top_n_queries
-            else 0
-        )
-        for entry in usage_state.usage_statistics(top_n=top_n):
-            try:
-                query_freq = [
-                    (
-                        self.uuid_to_query.get(
-                            query_hash, usage_state.queries[query_hash]
-                        ),
-                        count,
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                     )
-                    for query_hash, count in entry.query_freq
-                ]
-                yield make_usage_workunit(
-                    bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                    resource=BigQueryTableRef.from_string_name(entry.resource),
-                    query_count=entry.query_count,
-                    query_freq=query_freq,
-                    user_freq=entry.user_freq,
-                    column_freq=entry.column_freq,
-                    bucket_duration=self.config.bucket_duration,
-                    resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    format_sql_queries=self.config.usage.format_sql_queries,
-                    queries_character_limit=self.config.usage.queries_character_limit,
-                )
-                self.report.num_usage_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate usage statistics workunit",
-                    context=f"{entry.timestamp}, {entry.resource}",
-                    exc=e,
-                )

     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:
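Note that _generate_operational_workunits and _generate_usage_workunits are generator functions, so the new with report.new_stage(...) block is entered on the first next() call and stays open across every yield, closing only when the generator is exhausted or closed. A plain-Python illustration of that ordering (not datahub code):

# Shows when a `with` block inside a generator is entered and exited.
from contextlib import contextmanager


@contextmanager
def stage(name):
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")


def workunits():
    with stage("usage aggregation"):
        for i in range(2):
            yield i  # the stage stays open between these yields


for wu in workunits():
    print("got", wu)
# Prints: enter usage aggregation, got 0, got 1, exit usage aggregation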
@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.set_ingestion_stage(
-                        project_id, USAGE_EXTRACTION_INGESTION
-                    )
-                    yield from self._get_parsed_bigquery_log_events(project_id)
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(
@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                     )
                     self.report_status(f"usage-extraction-{project_id}", False)

-            self.report.usage_extraction_sec[project_id] = round(
-                timer.elapsed_seconds(), 2
+            self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
             )

     def _store_usage_event(
@@ -763,9 +763,9 @@ class BigQueryUsageExtractor:
                 )

                 if event.query_event.default_dataset:
-                    custom_properties[
-                        "defaultDatabase"
-                    ] = event.query_event.default_dataset
+                    custom_properties["defaultDatabase"] = (
+                        event.query_event.default_dataset
+                    )
             if event.read_event:
                 if event.read_event.readReason:
                     custom_properties["readReason"] = event.read_event.readReason
@@ -91,7 +91,6 @@ class KeyspaceKey(ContainerKey):
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):
-
     """
     This plugin extracts the following:

@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.set_ingestion_stage(keyspace_name, PROFILING)
-            with ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
-                future_to_dataset = {
-                    executor.submit(
-                        self.generate_profile,
-                        keyspace_name,
-                        table_name,
-                        cassandra_data.columns.get(table_name, []),
-                    ): table_name
-                    for table_name in tables
-                }
-                for future in as_completed(future_to_dataset):
-                    table_name = future_to_dataset[future]
-                    try:
-                        yield from future.result()
-                    except Exception as exc:
-                        self.report.profiling_skipped_other[table_name] += 1
-                        self.report.failure(
-                            message="Failed to profile for table",
-                            context=f"{keyspace_name}.{table_name}",
-                            exc=exc,
-                        )
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )

     def generate_profile(
         self,
@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport
         else:
             raise KeyError(f"Unknown entity {ent_type}.")

-    def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
     # TODO Need to create seperate common config for profiling report
     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
@@ -110,10 +107,10 @@ class CassandraToSchemaFieldConverter:

     @staticmethod
     def get_column_type(cassandra_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-            Type
-        ] = CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
-            cassandra_column_type
+        type_class: Optional[Type] = (
+            CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                cassandra_column_type
+            )
         )
         if type_class is None:
             logger.warning(
@@ -293,9 +293,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
     def _load_json_schema_with_resolved_references(
         self, schema: Schema, name: str, subject: str
     ) -> dict:
-        imported_json_schemas: List[
-            JsonSchemaWrapper
-        ] = self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+        imported_json_schemas: List[JsonSchemaWrapper] = (
+            self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+        )
         schema_dict = json.loads(schema.schema_str)
         reference_map = {}
         for imported_schema in imported_json_schemas:
@@ -332,9 +332,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
             )

         elif schema.schema_type == "PROTOBUF":
-            imported_schemas: List[
-                ProtobufSchema
-            ] = self.get_schemas_from_confluent_ref_protobuf(schema)
+            imported_schemas: List[ProtobufSchema] = (
+                self.get_schemas_from_confluent_ref_protobuf(schema)
+            )
             base_name: str = topic.replace(".", "_")
             fields = protobuf_util.protobuf_schema_to_mce_fields(
                 ProtobufSchema(
@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
         domain: Optional[str],
         description: Optional[str],
     ) -> Iterable[MetadataWorkUnit]:
-        maybe_terms_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_glossary_terms_work_unit(
-            entity_urn=entity_urn,
-            term_associations=term_associations,
+        maybe_terms_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_glossary_terms_work_unit(
+                entity_urn=entity_urn,
+                term_associations=term_associations,
+            )
         )
         if maybe_terms_wu:
             self.report.num_glossary_term_workunits_produced += 1
@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
             self.report.num_tag_workunits_produced += 1
             yield maybe_tags_wu

-        maybe_owners_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_owners_work_unit(
-            entity_urn=entity_urn,
-            owners=owners,
+        maybe_owners_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_owners_work_unit(
+                entity_urn=entity_urn,
+                owners=owners,
+            )
         )
         if maybe_owners_wu:
             self.report.num_owners_workunits_produced += 1
             yield maybe_owners_wu

-        maybe_domain_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_domain_work_unit(
-            entity_urn=entity_urn,
-            domain=domain,
+        maybe_domain_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_domain_work_unit(
+                entity_urn=entity_urn,
+                domain=domain,
+            )
         )
         if maybe_domain_wu:
             self.report.num_domain_workunits_produced += 1
             yield maybe_domain_wu

-        maybe_description_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_description_work_unit(
-            entity_urn=entity_urn,
-            description=description,
+        maybe_description_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_description_work_unit(
+                entity_urn=entity_urn,
+                description=description,
+            )
         )
         if maybe_description_wu:
             self.report.num_description_workunits_produced += 1
426
426
  needs_write: bool,
427
427
  ) -> Tuple[EditableSchemaMetadataClass, bool]:
428
428
  field_path: str = sub_resource_row.field_path
429
- term_associations: List[
430
- GlossaryTermAssociationClass
431
- ] = sub_resource_row.term_associations
429
+ term_associations: List[GlossaryTermAssociationClass] = (
430
+ sub_resource_row.term_associations
431
+ )
432
432
  tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
433
433
  description: Optional[str] = sub_resource_row.description
434
434
  has_terms: bool = len(term_associations) > 0
@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
517
517
  # Boolean field to tell whether we need to write an MCPW.
518
518
  needs_write = False
519
519
 
520
- current_editable_schema_metadata: Optional[
521
- EditableSchemaMetadataClass
522
- ] = None
520
+ current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
521
+ None
522
+ )
523
523
  if self.ctx.graph and not self.should_overwrite:
524
524
  # Fetch the current editable schema metadata
525
525
  current_editable_schema_metadata = self.ctx.graph.get_aspect(
@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
655
655
  entity_urn = row["resource"]
656
656
  entity_type = Urn.from_string(row["resource"]).get_type()
657
657
 
658
- term_associations: List[
659
- GlossaryTermAssociationClass
660
- ] = self.maybe_extract_glossary_terms(row)
658
+ term_associations: List[GlossaryTermAssociationClass] = (
659
+ self.maybe_extract_glossary_terms(row)
660
+ )
661
661
  tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
662
662
  owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)
663
663
 
@@ -25,6 +25,10 @@ DEFAULT_EXCLUDE_ASPECTS = {
     "globalSettingsKey",
     "globalSettingsInfo",
     "testResults",
+    "dataHubExecutionRequestKey",
+    "dataHubExecutionRequestInput",
+    "dataHubExecutionRequestSignal",
+    "dataHubExecutionRequestResult",
 }


@@ -108,6 +112,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):

     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())

+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
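For context, the new flag sits alongside the existing datahub source options and could be enabled in a recipe roughly as sketched below. The connection settings are placeholders and this is not a complete recipe; only drop_duplicate_schema_fields is new in this release.

# Hypothetical, abbreviated DataHub-to-DataHub recipe expressed as a Python dict;
# the same shape is normally written as YAML and passed to `datahub ingest`.
# The database/kafka connection details required by the datahub source are omitted.
recipe = {
    "source": {
        "type": "datahub",
        "config": {
            # New in 0.15.0.2: drop duplicate schemaMetadata field paths before
            # pushing to a server that enforces uniqueness.
            "drop_duplicate_schema_fields": True,
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},
    },
}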
@@ -152,7 +152,9 @@ class DataHubDatabaseReader:
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
-                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                with (
+                    conn.begin()
+                ):  # Transaction required for PostgreSQL server-side cursor
                     # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
                     # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
@@ -222,7 +224,7 @@ class DataHubDatabaseReader:
             )
         except Exception as e:
             logger.warning(
-                f'Failed to parse metadata for {row["urn"]}: {e}', exc_info=True
+                f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
             )
             self.report.num_database_parse_errors += 1
             self.report.database_parse_errors.setdefault(
@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [partial(auto_workunit_reporter, self.get_report())]
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
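get_workunit_processors now returns an optional deduplication processor ahead of the reporter; the return type is List[Optional[MetadataWorkUnitProcessor]], so a None entry simply contributes nothing and the flag cleanly toggles the extra pass. A simplified model of that composition (the real wiring in datahub's source base class is more involved):

# Minimal model of chaining optional workunit processors over a stream.
from typing import Callable, Iterable, List, Optional

WorkUnit = str  # stand-in for MetadataWorkUnit
Processor = Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]]


def apply_processors(
    stream: Iterable[WorkUnit], processors: List[Optional[Processor]]
) -> Iterable[WorkUnit]:
    for processor in processors:
        if processor is not None:  # None entries are simply skipped
            stream = processor(stream)
    return stream


def dedupe(stream: Iterable[WorkUnit]) -> Iterable[WorkUnit]:
    seen = set()
    for wu in stream:
        if wu not in seen:
            seen.add(wu)
            yield wu


print(list(apply_processors(["a", "a", "b"], [dedupe, None])))  # ['a', 'b']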