acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.3rc1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (161)
  1. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/METADATA +2378 -2380
  2. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/RECORD +161 -161
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/delete_cli.py +16 -2
  12. datahub/cli/docker_cli.py +6 -6
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  24. datahub/ingestion/api/report.py +1 -2
  25. datahub/ingestion/api/source_helpers.py +1 -1
  26. datahub/ingestion/extractor/json_schema_util.py +3 -3
  27. datahub/ingestion/extractor/schema_util.py +3 -5
  28. datahub/ingestion/fs/s3_fs.py +3 -3
  29. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  30. datahub/ingestion/graph/client.py +4 -6
  31. datahub/ingestion/run/pipeline.py +8 -7
  32. datahub/ingestion/run/pipeline_config.py +3 -3
  33. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  34. datahub/ingestion/source/abs/source.py +19 -8
  35. datahub/ingestion/source/aws/glue.py +11 -11
  36. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  37. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  38. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  39. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  40. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  42. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  43. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  44. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  45. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  46. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  47. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  48. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  49. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  50. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  51. datahub/ingestion/source/csv_enricher.py +29 -29
  52. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  53. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  54. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  55. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  56. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  57. datahub/ingestion/source/elastic_search.py +4 -4
  58. datahub/ingestion/source/fivetran/config.py +4 -0
  59. datahub/ingestion/source/fivetran/fivetran.py +15 -5
  60. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
  61. datahub/ingestion/source/gcs/gcs_source.py +5 -3
  62. datahub/ingestion/source/ge_data_profiler.py +4 -5
  63. datahub/ingestion/source/ge_profiling_config.py +3 -3
  64. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  65. datahub/ingestion/source/identity/azure_ad.py +3 -3
  66. datahub/ingestion/source/identity/okta.py +3 -3
  67. datahub/ingestion/source/kafka/kafka.py +11 -9
  68. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  69. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  70. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  71. datahub/ingestion/source/looker/looker_common.py +19 -19
  72. datahub/ingestion/source/looker/looker_config.py +3 -3
  73. datahub/ingestion/source/looker/looker_source.py +25 -25
  74. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  75. datahub/ingestion/source/looker/looker_usage.py +5 -7
  76. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  77. datahub/ingestion/source/looker/lookml_source.py +13 -15
  78. datahub/ingestion/source/looker/view_upstream.py +5 -5
  79. datahub/ingestion/source/mlflow.py +4 -4
  80. datahub/ingestion/source/mongodb.py +6 -4
  81. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  82. datahub/ingestion/source/nifi.py +24 -26
  83. datahub/ingestion/source/openapi.py +9 -9
  84. datahub/ingestion/source/powerbi/config.py +12 -12
  85. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  87. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  88. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  89. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  90. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  91. datahub/ingestion/source/redshift/config.py +3 -3
  92. datahub/ingestion/source/redshift/query.py +77 -47
  93. datahub/ingestion/source/redshift/redshift.py +12 -12
  94. datahub/ingestion/source/redshift/usage.py +8 -8
  95. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  96. datahub/ingestion/source/s3/source.py +1 -1
  97. datahub/ingestion/source/salesforce.py +26 -25
  98. datahub/ingestion/source/schema/json_schema.py +1 -1
  99. datahub/ingestion/source/sigma/sigma.py +3 -3
  100. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  101. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  102. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  103. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  104. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  105. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  106. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  107. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  108. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  109. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  110. datahub/ingestion/source/sql/athena.py +1 -3
  111. datahub/ingestion/source/sql/clickhouse.py +8 -14
  112. datahub/ingestion/source/sql/oracle.py +1 -3
  113. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  114. datahub/ingestion/source/sql/teradata.py +16 -3
  115. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  116. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  117. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  118. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  119. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  120. datahub/ingestion/source/tableau/tableau.py +48 -49
  121. datahub/ingestion/source/unity/config.py +3 -1
  122. datahub/ingestion/source/unity/proxy.py +1 -1
  123. datahub/ingestion/source/unity/source.py +3 -3
  124. datahub/ingestion/source/unity/usage.py +3 -1
  125. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  126. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  127. datahub/ingestion/source/usage/usage_common.py +1 -1
  128. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  129. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  130. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  131. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  132. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  133. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  134. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  135. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  136. datahub/lite/duckdb_lite.py +12 -10
  137. datahub/metadata/_schema_classes.py +1 -1
  138. datahub/metadata/schema.avsc +6 -2
  139. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  140. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  141. datahub/secret/secret_common.py +14 -8
  142. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  143. datahub/sql_parsing/schema_resolver.py +5 -10
  144. datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
  145. datahub/sql_parsing/sqlglot_lineage.py +5 -4
  146. datahub/sql_parsing/sqlglot_utils.py +3 -2
  147. datahub/telemetry/stats.py +1 -2
  148. datahub/testing/mcp_diff.py +1 -1
  149. datahub/utilities/file_backed_collections.py +10 -10
  150. datahub/utilities/hive_schema_to_avro.py +2 -2
  151. datahub/utilities/logging_manager.py +2 -2
  152. datahub/utilities/lossy_collections.py +3 -3
  153. datahub/utilities/mapping.py +3 -3
  154. datahub/utilities/serialized_lru_cache.py +3 -1
  155. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  156. datahub/utilities/sqllineage_patch.py +1 -1
  157. datahub/utilities/stats_collections.py +3 -1
  158. datahub/utilities/urns/urn_iter.py +2 -2
  159. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/WHEEL +0 -0
  160. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/entry_points.txt +0 -0
  161. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/top_level.txt +0 -0
@@ -519,9 +519,9 @@ class DataHubGraph(DatahubRestEmitter):
  :return: Optionally, a map of aspect_name to aspect_value as a dictionary if present, aspect_value will be set to None if that aspect was not found. Returns None on HTTP status 404.
  :raises HttpError: if the HTTP response is not a 200
  """
- assert len(aspects) == len(
- aspect_types
- ), f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})"
+ assert len(aspects) == len(aspect_types), (
+ f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})"
+ )

  # TODO: generate aspects list from type classes
  response_json = self.get_entity_raw(entity_urn, aspects)
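Most of the hunks shown in this diff are formatting-only changes like the one above: the parentheses in a multi-line assert move from around the condition to around the message, matching the style newer auto-formatters emit. A minimal sketch (not taken from the package; the values are invented) showing that both spellings behave identically, and the tuple pitfall both deliberately avoid:

    # Illustrative only -- not part of the acryl-datahub sources.
    aspects = ["status", "ownership"]
    aspect_types = [dict, dict]

    # Old formatting: parentheses around the condition.
    assert (
        len(aspects) == len(aspect_types)
    ), f"requested {len(aspects)} aspects, got {len(aspect_types)} types"

    # New formatting in this release: parentheses around the message.
    assert len(aspects) == len(aspect_types), (
        f"requested {len(aspects)} aspects, got {len(aspect_types)} types"
    )

    # Pitfall both styles avoid: wrapping the whole pair makes a two-element
    # tuple, which is always truthy, so the assert could never fail.
    # assert (len(aspects) == len(aspect_types), "message silently ignored")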
@@ -1576,9 +1576,7 @@ class DataHubGraph(DatahubRestEmitter):
  ... assertionResult
  }
  }
- """ % (
- self._assertion_result_shared()
- )
+ """ % (self._assertion_result_shared())

  variables = {
  "assertionUrn": urn,
@@ -76,8 +76,9 @@ class LoggingCallback(WriteCallback):
  failure_metadata: dict,
  ) -> None:
  logger.error(
- f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}"
- f" with {failure_exception} and info {failure_metadata}"
+ f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
+ extra={"failure_metadata": failure_metadata},
+ exc_info=failure_exception,
  )

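Unlike most of the surrounding hunks, the LoggingCallback change above is behavioral: instead of interpolating the exception and failure metadata into the message string, they are passed through the standard logging keywords, so handlers see the full traceback and the metadata as structured fields on the LogRecord. A small self-contained sketch of the same pattern (the names here are illustrative, not the package's):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("sink")

    def report_failure(name: str, workunit_id: str, exc: Exception, metadata: dict) -> None:
        # exc_info renders the exception type, message, and traceback;
        # extra attaches metadata as attributes on the LogRecord instead of
        # baking it into the message text.
        logger.error(
            f"{name} failed to write record with workunit {workunit_id}",
            extra={"failure_metadata": metadata},
            exc_info=exc,
        )

    try:
        raise ValueError("boom")
    except ValueError as e:
        report_failure("console-sink", "wu-1", e, {"attempt": 1})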
@@ -108,9 +109,9 @@ class DeadLetterQueueCallback(WriteCallback):
  mcp.systemMetadata.properties = {}
  if "workunit_id" not in mcp.systemMetadata.properties:
  # update the workunit id
- mcp.systemMetadata.properties[
- "workunit_id"
- ] = record_envelope.metadata["workunit_id"]
+ mcp.systemMetadata.properties["workunit_id"] = (
+ record_envelope.metadata["workunit_id"]
+ )
  record_envelope.record = mcp
  self.file_sink.write_record_async(record_envelope, self.logging_callback)

@@ -700,7 +701,7 @@ class Pipeline:
  num_failures_sink = len(self.sink.get_report().failures)
  click.secho(
  message_template.format(
- status=f"with at least {num_failures_source+num_failures_sink} failures"
+ status=f"with at least {num_failures_source + num_failures_sink} failures"
  ),
  fg=self._get_text_color(
  running=currently_running, failures=True, warnings=False
@@ -718,7 +719,7 @@ class Pipeline:
  num_warn_global = len(global_warnings)
  click.secho(
  message_template.format(
- status=f"with at least {num_warn_source+num_warn_sink+num_warn_global} warnings"
+ status=f"with at least {num_warn_source + num_warn_sink + num_warn_global} warnings"
  ),
  fg=self._get_text_color(
  running=currently_running, failures=False, warnings=True
@@ -92,9 +92,9 @@ class PipelineConfig(ConfigModel):
  pipeline_name: Optional[str] = None
  failure_log: FailureLoggingConfig = FailureLoggingConfig()

- _raw_dict: Optional[
- dict
- ] = None # the raw dict that was parsed to construct this config
+ _raw_dict: Optional[dict] = (
+ None # the raw dict that was parsed to construct this config
+ )

  @validator("run_id", pre=True, always=True)
  def run_id_should_be_semantic(
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
  if field_level_metric.startswith("include_field_"):
  values.setdefault(field_level_metric, False)

- assert (
- max_num_fields_to_profile is None
- ), f"{max_num_fields_to_profile_key} should be set to None"
+ assert max_num_fields_to_profile is None, (
+ f"{max_num_fields_to_profile_key} should be set to None"
+ )

  return values
@@ -508,7 +508,12 @@ class ABSSource(StatefulIngestionSourceBase):
  ):
  abs_path = self.create_abs_path(obj.name)
  logger.debug(f"Sampling file: {abs_path}")
- yield abs_path, obj.name, obj.last_modified, obj.size,
+ yield (
+ abs_path,
+ obj.name,
+ obj.last_modified,
+ obj.size,
+ )
  except Exception as e:
  # This odd check if being done because boto does not have a proper exception to catch
  # The exception that appears in stacktrace cannot actually be caught without a lot more work
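The yield changes in this and the next ABS source hunks are also purely cosmetic: a bare expression list with a trailing comma and the same list wrapped in explicit parentheses produce the same tuple. A tiny illustrative sketch (generic values, nothing from the package):

    # Illustrative only: both generators yield identical tuples.
    def bare():
        yield "abs://container/file.csv", "file.csv", 1024,

    def wrapped():
        yield (
            "abs://container/file.csv",
            "file.csv",
            1024,
        )

    assert next(bare()) == next(wrapped())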
@@ -552,9 +557,12 @@ class ABSSource(StatefulIngestionSourceBase):
  if os.path.isfile(prefix):
  logger.debug(f"Scanning single local file: {prefix}")
  file_name = prefix
- yield prefix, file_name, datetime.utcfromtimestamp(
- os.path.getmtime(prefix)
- ), os.path.getsize(prefix)
+ yield (
+ prefix,
+ file_name,
+ datetime.utcfromtimestamp(os.path.getmtime(prefix)),
+ os.path.getsize(prefix),
+ )
  else:
  logger.debug(f"Scanning files under local folder: {prefix}")
  for root, dirs, files in os.walk(prefix):
@@ -565,9 +573,12 @@ class ABSSource(StatefulIngestionSourceBase):
  full_path = PurePath(
  os.path.normpath(os.path.join(root, file))
  ).as_posix()
- yield full_path, file, datetime.utcfromtimestamp(
- os.path.getmtime(full_path)
- ), os.path.getsize(full_path)
+ yield (
+ full_path,
+ file,
+ datetime.utcfromtimestamp(os.path.getmtime(full_path)),
+ os.path.getsize(full_path),
+ )

  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
  self.container_WU_creator = ContainerWUCreator(
@@ -613,7 +624,7 @@ class ABSSource(StatefulIngestionSourceBase):
  table_data.table_path
  ].timestamp = table_data.timestamp

- for guid, table_data in table_dict.items():
+ for _, table_data in table_dict.items():
  yield from self.ingest_table(table_data, path_spec)

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
@@ -521,7 +521,7 @@ class GlueSource(StatefulIngestionSourceBase):
  # otherwise, a node represents a transformation
  else:
  node_urn = mce_builder.make_data_job_urn_with_flow(
- flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
+ flow_urn, job_id=f"{node['NodeType']}-{node['Id']}"
  )

  return {
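Several hunks in the Glue and SageMaker sources (and a few later ones) only flip the quote characters in f-strings: f'{node["Id"]}' becomes f"{node['Id']}". The rendered text is identical; this is the double-quotes-outside style auto-formatters prefer. A one-off sketch confirming the equivalence (values invented for illustration):

    # Illustrative only: both f-strings render the same string.
    node = {"NodeType": "Join", "Id": "42"}
    old_style = f'{node["NodeType"]}-{node["Id"]}'
    new_style = f"{node['NodeType']}-{node['Id']}"
    assert old_style == new_style == "Join-42"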
@@ -679,7 +679,7 @@ class GlueSource(StatefulIngestionSourceBase):
  )
  )

- return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
+ return MetadataWorkUnit(id=f"{job_name}-{node['Id']}", mce=mce)

  def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
  logger.debug("Getting all databases")
@@ -750,13 +750,13 @@ class GlueSource(StatefulIngestionSourceBase):
  ) -> Optional[MetadataWorkUnit]:
  if self.source_config.emit_s3_lineage:
  # extract dataset properties aspect
- dataset_properties: Optional[
- DatasetPropertiesClass
- ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+ dataset_properties: Optional[DatasetPropertiesClass] = (
+ mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+ )
  # extract dataset schema aspect
- schema_metadata: Optional[
- SchemaMetadataClass
- ] = mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+ schema_metadata: Optional[SchemaMetadataClass] = (
+ mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+ )

  if dataset_properties and "Location" in dataset_properties.customProperties:
  location = dataset_properties.customProperties["Location"]
@@ -765,9 +765,9 @@ class GlueSource(StatefulIngestionSourceBase):
  location, self.source_config.env
  )
  assert self.ctx.graph
- schema_metadata_for_s3: Optional[
- SchemaMetadataClass
- ] = self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+ schema_metadata_for_s3: Optional[SchemaMetadataClass] = (
+ self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+ )

  if self.source_config.glue_s3_lineage_direction == "upstream":
  fine_grained_lineages = None
@@ -40,7 +40,7 @@ def get_s3_tags(
  ]
  )
  except s3.meta.client.exceptions.ClientError:
- logger.warn(f"No tags found for bucket={bucket_name}")
+ logger.warning(f"No tags found for bucket={bucket_name}")

  if use_s3_object_tags and key_name is not None:
  s3_client = aws_config.get_s3_client()
@@ -53,7 +53,7 @@ def get_s3_tags(
  else:
  # Unlike bucket tags, if an object does not have tags, it will just return an empty array
  # as opposed to an exception.
- logger.warn(f"No tags found for bucket={bucket_name} key={key_name}")
+ logger.warning(f"No tags found for bucket={bucket_name} key={key_name}")
  if len(tags_to_add) == 0:
  return None
  if ctx.graph is not None:
@@ -65,7 +65,7 @@ def get_s3_tags(
  if current_tags:
  tags_to_add.extend([current_tag.tag for current_tag in current_tags.tags])
  else:
- logger.warn("Could not connect to DatahubApi. No current tags to maintain")
+ logger.warning("Could not connect to DatahubApi. No current tags to maintain")
  # Remove duplicate tags
  tags_to_add = sorted(list(set(tags_to_add)))
  new_tags = GlobalTagsClass(
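The three hunks above swap logger.warn for logger.warning. Logger.warn has long been a deprecated alias of warning (and newer interpreter releases may drop it entirely), so the change is behavior-neutral apart from silencing a DeprecationWarning. A hedged sketch that surfaces the warning where the alias still exists:

    import logging
    import warnings

    logger = logging.getLogger("s3_tags_demo")
    logger.addHandler(logging.NullHandler())

    logger.warning("No tags found for bucket=my-bucket")  # preferred spelling

    # The old alias is only present on interpreters that still ship it.
    if hasattr(logger, "warn"):
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            logger.warn("No tags found for bucket=my-bucket")  # deprecated alias
        assert any(issubclass(w.category, DeprecationWarning) for w in caught)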
@@ -257,7 +257,7 @@ class FeatureGroupProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

  return MetadataWorkUnit(
- id=f'{feature_group_details["FeatureGroupName"]}-{feature["FeatureName"]}',
+ id=f"{feature_group_details['FeatureGroupName']}-{feature['FeatureName']}",
  mce=mce,
  )

@@ -212,7 +212,7 @@ class ModelProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=endpoint_snapshot)

  return MetadataWorkUnit(
- id=f'{endpoint_details["EndpointName"]}',
+ id=f"{endpoint_details['EndpointName']}",
  mce=mce,
  )

@@ -503,7 +503,7 @@ class ModelProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=model_snapshot)

  return MetadataWorkUnit(
- id=f'{model_details["ModelName"]}',
+ id=f"{model_details['ModelName']}",
  mce=mce,
  )

@@ -132,9 +132,9 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
  self.filters = BigQueryFilter(self.config, self.report)
  self.identifiers = BigQueryIdentifierBuilder(self.config, self.report)

- redundant_lineage_run_skip_handler: Optional[
- RedundantLineageRunSkipHandler
- ] = None
+ redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
+ None
+ )
  if self.config.enable_stateful_lineage_ingestion:
  redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
  source=self,
@@ -37,9 +37,9 @@ class BigqueryTableIdentifier:

  # Note: this regex may get overwritten by the sharded_table_pattern config.
  # The class-level constant, however, will not be overwritten.
- _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[
- str
- ] = _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+ _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[str] = (
+ _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+ )
  _BIGQUERY_WILDCARD_REGEX: ClassVar[str] = "((_(\\d+)?)\\*$)|\\*$"
  _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd"

@@ -137,9 +137,9 @@ class BigQueryCredential(ConfigModel):
  @root_validator(skip_on_failure=True)
  def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
  if values.get("client_x509_cert_url") is None:
- values[
- "client_x509_cert_url"
- ] = f'https://www.googleapis.com/robot/v1/metadata/x509/{values["client_email"]}'
+ values["client_x509_cert_url"] = (
+ f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+ )
  return values

  def create_credential_temp_file(self) -> str:
@@ -611,9 +611,9 @@ class BigQueryV2Config(
  cls, v: Optional[List[str]], values: Dict
  ) -> Optional[List[str]]:
  if values.get("use_exported_bigquery_audit_metadata"):
- assert (
- v and len(v) > 0
- ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+ assert v and len(v) > 0, (
+ "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+ )

  return v

@@ -87,9 +87,9 @@ class BigQueryPlatformResourceHelper:
  key=platform_resource_key, graph_client=self.graph
  )
  if platform_resource:
- self.platform_resource_cache[
- platform_resource_key.primary_key
- ] = platform_resource
+ self.platform_resource_cache[platform_resource_key.primary_key] = (
+ platform_resource
+ )
  return platform_resource
  return None

@@ -115,7 +115,11 @@ class BigQueryPlatformResourceHelper:
  and platform_resource.resource_info.value
  ):
  try:
- existing_info: Optional[BigQueryLabelInfo] = platform_resource.resource_info.value.as_pydantic_object(BigQueryLabelInfo) # type: ignore
+ existing_info: Optional[BigQueryLabelInfo] = (
+ platform_resource.resource_info.value.as_pydantic_object( # type: ignore
+ BigQueryLabelInfo
+ )
+ )
  except ValidationError as e:
  logger.error(
  f"Error converting existing value to BigQueryLabelInfo: {e}. Creating new one. Maybe this is because of a non backward compatible schema change."
@@ -311,8 +311,10 @@ class BigQuerySchemaGenerator:
  platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
  label, tag_urn, managed_by_datahub=False
  )
- label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object( # type: ignore
- BigQueryLabelInfo
+ label_info: BigQueryLabelInfo = (
+ platform_resource.resource_info.value.as_pydantic_object( # type: ignore
+ BigQueryLabelInfo
+ )
  )
  tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -820,8 +822,10 @@ class BigQuerySchemaGenerator:
  platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
  label, tag_urn, managed_by_datahub=False
  )
- label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object( # type: ignore
- BigQueryLabelInfo
+ label_info: BigQueryLabelInfo = (
+ platform_resource.resource_info.value.as_pydantic_object( # type: ignore
+ BigQueryLabelInfo
+ )
  )
  tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -860,8 +864,10 @@ class BigQuerySchemaGenerator:
  platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
  label, tag_urn, managed_by_datahub=False
  )
- label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object( # type: ignore
- BigQueryLabelInfo
+ label_info: BigQueryLabelInfo = (
+ platform_resource.resource_info.value.as_pydantic_object( # type: ignore
+ BigQueryLabelInfo
+ )
  )
  tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -1203,9 +1209,9 @@ class BigQuerySchemaGenerator:
  report=self.report,
  )

- self.report.metadata_extraction_sec[
- f"{project_id}.{dataset.name}"
- ] = timer.elapsed_seconds(digits=2)
+ self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = (
+ timer.elapsed_seconds(digits=2)
+ )

  def get_core_table_details(
  self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str
@@ -697,7 +697,7 @@ class BigqueryLineageExtractor:
  if parsed_queries[-1]:
  query = f"""create table `{destination_table.get_sanitized_table_ref().table_identifier.get_table_name()}` AS
  (
- {parsed_queries[-1].sql(dialect='bigquery')}
+ {parsed_queries[-1].sql(dialect="bigquery")}
  )"""
  else:
  query = e.query
@@ -809,11 +809,11 @@ class BigqueryLineageExtractor:
  upstream_lineage, temp_table_upstream
  )

- upstreams[
- ref_temp_table_upstream
- ] = _merge_lineage_edge_columns(
- upstreams.get(ref_temp_table_upstream),
- collapsed_lineage,
+ upstreams[ref_temp_table_upstream] = (
+ _merge_lineage_edge_columns(
+ upstreams.get(ref_temp_table_upstream),
+ collapsed_lineage,
+ )
  )
  else:
  upstreams[upstream_table_ref] = _merge_lineage_edge_columns(
@@ -1004,9 +1004,9 @@ class BigqueryLineageExtractor:
  dataset_urn
  )
  for gcs_dataset_urn in gcs_urns:
- schema_metadata_for_gcs: Optional[
- SchemaMetadataClass
- ] = graph.get_schema_metadata(gcs_dataset_urn)
+ schema_metadata_for_gcs: Optional[SchemaMetadataClass] = (
+ graph.get_schema_metadata(gcs_dataset_urn)
+ )
  if schema_metadata and schema_metadata_for_gcs:
  fine_grained_lineage = self.get_fine_grained_lineages_with_gcs(
  dataset_urn,
@@ -387,9 +387,7 @@ AND
  OR
  protoPayload.metadata.tableDataRead.reason = "JOB"
  )
- """.strip(
- "\t \n"
- )
+ """.strip("\t \n")


  def bigquery_audit_metadata_query_template_lineage(
@@ -271,9 +271,9 @@ class BigQueryQueriesExtractor(Closeable):
  # Preprocessing stage that deduplicates the queries using query hash per usage bucket
  # Note: FileBackedDict is an ordered dictionary, so the order of execution of
  # queries is inherently maintained
- queries_deduped: FileBackedDict[
- Dict[int, ObservedQuery]
- ] = self.deduplicate_queries(queries)
+ queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] = (
+ self.deduplicate_queries(queries)
+ )
  self.report.num_unique_queries = len(queries_deduped)
  logger.info(f"Found {self.report.num_unique_queries} unique queries")

@@ -763,9 +763,9 @@ class BigQueryUsageExtractor:
  )

  if event.query_event.default_dataset:
- custom_properties[
- "defaultDatabase"
- ] = event.query_event.default_dataset
+ custom_properties["defaultDatabase"] = (
+ event.query_event.default_dataset
+ )
  if event.read_event:
  if event.read_event.readReason:
  custom_properties["readReason"] = event.read_event.readReason
@@ -91,7 +91,6 @@ class KeyspaceKey(ContainerKey):
  supported=True,
  )
  class CassandraSource(StatefulIngestionSourceBase):
-
  """
  This plugin extracts the following:

@@ -107,10 +107,10 @@ class CassandraToSchemaFieldConverter:

  @staticmethod
  def get_column_type(cassandra_column_type: str) -> SchemaFieldDataType:
- type_class: Optional[
- Type
- ] = CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
- cassandra_column_type
+ type_class: Optional[Type] = (
+ CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
+ cassandra_column_type
+ )
  )
  if type_class is None:
  logger.warning(
@@ -293,9 +293,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
  def _load_json_schema_with_resolved_references(
  self, schema: Schema, name: str, subject: str
  ) -> dict:
- imported_json_schemas: List[
- JsonSchemaWrapper
- ] = self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+ imported_json_schemas: List[JsonSchemaWrapper] = (
+ self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+ )
  schema_dict = json.loads(schema.schema_str)
  reference_map = {}
  for imported_schema in imported_json_schemas:
@@ -332,9 +332,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
  )

  elif schema.schema_type == "PROTOBUF":
- imported_schemas: List[
- ProtobufSchema
- ] = self.get_schemas_from_confluent_ref_protobuf(schema)
+ imported_schemas: List[ProtobufSchema] = (
+ self.get_schemas_from_confluent_ref_protobuf(schema)
+ )
  base_name: str = topic.replace(".", "_")
  fields = protobuf_util.protobuf_schema_to_mce_fields(
  ProtobufSchema(
@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
  domain: Optional[str],
  description: Optional[str],
  ) -> Iterable[MetadataWorkUnit]:
- maybe_terms_wu: Optional[
- MetadataWorkUnit
- ] = self.get_resource_glossary_terms_work_unit(
- entity_urn=entity_urn,
- term_associations=term_associations,
+ maybe_terms_wu: Optional[MetadataWorkUnit] = (
+ self.get_resource_glossary_terms_work_unit(
+ entity_urn=entity_urn,
+ term_associations=term_associations,
+ )
  )
  if maybe_terms_wu:
  self.report.num_glossary_term_workunits_produced += 1
@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
  self.report.num_tag_workunits_produced += 1
  yield maybe_tags_wu

- maybe_owners_wu: Optional[
- MetadataWorkUnit
- ] = self.get_resource_owners_work_unit(
- entity_urn=entity_urn,
- owners=owners,
+ maybe_owners_wu: Optional[MetadataWorkUnit] = (
+ self.get_resource_owners_work_unit(
+ entity_urn=entity_urn,
+ owners=owners,
+ )
  )
  if maybe_owners_wu:
  self.report.num_owners_workunits_produced += 1
  yield maybe_owners_wu

- maybe_domain_wu: Optional[
- MetadataWorkUnit
- ] = self.get_resource_domain_work_unit(
- entity_urn=entity_urn,
- domain=domain,
+ maybe_domain_wu: Optional[MetadataWorkUnit] = (
+ self.get_resource_domain_work_unit(
+ entity_urn=entity_urn,
+ domain=domain,
+ )
  )
  if maybe_domain_wu:
  self.report.num_domain_workunits_produced += 1
  yield maybe_domain_wu

- maybe_description_wu: Optional[
- MetadataWorkUnit
- ] = self.get_resource_description_work_unit(
- entity_urn=entity_urn,
- description=description,
+ maybe_description_wu: Optional[MetadataWorkUnit] = (
+ self.get_resource_description_work_unit(
+ entity_urn=entity_urn,
+ description=description,
+ )
  )
  if maybe_description_wu:
  self.report.num_description_workunits_produced += 1
@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
  needs_write: bool,
  ) -> Tuple[EditableSchemaMetadataClass, bool]:
  field_path: str = sub_resource_row.field_path
- term_associations: List[
- GlossaryTermAssociationClass
- ] = sub_resource_row.term_associations
+ term_associations: List[GlossaryTermAssociationClass] = (
+ sub_resource_row.term_associations
+ )
  tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
  description: Optional[str] = sub_resource_row.description
  has_terms: bool = len(term_associations) > 0
@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
  # Boolean field to tell whether we need to write an MCPW.
  needs_write = False

- current_editable_schema_metadata: Optional[
- EditableSchemaMetadataClass
- ] = None
+ current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+ None
+ )
  if self.ctx.graph and not self.should_overwrite:
  # Fetch the current editable schema metadata
  current_editable_schema_metadata = self.ctx.graph.get_aspect(
@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
  entity_urn = row["resource"]
  entity_type = Urn.from_string(row["resource"]).get_type()

- term_associations: List[
- GlossaryTermAssociationClass
- ] = self.maybe_extract_glossary_terms(row)
+ term_associations: List[GlossaryTermAssociationClass] = (
+ self.maybe_extract_glossary_terms(row)
+ )
  tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
  owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)

@@ -152,7 +152,9 @@ class DataHubDatabaseReader:
  ) -> Iterable[Dict[str, Any]]:
  with self.engine.connect() as conn:
  if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
- with conn.begin(): # Transaction required for PostgreSQL server-side cursor
+ with (
+ conn.begin()
+ ): # Transaction required for PostgreSQL server-side cursor
  # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
  # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
  conn = conn.execution_options(
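The with-statement change above is again formatting only: wrapping a single context manager in parentheses just gives the formatter a place to break the line so the trailing comment can sit after the colon; the runtime behavior is unchanged. A minimal sketch using a stand-in context manager (contextlib here, not SQLAlchemy):

    from contextlib import contextmanager

    @contextmanager
    def begin():
        # Stand-in for something like a database transaction.
        yield "txn"

    # Both forms enter and exit the same context manager.
    with begin():  # comment on a long line
        pass

    with (
        begin()
    ):  # comment moved after the colon
        pass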
@@ -222,7 +224,7 @@ class DataHubDatabaseReader:
  )
  except Exception as e:
  logger.warning(
- f'Failed to parse metadata for {row["urn"]}: {e}', exc_info=True
+ f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
  )
  self.report.num_database_parse_errors += 1
  self.report.database_parse_errors.setdefault(