acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.
Files changed (161)
  1. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/METADATA +2461 -2463
  2. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/RECORD +161 -161
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/delete_cli.py +16 -2
  12. datahub/cli/docker_cli.py +6 -6
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  24. datahub/ingestion/api/report.py +1 -2
  25. datahub/ingestion/api/source_helpers.py +1 -1
  26. datahub/ingestion/extractor/json_schema_util.py +3 -3
  27. datahub/ingestion/extractor/schema_util.py +3 -5
  28. datahub/ingestion/fs/s3_fs.py +3 -3
  29. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  30. datahub/ingestion/graph/client.py +4 -6
  31. datahub/ingestion/run/pipeline.py +8 -7
  32. datahub/ingestion/run/pipeline_config.py +3 -3
  33. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  34. datahub/ingestion/source/abs/source.py +19 -8
  35. datahub/ingestion/source/aws/glue.py +11 -11
  36. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  37. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  38. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  39. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  40. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  42. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  43. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  44. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  45. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  46. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  47. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  48. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  49. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  50. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  51. datahub/ingestion/source/csv_enricher.py +29 -29
  52. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  53. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  54. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  55. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  56. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  57. datahub/ingestion/source/elastic_search.py +4 -4
  58. datahub/ingestion/source/fivetran/config.py +4 -0
  59. datahub/ingestion/source/fivetran/fivetran.py +15 -5
  60. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
  61. datahub/ingestion/source/gcs/gcs_source.py +5 -3
  62. datahub/ingestion/source/ge_data_profiler.py +4 -5
  63. datahub/ingestion/source/ge_profiling_config.py +3 -3
  64. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  65. datahub/ingestion/source/identity/azure_ad.py +3 -3
  66. datahub/ingestion/source/identity/okta.py +3 -3
  67. datahub/ingestion/source/kafka/kafka.py +11 -9
  68. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  69. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  70. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  71. datahub/ingestion/source/looker/looker_common.py +19 -19
  72. datahub/ingestion/source/looker/looker_config.py +3 -3
  73. datahub/ingestion/source/looker/looker_source.py +25 -25
  74. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  75. datahub/ingestion/source/looker/looker_usage.py +5 -7
  76. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  77. datahub/ingestion/source/looker/lookml_source.py +13 -15
  78. datahub/ingestion/source/looker/view_upstream.py +5 -5
  79. datahub/ingestion/source/mlflow.py +4 -4
  80. datahub/ingestion/source/mongodb.py +6 -4
  81. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  82. datahub/ingestion/source/nifi.py +24 -26
  83. datahub/ingestion/source/openapi.py +9 -9
  84. datahub/ingestion/source/powerbi/config.py +12 -12
  85. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  87. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  88. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  89. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  90. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  91. datahub/ingestion/source/redshift/config.py +3 -3
  92. datahub/ingestion/source/redshift/query.py +77 -47
  93. datahub/ingestion/source/redshift/redshift.py +12 -12
  94. datahub/ingestion/source/redshift/usage.py +8 -8
  95. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  96. datahub/ingestion/source/s3/source.py +1 -1
  97. datahub/ingestion/source/salesforce.py +26 -25
  98. datahub/ingestion/source/schema/json_schema.py +1 -1
  99. datahub/ingestion/source/sigma/sigma.py +3 -3
  100. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  101. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  102. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  103. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  104. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  105. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  106. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  107. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  108. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  109. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  110. datahub/ingestion/source/sql/athena.py +1 -3
  111. datahub/ingestion/source/sql/clickhouse.py +8 -14
  112. datahub/ingestion/source/sql/oracle.py +1 -3
  113. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  114. datahub/ingestion/source/sql/teradata.py +16 -3
  115. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  116. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  117. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  118. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  119. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  120. datahub/ingestion/source/tableau/tableau.py +48 -49
  121. datahub/ingestion/source/unity/config.py +3 -1
  122. datahub/ingestion/source/unity/proxy.py +1 -1
  123. datahub/ingestion/source/unity/source.py +3 -3
  124. datahub/ingestion/source/unity/usage.py +3 -1
  125. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  126. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  127. datahub/ingestion/source/usage/usage_common.py +1 -1
  128. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  129. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  130. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  131. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  132. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  133. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  134. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  135. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  136. datahub/lite/duckdb_lite.py +12 -10
  137. datahub/metadata/_schema_classes.py +1 -1
  138. datahub/metadata/schema.avsc +6 -2
  139. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  140. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  141. datahub/secret/secret_common.py +14 -8
  142. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  143. datahub/sql_parsing/schema_resolver.py +5 -10
  144. datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
  145. datahub/sql_parsing/sqlglot_lineage.py +5 -4
  146. datahub/sql_parsing/sqlglot_utils.py +3 -2
  147. datahub/telemetry/stats.py +1 -2
  148. datahub/testing/mcp_diff.py +1 -1
  149. datahub/utilities/file_backed_collections.py +10 -10
  150. datahub/utilities/hive_schema_to_avro.py +2 -2
  151. datahub/utilities/logging_manager.py +2 -2
  152. datahub/utilities/lossy_collections.py +3 -3
  153. datahub/utilities/mapping.py +3 -3
  154. datahub/utilities/serialized_lru_cache.py +3 -1
  155. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  156. datahub/utilities/sqllineage_patch.py +1 -1
  157. datahub/utilities/stats_collections.py +3 -1
  158. datahub/utilities/urns/urn_iter.py +2 -2
  159. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/WHEEL +0 -0
  160. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/entry_points.txt +0 -0
  161. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/top_level.txt +0 -0
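
The hunks below are easier to read alongside the installed package; a quick way to confirm which of the two builds is in your environment (a minimal sketch: importlib.metadata is standard library, while datahub.__version__ is assumed to be the attribute that the one-line change to datahub/__init__.py above updates):

    from importlib.metadata import version

    print(version("acryl-datahub"))  # expected to print "0.15.0.3" after upgrading

    import datahub
    print(datahub.__version__)       # assumed attribute; should match the wheel version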
@@ -218,9 +218,7 @@ def _get_all_table_comments_and_properties(self, connection, **kw):
  , comment
  , {properties_clause} AS properties
  FROM system.tables
- WHERE name NOT LIKE '.inner%'""".format(
- properties_clause=properties_clause
- )
+ WHERE name NOT LIKE '.inner%'""".format(properties_clause=properties_clause)
  )

  all_table_comments: Dict[Tuple[str, str], Dict[str, Any]] = {}
@@ -268,7 +266,7 @@ def _get_table_or_view_names(self, relkind, connection, schema=None, **kw):
  info_cache = kw.get("info_cache")
  all_relations = self._get_all_relation_info(connection, info_cache=info_cache)
  relation_names = []
- for key, relation in all_relations.items():
+ for _, relation in all_relations.items():
  if relation.database == schema and relation.relkind == relkind:
  relation_names.append(relation.relname)
  return relation_names
@@ -301,9 +299,7 @@ def _get_schema_column_info(self, connection, schema=None, **kw):
  , comment
  FROM system.columns
  WHERE {schema_clause}
- ORDER BY database, table, position""".format(
- schema_clause=schema_clause
- )
+ ORDER BY database, table, position""".format(schema_clause=schema_clause)
  )
  )
  )
@@ -474,7 +470,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
  logger.debug(f"sql_alchemy_url={url}")
  engine = create_engine(url, **self.config.options)
  for db_row in engine.execute(text(all_tables_query)):
- all_tables_set.add(f'{db_row["database"]}.{db_row["table_name"]}')
+ all_tables_set.add(f"{db_row['database']}.{db_row['table_name']}")

  return all_tables_set

@@ -503,7 +499,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):

  try:
  for db_row in engine.execute(text(query)):
- dataset_name = f'{db_row["target_schema"]}.{db_row["target_table"]}'
+ dataset_name = f"{db_row['target_schema']}.{db_row['target_table']}"
  if not self.config.database_pattern.allowed(
  db_row["target_schema"]
  ) or not self.config.table_pattern.allowed(dataset_name):
@@ -512,7 +508,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):

  # Target
  target_path = (
- f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
+ f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
  f"{dataset_name}"
  )
  target = LineageItem(
@@ -525,7 +521,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):

  # Source
  platform = LineageDatasetPlatform.CLICKHOUSE
- path = f'{db_row["source_schema"]}.{db_row["source_table"]}'
+ path = f"{db_row['source_schema']}.{db_row['source_table']}"

  sources = [
  LineageDataset(
@@ -552,9 +548,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
  target.dataset.path
  ].upstreams = self._lineage_map[
  target.dataset.path
- ].upstreams.union(
- target.upstreams
- )
+ ].upstreams.union(target.upstreams)

  else:
  self._lineage_map[target.dataset.path] = target
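
The ClickHouse hunks above are formatting-only: the .format(...) calls are collapsed onto one line and the f-strings swap which quote style sits on the outside, so the rendered SQL and dataset names are unchanged. A minimal standalone sketch of the f-string equivalence (the db_row dict here is a stand-in for the SQLAlchemy result row used in the real source):

    # Stand-in for a row returned by engine.execute(...).
    db_row = {"database": "analytics", "table_name": "events"}

    old_style = f'{db_row["database"]}.{db_row["table_name"]}'   # single quotes outside
    new_style = f"{db_row['database']}.{db_row['table_name']}"   # double quotes outside

    # Both spellings build exactly the same string.
    assert old_style == new_style == "analytics.events"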
@@ -234,9 +234,7 @@ class OracleInspectorObjectWrapper:
  WHERE col.table_name = id.table_name
  AND col.column_name = id.column_name
  AND col.owner = id.owner
- ) AS identity_options""".format(
- dblink=dblink
- )
+ ) AS identity_options""".format(dblink=dblink)
  else:
  identity_cols = "NULL as default_on_null, NULL as identity_options"

@@ -278,8 +278,7 @@ class GenericProfiler:

  if self.config.profiling.profile_table_size_limit is not None and (
  size_in_bytes is not None
- and size_in_bytes / (2**30)
- > self.config.profiling.profile_table_size_limit
+ and size_in_bytes / (2**30) > self.config.profiling.profile_table_size_limit
  ):
  self.report.profiling_skipped_size_limit[schema_name] += 1
  logger.debug(
@@ -599,7 +599,12 @@ ORDER by DataBaseName, TableName;
  setattr( # noqa: B010
  TeradataDialect,
  "get_columns",
- lambda self, connection, table_name, schema=None, use_qvci=self.config.use_qvci, **kw: optimized_get_columns(
+ lambda self,
+ connection,
+ table_name,
+ schema=None,
+ use_qvci=self.config.use_qvci,
+ **kw: optimized_get_columns(
  self,
  connection,
  table_name,
@@ -613,7 +618,11 @@ ORDER by DataBaseName, TableName;
  setattr( # noqa: B010
  TeradataDialect,
  "get_pk_constraint",
- lambda self, connection, table_name, schema=None, **kw: optimized_get_pk_constraint(
+ lambda self,
+ connection,
+ table_name,
+ schema=None,
+ **kw: optimized_get_pk_constraint(
  self, connection, table_name, schema, **kw
  ),
  )
@@ -621,7 +630,11 @@ ORDER by DataBaseName, TableName;
  setattr( # noqa: B010
  TeradataDialect,
  "get_foreign_keys",
- lambda self, connection, table_name, schema=None, **kw: optimized_get_foreign_keys(
+ lambda self,
+ connection,
+ table_name,
+ schema=None,
+ **kw: optimized_get_foreign_keys(
  self, connection, table_name, schema, **kw
  ),
  )
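
The Teradata hunks only re-wrap the lambda parameter lists across multiple lines; the underlying monkey-patching pattern, replacing a dialect method via setattr and binding extra state through a lambda default argument, is unchanged. A minimal sketch of that pattern with placeholder names (DummyDialect and the stub optimized_get_columns below are illustrative, not the real TeradataDialect helpers):

    class DummyDialect:
        def get_columns(self, connection, table_name, schema=None, **kw):
            return []  # slow default implementation

    def optimized_get_columns(
        dialect, connection, table_name, schema=None, use_qvci=False, **kw
    ):
        # Pretend this is the faster replacement being swapped in.
        return [{"name": "id", "qvci": use_qvci}]

    use_qvci = True
    setattr(  # noqa: B010 -- same shape as the calls in the diff
        DummyDialect,
        "get_columns",
        lambda self,
        connection,
        table_name,
        schema=None,
        use_qvci=use_qvci,
        **kw: optimized_get_columns(
            self, connection, table_name, schema=schema, use_qvci=use_qvci, **kw
        ),
    )

    print(DummyDialect().get_columns(None, "my_table"))  # [{'name': 'id', 'qvci': True}]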
@@ -41,9 +41,9 @@ class ProfilingHandler(StatefulIngestionUsecaseHandlerBase[ProfilingCheckpointSt
  run_id: str,
  ):
  self.state_provider = source.state_provider
- self.stateful_ingestion_config: Optional[
- ProfilingStatefulIngestionConfig
- ] = config.stateful_ingestion
+ self.stateful_ingestion_config: Optional[ProfilingStatefulIngestionConfig] = (
+ config.stateful_ingestion
+ )
  self.pipeline_name = pipeline_name
  self.run_id = run_id
  self.checkpointing_enabled: bool = (
@@ -48,9 +48,9 @@ class RedundantRunSkipHandler(
  ):
  self.source = source
  self.state_provider = source.state_provider
- self.stateful_ingestion_config: Optional[
- StatefulIngestionConfig
- ] = config.stateful_ingestion
+ self.stateful_ingestion_config: Optional[StatefulIngestionConfig] = (
+ config.stateful_ingestion
+ )
  self.pipeline_name = pipeline_name
  self.run_id = run_id
  self._job_id = self._init_job_id()
@@ -145,8 +145,7 @@ class RedundantRunSkipHandler(
  )

  logger.debug(
- f"{self.job_id} : Last run start, end times:"
- f"({last_run_time_window})"
+ f"{self.job_id} : Last run start, end times:({last_run_time_window})"
  )

  # If current run's time window is subset of last run's time window, then skip.
@@ -212,8 +211,7 @@ class RedundantRunSkipHandler(
  )

  self.log(
- "Adjusted start, end times: "
- f"({suggested_start_time}, {suggested_end_time})"
+ f"Adjusted start, end times: ({suggested_start_time}, {suggested_end_time})"
  )
  return (suggested_start_time, suggested_end_time)

@@ -111,9 +111,9 @@ class StaleEntityRemovalHandler(
  self.state_type_class = state_type_class
  self.pipeline_name = pipeline_name
  self.run_id = run_id
- self.stateful_ingestion_config: Optional[
- StatefulStaleMetadataRemovalConfig
- ] = config.stateful_ingestion
+ self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = (
+ config.stateful_ingestion
+ )
  self.checkpointing_enabled: bool = (
  True
  if (
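
The three stateful-ingestion hunks above apply the same mechanical rewrite seen throughout this release: the type annotation stays on one line and the right-hand side is wrapped in parentheses instead of splitting the subscript across lines. Both spellings assign the same value and record the same annotation; a minimal sketch with placeholder types:

    from typing import Optional

    class DummyConfig:
        stateful_ingestion: Optional[dict] = None

    class Handler:
        def __init__(self, config: DummyConfig) -> None:
            # Old layout: the Optional[...] subscript is split across lines.
            self.stateful_ingestion_config: Optional[
                dict
            ] = config.stateful_ingestion
            # New layout: annotation on one line, value parenthesized.
            self.stateful_ingestion_config: Optional[dict] = (
                config.stateful_ingestion
            )

    Handler(DummyConfig())  # both assignments behave identically at runtime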
@@ -70,20 +70,20 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
  self.orchestrator_name, pipeline_name, job_name
  )

- latest_checkpoint: Optional[
- DatahubIngestionCheckpointClass
- ] = self.graph.get_latest_timeseries_value(
- entity_urn=data_job_urn,
- aspect_type=DatahubIngestionCheckpointClass,
- filter_criteria_map={
- "pipelineName": pipeline_name,
- },
+ latest_checkpoint: Optional[DatahubIngestionCheckpointClass] = (
+ self.graph.get_latest_timeseries_value(
+ entity_urn=data_job_urn,
+ aspect_type=DatahubIngestionCheckpointClass,
+ filter_criteria_map={
+ "pipelineName": pipeline_name,
+ },
+ )
  )
  if latest_checkpoint:
  logger.debug(
  f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
  f" job_name:'{job_name}' found with start_time:"
- f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
+ f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
  )
  return latest_checkpoint
  else:
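
The only change to the log line itself is spacing around the division; it still turns the checkpoint's timestampMillis into a UTC datetime. A minimal sketch of that conversion with a made-up timestamp (the timezone-aware form is shown only as an equivalent spelling):

    from datetime import datetime, timezone

    timestampMillis = 1_734_000_000_000  # example epoch milliseconds

    naive_utc = datetime.utcfromtimestamp(timestampMillis / 1000)             # what the log line computes
    aware_utc = datetime.fromtimestamp(timestampMillis / 1000, timezone.utc)  # equivalent, timezone-aware

    assert naive_utc == aware_utc.replace(tzinfo=None)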
@@ -67,7 +67,7 @@ class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
  logger.debug(
  f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
  f" job_name:'{job_name}' found with start_time:"
- f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
+ f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
  )
  return latest_checkpoint
  else:
@@ -281,9 +281,9 @@ class TableauConnectionConfig(ConfigModel):
  return authentication

  def make_tableau_client(self, site: str) -> Server:
- authentication: Union[
- TableauAuth, PersonalAccessTokenAuth
- ] = self.get_tableau_auth(site)
+ authentication: Union[TableauAuth, PersonalAccessTokenAuth] = (
+ self.get_tableau_auth(site)
+ )
  try:
  server = Server(
  self.connect_uri,
@@ -635,7 +635,7 @@ class TableauConfig(
  project_path_pattern = values.get("project_path_pattern")
  if project_pattern is None and project_path_pattern is None and projects:
  logger.warning(
- "projects is deprecated, please use " "project_path_pattern instead."
+ "projects is deprecated, please use project_path_pattern instead."
  )
  logger.info("Initializing project_pattern from projects")
  values["project_pattern"] = AllowDenyPattern(
@@ -708,18 +708,18 @@ class DatabaseTable:
  """

  urn: str
- id: Optional[
- str
- ] = None # is not None only for tables that came from Tableau metadata
+ id: Optional[str] = (
+ None # is not None only for tables that came from Tableau metadata
+ )
  num_cols: Optional[int] = None

- paths: Optional[
- Set[str]
- ] = None # maintains all browse paths encountered for this table
+ paths: Optional[Set[str]] = (
+ None # maintains all browse paths encountered for this table
+ )

- parsed_columns: Optional[
- Set[str]
- ] = None # maintains all columns encountered for this table during parsing SQL queries
+ parsed_columns: Optional[Set[str]] = (
+ None # maintains all columns encountered for this table during parsing SQL queries
+ )

  def update_table(
  self,
@@ -2310,8 +2310,7 @@ class TableauSiteSource:
  c.EMBEDDED_DATA_SOURCE,
  ):
  logger.debug(
- f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is "
- f"unsupported"
+ f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is unsupported"
  )
  return None

@@ -2493,9 +2492,9 @@ class TableauSiteSource:
  def _enrich_database_tables_with_parsed_schemas(
  self, parsing_result: SqlParsingResult
  ) -> None:
- in_tables_schemas: Dict[
- str, Set[str]
- ] = transform_parsing_result_to_in_tables_schemas(parsing_result)
+ in_tables_schemas: Dict[str, Set[str]] = (
+ transform_parsing_result_to_in_tables_schemas(parsing_result)
+ )

  if not in_tables_schemas:
  logger.info("Unable to extract table schema from parsing result")
@@ -3559,25 +3558,25 @@ class TableauSiteSource:

  generated_project_keys.add(project_key.guid())

- parent_project_key: Optional[
- Union[ProjectKey, SiteKey]
- ] = None # It is going
+ parent_project_key: Optional[Union[ProjectKey, SiteKey]] = (
+ None # It is going
+ )
  # to be used as a parent container key for the current tableau project

  if project_.parent_id is not None:
  # Go to the parent project as we need to generate container first for parent
  parent_project_key = self.gen_project_key(project_.parent_id)

- parent_tableau_project: Optional[
- TableauProject
- ] = self.tableau_project_registry.get(project_.parent_id)
+ parent_tableau_project: Optional[TableauProject] = (
+ self.tableau_project_registry.get(project_.parent_id)
+ )

  if (
  parent_tableau_project is None
  ): # It is not in project registry because of project_pattern
- assert (
- project_.parent_name
- ), f"project {project_.name} should not be null"
+ assert project_.parent_name, (
+ f"project {project_.name} should not be null"
+ )
  parent_tableau_project = TableauProject(
  id=project_.parent_id,
  name=project_.parent_name,
@@ -3605,7 +3604,7 @@ class TableauSiteSource:
  parent_container_key=parent_project_key,
  )

- for id_, project in self.tableau_project_registry.items():
+ for project in self.tableau_project_registry.values():
  logger.debug(
  f"project {project.name} and it's parent {project.parent_name} and parent id {project.parent_id}"
  )
@@ -3669,16 +3668,16 @@ class TableauSiteSource:
  if self.config.extract_usage_stats:
  with PerfTimer() as timer:
  self._populate_usage_stat_registry()
- self.report.extract_usage_stats_timer[
- self.site_content_url
- ] = timer.elapsed_seconds(digits=2)
+ self.report.extract_usage_stats_timer[self.site_content_url] = (
+ timer.elapsed_seconds(digits=2)
+ )

  if self.config.permission_ingestion:
  with PerfTimer() as timer:
  self._fetch_groups()
- self.report.fetch_groups_timer[
- self.site_content_url
- ] = timer.elapsed_seconds(digits=2)
+ self.report.fetch_groups_timer[self.site_content_url] = (
+ timer.elapsed_seconds(digits=2)
+ )

  # Populate the map of database names and database hostnames to be used later to map
  # databases to platform instances.
@@ -3691,9 +3690,9 @@ class TableauSiteSource:

  with PerfTimer() as timer:
  self._populate_projects_registry()
- self.report.populate_projects_registry_timer[
- self.site_content_url
- ] = timer.elapsed_seconds(digits=2)
+ self.report.populate_projects_registry_timer[self.site_content_url] = (
+ timer.elapsed_seconds(digits=2)
+ )

  if self.config.add_site_container:
  yield from self.emit_site_container()
@@ -3701,23 +3700,23 @@ class TableauSiteSource:

  with PerfTimer() as timer:
  yield from self.emit_workbooks()
- self.report.emit_workbooks_timer[
- self.site_content_url
- ] = timer.elapsed_seconds(digits=2)
+ self.report.emit_workbooks_timer[self.site_content_url] = (
+ timer.elapsed_seconds(digits=2)
+ )

  if self.sheet_ids:
  with PerfTimer() as timer:
  yield from self.emit_sheets()
- self.report.emit_sheets_timer[
- self.site_content_url
- ] = timer.elapsed_seconds(digits=2)
+ self.report.emit_sheets_timer[self.site_content_url] = (
+ timer.elapsed_seconds(digits=2)
+ )

  if self.dashboard_ids:
  with PerfTimer() as timer:
  yield from self.emit_dashboards()
- self.report.emit_dashboards_timer[
- self.site_content_url
- ] = timer.elapsed_seconds(digits=2)
+ self.report.emit_dashboards_timer[self.site_content_url] = (
+ timer.elapsed_seconds(digits=2)
+ )

  if self.embedded_datasource_ids_being_used:
  with PerfTimer() as timer:
@@ -3743,6 +3742,6 @@ class TableauSiteSource:
  if self.database_tables:
  with PerfTimer() as timer:
  yield from self.emit_upstream_tables()
- self.report.emit_upstream_tables_timer[
- self.site_content_url
- ] = timer.elapsed_seconds(digits=2)
+ self.report.emit_upstream_tables_timer[self.site_content_url] = (
+ timer.elapsed_seconds(digits=2)
+ )
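
Every Tableau timer hunk above follows one pattern: time a phase with PerfTimer and store the rounded elapsed seconds in a per-site dict on the report; only the line wrapping of the assignment changes. A minimal sketch of that pattern with a simple stand-in timer (PerfTimer.elapsed_seconds(digits=...) is taken from the diff itself; the SketchTimer class and the site name below are illustrative, not DataHub code):

    import time
    from typing import Dict

    class SketchTimer:
        # Illustrative stand-in for PerfTimer.
        def __enter__(self):
            self._start = time.monotonic()
            return self

        def __exit__(self, *exc):
            self._elapsed = time.monotonic() - self._start
            return False

        def elapsed_seconds(self, digits: int = 2) -> float:
            return round(self._elapsed, digits)

    emit_workbooks_timer: Dict[str, float] = {}
    site_content_url = "acme-analytics"  # hypothetical site

    with SketchTimer() as timer:
        time.sleep(0.01)  # the phase being measured, e.g. emitting workbooks

    emit_workbooks_timer[site_content_url] = timer.elapsed_seconds(digits=2)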
@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
  )

  # TODO: Remove `type:ignore` by refactoring config
- profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field( # type: ignore
+ profiling: Union[
+ UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+ ] = Field( # type: ignore
  default=UnityCatalogGEProfilerConfig(),
  description="Data profiling configuration",
  discriminator="method",
@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

  @staticmethod
  def _create_metastore(
- obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+ obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
  ) -> Optional[Metastore]:
  if not obj.name:
  return None
@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
  self.table_refs: Set[TableReference] = set()
  self.view_refs: Set[TableReference] = set()
  self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
- self.view_definitions: FileBackedDict[
- Tuple[TableReference, str]
- ] = FileBackedDict()
+ self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+ FileBackedDict()
+ )

  # Global map of tables, for profiling
  self.tables: FileBackedDict[Table] = FileBackedDict()
@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
  query, table_info
  )
  for source_table in table_info.source_tables:
- with self.report.usage_perf_report.aggregator_add_event_timer:
+ with (
+ self.report.usage_perf_report.aggregator_add_event_timer
+ ):
  self.usage_aggregator.aggregate_event(
  resource=source_table,
  start_time=query.start_time,
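
In the usage-extractor hunk the single context manager is wrapped in parentheses so the long attribute chain can break across lines; with only one context manager this is ordinary expression parenthesization, so the behaviour is identical. A tiny sketch:

    from contextlib import nullcontext

    class Report:
        aggregator_add_event_timer = nullcontext()

    report = Report()

    # The unparenthesized and parenthesized forms enter the same context manager.
    with report.aggregator_add_event_timer:
        pass

    with (
        report.aggregator_add_event_timer
    ):
        pass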
@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
  def _aggregate_access_events(
  self, events: List[ClickHouseJoinedAccessEvent]
  ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
- datasets: Dict[
- datetime, Dict[ClickHouseTableRef, AggregatedDataset]
- ] = collections.defaultdict(dict)
+ datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+ collections.defaultdict(dict)
+ )

  for event in events:
  floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)

  resource = (
- f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
+ f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
  f"{event.database}.{event.table}"
  )

@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
  def _aggregate_access_events(
  self, events: List[TrinoJoinedAccessEvent]
  ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
- datasets: Dict[
- datetime, Dict[TrinoTableRef, AggregatedDataset]
- ] = collections.defaultdict(dict)
+ datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+ collections.defaultdict(dict)
+ )

  for event in events:
  floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
@@ -89,7 +89,7 @@ def make_usage_workunit(
  top_sql_queries: Optional[List[str]] = None
  if query_freq is not None:
  if top_n_queries < len(query_freq):
- logger.warn(
+ logger.warning(
  f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
  )
  query_freq = query_freq[0:top_n_queries]
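
Unlike the purely cosmetic hunks around it, this change swaps a deprecated alias: Logger.warn is an old spelling of Logger.warning that the logging documentation deprecates (and newer Python versions warn about), so the new call logs the same record without the deprecation noise. A tiny sketch:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("usage_common_example")

    top_n_queries, found = 5, 12  # made-up values standing in for the real counts
    # Preferred spelling; logger.warn(...) would produce the same record.
    logger.warning(
        f"Top N query limit exceeded. Max number of queries {top_n_queries} < {found}."
    )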
@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
  ).add_asset(container_urn)
  data_products_container[data_product_urn] = container_product
  else:
- data_products_container[
- data_product_urn
- ] = data_products_container[data_product_urn].add_asset(
- container_urn
+ data_products_container[data_product_urn] = (
+ data_products_container[data_product_urn].add_asset(
+ container_urn
+ )
  )

  mcps: List[
@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
  ) -> Optional[DatasetPropertiesClass]:
  assert dataset_properties_aspect

- server_dataset_properties_aspect: Optional[
- DatasetPropertiesClass
- ] = graph.get_dataset_properties(entity_urn)
+ server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+ graph.get_dataset_properties(entity_urn)
+ )
  # No need to take any action if server properties is None or there is not customProperties in server properties
  if (
  server_dataset_properties_aspect is None
@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
  server_field_map: dict = {}
  if self.config.semantics == TransformerSemantics.PATCH:
  assert self.ctx.graph
- server_schema_metadata_aspect: Optional[
- SchemaMetadataClass
- ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+ server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+ self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+ )
  if server_schema_metadata_aspect is not None:
  if not schema_metadata_aspect:
  schema_metadata_aspect = server_schema_metadata_aspect
@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
  ] = {} # Map to cache server field objects, where fieldPath is key
  if self.config.semantics == TransformerSemantics.PATCH:
  assert self.ctx.graph
- server_schema_metadata_aspect: Optional[
- SchemaMetadataClass
- ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+ server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+ self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+ )
  if server_schema_metadata_aspect is not None:
  if not schema_metadata_aspect:
  schema_metadata_aspect = server_schema_metadata_aspect
@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
  domain_aspect.domains.extend(mapped_domains.domains)
  if self.config.semantics == TransformerSemantics.PATCH:
  # Try merging with server-side domains
- patch_domain_aspect: Optional[
- DomainsClass
- ] = AddDatasetDomain._merge_with_server_domains(
- self.ctx.graph, entity_urn, domain_aspect
+ patch_domain_aspect: Optional[DomainsClass] = (
+ AddDatasetDomain._merge_with_server_domains(
+ self.ctx.graph, entity_urn, domain_aspect
+ )
  )
  return cast(Optional[Aspect], patch_domain_aspect)
  return cast(Optional[Aspect], domain_aspect)
@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
  else:
  owner_type = get_owner_type(self.config.owner_type)
  if owner_type == OwnershipTypeClass.CUSTOM:
- assert (
- self.config.owner_type_urn is not None
- ), "owner_type_urn must be set if owner_type is CUSTOM"
+ assert self.config.owner_type_urn is not None, (
+ "owner_type_urn must be set if owner_type is CUSTOM"
+ )

  owners.append(
  OwnerClass(
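
Note how the reflowed assert keeps the condition and the message as two separate operands, parenthesizing only the message. That detail matters: wrapping the whole statement in one pair of parentheses would build a two-element tuple, which is always truthy, so the assertion could never fail. A minimal sketch of the difference:

    owner_type_urn = None

    # Correct: condition first, parenthesized message second (the style used in the diff).
    try:
        assert owner_type_urn is not None, (
            "owner_type_urn must be set if owner_type is CUSTOM"
        )
    except AssertionError as exc:
        print(f"raised as expected: {exc}")

    # Pitfall: a single pair of parentheses around both parts makes a truthy tuple,
    # so this assert never fires (CPython emits a SyntaxWarning about it).
    assert (owner_type_urn is not None, "this tuple is always truthy")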