acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. (The original page linked to more details; the link is not preserved in this text.)

Files changed (168)
  1. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
  2. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/docker_cli.py +6 -6
  12. datahub/cli/ingest_cli.py +25 -15
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/entrypoints.py +6 -0
  24. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  25. datahub/ingestion/api/report.py +1 -2
  26. datahub/ingestion/api/source_helpers.py +1 -1
  27. datahub/ingestion/extractor/json_schema_util.py +3 -3
  28. datahub/ingestion/extractor/schema_util.py +3 -5
  29. datahub/ingestion/fs/s3_fs.py +3 -3
  30. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  31. datahub/ingestion/graph/client.py +4 -6
  32. datahub/ingestion/run/pipeline.py +8 -7
  33. datahub/ingestion/run/pipeline_config.py +3 -3
  34. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  35. datahub/ingestion/source/abs/source.py +19 -8
  36. datahub/ingestion/source/aws/glue.py +11 -11
  37. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  38. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  39. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  40. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  42. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  43. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  44. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  45. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  46. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  47. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  48. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  49. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  50. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  51. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  52. datahub/ingestion/source/csv_enricher.py +29 -29
  53. datahub/ingestion/source/datahub/config.py +4 -0
  54. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  55. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  56. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  57. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  58. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  59. datahub/ingestion/source/elastic_search.py +4 -4
  60. datahub/ingestion/source/gc/datahub_gc.py +1 -0
  61. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
  62. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  63. datahub/ingestion/source/ge_data_profiler.py +2 -5
  64. datahub/ingestion/source/ge_profiling_config.py +3 -3
  65. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  66. datahub/ingestion/source/identity/azure_ad.py +3 -3
  67. datahub/ingestion/source/identity/okta.py +3 -3
  68. datahub/ingestion/source/kafka/kafka.py +11 -9
  69. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  72. datahub/ingestion/source/looker/looker_common.py +19 -19
  73. datahub/ingestion/source/looker/looker_config.py +3 -3
  74. datahub/ingestion/source/looker/looker_source.py +25 -25
  75. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  76. datahub/ingestion/source/looker/looker_usage.py +5 -7
  77. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  78. datahub/ingestion/source/looker/lookml_source.py +13 -15
  79. datahub/ingestion/source/looker/view_upstream.py +5 -5
  80. datahub/ingestion/source/mlflow.py +4 -4
  81. datahub/ingestion/source/mode.py +5 -5
  82. datahub/ingestion/source/mongodb.py +6 -4
  83. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  84. datahub/ingestion/source/nifi.py +24 -26
  85. datahub/ingestion/source/openapi.py +9 -9
  86. datahub/ingestion/source/powerbi/config.py +12 -12
  87. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  88. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  89. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  90. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  91. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  92. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  93. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  94. datahub/ingestion/source/redshift/config.py +3 -3
  95. datahub/ingestion/source/redshift/redshift.py +12 -12
  96. datahub/ingestion/source/redshift/usage.py +8 -8
  97. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  98. datahub/ingestion/source/s3/source.py +1 -1
  99. datahub/ingestion/source/salesforce.py +26 -25
  100. datahub/ingestion/source/schema/json_schema.py +1 -1
  101. datahub/ingestion/source/sigma/sigma.py +3 -3
  102. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  103. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  104. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  105. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  106. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  107. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  108. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  109. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  110. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  112. datahub/ingestion/source/sql/athena.py +1 -3
  113. datahub/ingestion/source/sql/clickhouse.py +8 -14
  114. datahub/ingestion/source/sql/oracle.py +1 -3
  115. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  116. datahub/ingestion/source/sql/sql_types.py +0 -1
  117. datahub/ingestion/source/sql/teradata.py +16 -3
  118. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  119. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  120. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  121. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  122. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  123. datahub/ingestion/source/tableau/tableau.py +245 -101
  124. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  125. datahub/ingestion/source/unity/config.py +3 -1
  126. datahub/ingestion/source/unity/proxy.py +1 -1
  127. datahub/ingestion/source/unity/source.py +3 -3
  128. datahub/ingestion/source/unity/usage.py +3 -1
  129. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  130. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  131. datahub/ingestion/source/usage/usage_common.py +1 -1
  132. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  133. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  134. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  135. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  136. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  137. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  138. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  139. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  140. datahub/lite/duckdb_lite.py +12 -10
  141. datahub/metadata/_schema_classes.py +1 -1
  142. datahub/metadata/schema.avsc +6 -2
  143. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  144. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  145. datahub/secret/datahub_secrets_client.py +12 -21
  146. datahub/secret/secret_common.py +14 -8
  147. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  148. datahub/sql_parsing/schema_resolver.py +5 -10
  149. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  150. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  151. datahub/sql_parsing/sqlglot_utils.py +1 -1
  152. datahub/telemetry/stats.py +1 -2
  153. datahub/testing/mcp_diff.py +1 -1
  154. datahub/utilities/file_backed_collections.py +10 -10
  155. datahub/utilities/hive_schema_to_avro.py +2 -2
  156. datahub/utilities/logging_manager.py +2 -2
  157. datahub/utilities/lossy_collections.py +3 -3
  158. datahub/utilities/mapping.py +3 -3
  159. datahub/utilities/memory_footprint.py +3 -2
  160. datahub/utilities/serialized_lru_cache.py +3 -1
  161. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  162. datahub/utilities/sqllineage_patch.py +1 -1
  163. datahub/utilities/stats_collections.py +3 -1
  164. datahub/utilities/urns/_urn_base.py +28 -5
  165. datahub/utilities/urns/urn_iter.py +2 -2
  166. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
  167. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
  168. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
@@ -218,9 +218,7 @@ def _get_all_table_comments_and_properties(self, connection, **kw):
218
218
  , comment
219
219
  , {properties_clause} AS properties
220
220
  FROM system.tables
221
- WHERE name NOT LIKE '.inner%'""".format(
222
- properties_clause=properties_clause
223
- )
221
+ WHERE name NOT LIKE '.inner%'""".format(properties_clause=properties_clause)
224
222
  )
225
223
 
226
224
  all_table_comments: Dict[Tuple[str, str], Dict[str, Any]] = {}
@@ -268,7 +266,7 @@ def _get_table_or_view_names(self, relkind, connection, schema=None, **kw):
268
266
  info_cache = kw.get("info_cache")
269
267
  all_relations = self._get_all_relation_info(connection, info_cache=info_cache)
270
268
  relation_names = []
271
- for key, relation in all_relations.items():
269
+ for _, relation in all_relations.items():
272
270
  if relation.database == schema and relation.relkind == relkind:
273
271
  relation_names.append(relation.relname)
274
272
  return relation_names
@@ -301,9 +299,7 @@ def _get_schema_column_info(self, connection, schema=None, **kw):
301
299
  , comment
302
300
  FROM system.columns
303
301
  WHERE {schema_clause}
304
- ORDER BY database, table, position""".format(
305
- schema_clause=schema_clause
306
- )
302
+ ORDER BY database, table, position""".format(schema_clause=schema_clause)
307
303
  )
308
304
  )
309
305
  )
@@ -474,7 +470,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
474
470
  logger.debug(f"sql_alchemy_url={url}")
475
471
  engine = create_engine(url, **self.config.options)
476
472
  for db_row in engine.execute(text(all_tables_query)):
477
- all_tables_set.add(f'{db_row["database"]}.{db_row["table_name"]}')
473
+ all_tables_set.add(f"{db_row['database']}.{db_row['table_name']}")
478
474
 
479
475
  return all_tables_set
480
476
 
@@ -503,7 +499,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
503
499
 
504
500
  try:
505
501
  for db_row in engine.execute(text(query)):
506
- dataset_name = f'{db_row["target_schema"]}.{db_row["target_table"]}'
502
+ dataset_name = f"{db_row['target_schema']}.{db_row['target_table']}"
507
503
  if not self.config.database_pattern.allowed(
508
504
  db_row["target_schema"]
509
505
  ) or not self.config.table_pattern.allowed(dataset_name):
@@ -512,7 +508,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
512
508
 
513
509
  # Target
514
510
  target_path = (
515
- f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
511
+ f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
516
512
  f"{dataset_name}"
517
513
  )
518
514
  target = LineageItem(
@@ -525,7 +521,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
525
521
 
526
522
  # Source
527
523
  platform = LineageDatasetPlatform.CLICKHOUSE
528
- path = f'{db_row["source_schema"]}.{db_row["source_table"]}'
524
+ path = f"{db_row['source_schema']}.{db_row['source_table']}"
529
525
 
530
526
  sources = [
531
527
  LineageDataset(
@@ -552,9 +548,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
552
548
  target.dataset.path
553
549
  ].upstreams = self._lineage_map[
554
550
  target.dataset.path
555
- ].upstreams.union(
556
- target.upstreams
557
- )
551
+ ].upstreams.union(target.upstreams)
558
552
 
559
553
  else:
560
554
  self._lineage_map[target.dataset.path] = target
@@ -234,9 +234,7 @@ class OracleInspectorObjectWrapper:
234
234
  WHERE col.table_name = id.table_name
235
235
  AND col.column_name = id.column_name
236
236
  AND col.owner = id.owner
237
- ) AS identity_options""".format(
238
- dblink=dblink
239
- )
237
+ ) AS identity_options""".format(dblink=dblink)
240
238
  else:
241
239
  identity_cols = "NULL as default_on_null, NULL as identity_options"
242
240
 
@@ -278,8 +278,7 @@ class GenericProfiler:
278
278
 
279
279
  if self.config.profiling.profile_table_size_limit is not None and (
280
280
  size_in_bytes is not None
281
- and size_in_bytes / (2**30)
282
- > self.config.profiling.profile_table_size_limit
281
+ and size_in_bytes / (2**30) > self.config.profiling.profile_table_size_limit
283
282
  ):
284
283
  self.report.profiling_skipped_size_limit[schema_name] += 1
285
284
  logger.debug(
@@ -384,7 +384,6 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
384
384
  "varchar": StringType,
385
385
  "char": StringType,
386
386
  "varbinary": BytesType,
387
- "json": RecordType,
388
387
  "date": DateType,
389
388
  "time": TimeType,
390
389
  "timestamp": TimeType,
@@ -599,7 +599,12 @@ ORDER by DataBaseName, TableName;
599
599
  setattr( # noqa: B010
600
600
  TeradataDialect,
601
601
  "get_columns",
602
- lambda self, connection, table_name, schema=None, use_qvci=self.config.use_qvci, **kw: optimized_get_columns(
602
+ lambda self,
603
+ connection,
604
+ table_name,
605
+ schema=None,
606
+ use_qvci=self.config.use_qvci,
607
+ **kw: optimized_get_columns(
603
608
  self,
604
609
  connection,
605
610
  table_name,
@@ -613,7 +618,11 @@ ORDER by DataBaseName, TableName;
613
618
  setattr( # noqa: B010
614
619
  TeradataDialect,
615
620
  "get_pk_constraint",
616
- lambda self, connection, table_name, schema=None, **kw: optimized_get_pk_constraint(
621
+ lambda self,
622
+ connection,
623
+ table_name,
624
+ schema=None,
625
+ **kw: optimized_get_pk_constraint(
617
626
  self, connection, table_name, schema, **kw
618
627
  ),
619
628
  )
@@ -621,7 +630,11 @@ ORDER by DataBaseName, TableName;
621
630
  setattr( # noqa: B010
622
631
  TeradataDialect,
623
632
  "get_foreign_keys",
624
- lambda self, connection, table_name, schema=None, **kw: optimized_get_foreign_keys(
633
+ lambda self,
634
+ connection,
635
+ table_name,
636
+ schema=None,
637
+ **kw: optimized_get_foreign_keys(
625
638
  self, connection, table_name, schema, **kw
626
639
  ),
627
640
  )
@@ -41,9 +41,9 @@ class ProfilingHandler(StatefulIngestionUsecaseHandlerBase[ProfilingCheckpointSt
41
41
  run_id: str,
42
42
  ):
43
43
  self.state_provider = source.state_provider
44
- self.stateful_ingestion_config: Optional[
45
- ProfilingStatefulIngestionConfig
46
- ] = config.stateful_ingestion
44
+ self.stateful_ingestion_config: Optional[ProfilingStatefulIngestionConfig] = (
45
+ config.stateful_ingestion
46
+ )
47
47
  self.pipeline_name = pipeline_name
48
48
  self.run_id = run_id
49
49
  self.checkpointing_enabled: bool = (
@@ -48,9 +48,9 @@ class RedundantRunSkipHandler(
48
48
  ):
49
49
  self.source = source
50
50
  self.state_provider = source.state_provider
51
- self.stateful_ingestion_config: Optional[
52
- StatefulIngestionConfig
53
- ] = config.stateful_ingestion
51
+ self.stateful_ingestion_config: Optional[StatefulIngestionConfig] = (
52
+ config.stateful_ingestion
53
+ )
54
54
  self.pipeline_name = pipeline_name
55
55
  self.run_id = run_id
56
56
  self._job_id = self._init_job_id()
@@ -145,8 +145,7 @@ class RedundantRunSkipHandler(
145
145
  )
146
146
 
147
147
  logger.debug(
148
- f"{self.job_id} : Last run start, end times:"
149
- f"({last_run_time_window})"
148
+ f"{self.job_id} : Last run start, end times:({last_run_time_window})"
150
149
  )
151
150
 
152
151
  # If current run's time window is subset of last run's time window, then skip.
@@ -212,8 +211,7 @@ class RedundantRunSkipHandler(
212
211
  )
213
212
 
214
213
  self.log(
215
- "Adjusted start, end times: "
216
- f"({suggested_start_time}, {suggested_end_time})"
214
+ f"Adjusted start, end times: ({suggested_start_time}, {suggested_end_time})"
217
215
  )
218
216
  return (suggested_start_time, suggested_end_time)
219
217
 
@@ -111,9 +111,9 @@ class StaleEntityRemovalHandler(
111
111
  self.state_type_class = state_type_class
112
112
  self.pipeline_name = pipeline_name
113
113
  self.run_id = run_id
114
- self.stateful_ingestion_config: Optional[
115
- StatefulStaleMetadataRemovalConfig
116
- ] = config.stateful_ingestion
114
+ self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = (
115
+ config.stateful_ingestion
116
+ )
117
117
  self.checkpointing_enabled: bool = (
118
118
  True
119
119
  if (
@@ -70,20 +70,20 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
70
70
  self.orchestrator_name, pipeline_name, job_name
71
71
  )
72
72
 
73
- latest_checkpoint: Optional[
74
- DatahubIngestionCheckpointClass
75
- ] = self.graph.get_latest_timeseries_value(
76
- entity_urn=data_job_urn,
77
- aspect_type=DatahubIngestionCheckpointClass,
78
- filter_criteria_map={
79
- "pipelineName": pipeline_name,
80
- },
73
+ latest_checkpoint: Optional[DatahubIngestionCheckpointClass] = (
74
+ self.graph.get_latest_timeseries_value(
75
+ entity_urn=data_job_urn,
76
+ aspect_type=DatahubIngestionCheckpointClass,
77
+ filter_criteria_map={
78
+ "pipelineName": pipeline_name,
79
+ },
80
+ )
81
81
  )
82
82
  if latest_checkpoint:
83
83
  logger.debug(
84
84
  f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
85
85
  f" job_name:'{job_name}' found with start_time:"
86
- f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
86
+ f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
87
87
  )
88
88
  return latest_checkpoint
89
89
  else:
@@ -67,7 +67,7 @@ class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
67
67
  logger.debug(
68
68
  f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
69
69
  f" job_name:'{job_name}' found with start_time:"
70
- f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
70
+ f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
71
71
  )
72
72
  return latest_checkpoint
73
73
  else: