acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those package versions.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (157)
  1. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2335 -2337
  2. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +157 -157
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/docker_cli.py +6 -6
  12. datahub/cli/lite_cli.py +2 -2
  13. datahub/cli/migrate.py +3 -3
  14. datahub/cli/specific/assertions_cli.py +3 -3
  15. datahub/cli/timeline_cli.py +1 -1
  16. datahub/configuration/common.py +1 -2
  17. datahub/configuration/config_loader.py +73 -50
  18. datahub/configuration/git.py +2 -2
  19. datahub/configuration/time_window_config.py +10 -5
  20. datahub/emitter/mce_builder.py +4 -8
  21. datahub/emitter/mcp_patch_builder.py +1 -2
  22. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  23. datahub/ingestion/api/report.py +1 -2
  24. datahub/ingestion/api/source_helpers.py +1 -1
  25. datahub/ingestion/extractor/json_schema_util.py +3 -3
  26. datahub/ingestion/extractor/schema_util.py +3 -5
  27. datahub/ingestion/fs/s3_fs.py +3 -3
  28. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  29. datahub/ingestion/graph/client.py +4 -6
  30. datahub/ingestion/run/pipeline.py +8 -7
  31. datahub/ingestion/run/pipeline_config.py +3 -3
  32. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  33. datahub/ingestion/source/abs/source.py +19 -8
  34. datahub/ingestion/source/aws/glue.py +11 -11
  35. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  36. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  37. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  38. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  39. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  40. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  41. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  42. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  43. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  44. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  45. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  46. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  47. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  48. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  49. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  50. datahub/ingestion/source/csv_enricher.py +29 -29
  51. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  52. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  53. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  54. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  55. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  56. datahub/ingestion/source/elastic_search.py +4 -4
  57. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
  58. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  59. datahub/ingestion/source/ge_data_profiler.py +4 -5
  60. datahub/ingestion/source/ge_profiling_config.py +3 -3
  61. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  62. datahub/ingestion/source/identity/azure_ad.py +3 -3
  63. datahub/ingestion/source/identity/okta.py +3 -3
  64. datahub/ingestion/source/kafka/kafka.py +11 -9
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  68. datahub/ingestion/source/looker/looker_common.py +19 -19
  69. datahub/ingestion/source/looker/looker_config.py +3 -3
  70. datahub/ingestion/source/looker/looker_source.py +25 -25
  71. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  72. datahub/ingestion/source/looker/looker_usage.py +5 -7
  73. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  74. datahub/ingestion/source/looker/lookml_source.py +13 -15
  75. datahub/ingestion/source/looker/view_upstream.py +5 -5
  76. datahub/ingestion/source/mlflow.py +4 -4
  77. datahub/ingestion/source/mongodb.py +6 -4
  78. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  79. datahub/ingestion/source/nifi.py +24 -26
  80. datahub/ingestion/source/openapi.py +9 -9
  81. datahub/ingestion/source/powerbi/config.py +12 -12
  82. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  83. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  84. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  85. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  87. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  88. datahub/ingestion/source/redshift/config.py +3 -3
  89. datahub/ingestion/source/redshift/redshift.py +12 -12
  90. datahub/ingestion/source/redshift/usage.py +8 -8
  91. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  92. datahub/ingestion/source/s3/source.py +1 -1
  93. datahub/ingestion/source/salesforce.py +26 -25
  94. datahub/ingestion/source/schema/json_schema.py +1 -1
  95. datahub/ingestion/source/sigma/sigma.py +3 -3
  96. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  97. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  98. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  99. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  100. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  101. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  102. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  103. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  104. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  105. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  106. datahub/ingestion/source/sql/athena.py +1 -3
  107. datahub/ingestion/source/sql/clickhouse.py +8 -14
  108. datahub/ingestion/source/sql/oracle.py +1 -3
  109. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  110. datahub/ingestion/source/sql/teradata.py +16 -3
  111. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  112. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  113. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  114. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  115. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  116. datahub/ingestion/source/tableau/tableau.py +48 -49
  117. datahub/ingestion/source/unity/config.py +3 -1
  118. datahub/ingestion/source/unity/proxy.py +1 -1
  119. datahub/ingestion/source/unity/source.py +3 -3
  120. datahub/ingestion/source/unity/usage.py +3 -1
  121. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  122. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  123. datahub/ingestion/source/usage/usage_common.py +1 -1
  124. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  125. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  126. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  127. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  128. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  129. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  130. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  131. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  132. datahub/lite/duckdb_lite.py +12 -10
  133. datahub/metadata/_schema_classes.py +1 -1
  134. datahub/metadata/schema.avsc +6 -2
  135. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  136. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  137. datahub/secret/secret_common.py +14 -8
  138. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  139. datahub/sql_parsing/schema_resolver.py +5 -10
  140. datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
  141. datahub/sql_parsing/sqlglot_lineage.py +5 -4
  142. datahub/sql_parsing/sqlglot_utils.py +3 -2
  143. datahub/telemetry/stats.py +1 -2
  144. datahub/testing/mcp_diff.py +1 -1
  145. datahub/utilities/file_backed_collections.py +10 -10
  146. datahub/utilities/hive_schema_to_avro.py +2 -2
  147. datahub/utilities/logging_manager.py +2 -2
  148. datahub/utilities/lossy_collections.py +3 -3
  149. datahub/utilities/mapping.py +3 -3
  150. datahub/utilities/serialized_lru_cache.py +3 -1
  151. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  152. datahub/utilities/sqllineage_patch.py +1 -1
  153. datahub/utilities/stats_collections.py +3 -1
  154. datahub/utilities/urns/urn_iter.py +2 -2
  155. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
  156. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
  157. {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
@@ -281,9 +281,9 @@ class TableauConnectionConfig(ConfigModel):
         return authentication
 
     def make_tableau_client(self, site: str) -> Server:
-        authentication: Union[
-            TableauAuth, PersonalAccessTokenAuth
-        ] = self.get_tableau_auth(site)
+        authentication: Union[TableauAuth, PersonalAccessTokenAuth] = (
+            self.get_tableau_auth(site)
+        )
         try:
             server = Server(
                 self.connect_uri,
@@ -635,7 +635,7 @@ class TableauConfig(
         project_path_pattern = values.get("project_path_pattern")
         if project_pattern is None and project_path_pattern is None and projects:
             logger.warning(
-                "projects is deprecated, please use " "project_path_pattern instead."
+                "projects is deprecated, please use project_path_pattern instead."
             )
             logger.info("Initializing project_pattern from projects")
             values["project_pattern"] = AllowDenyPattern(
@@ -708,18 +708,18 @@ class DatabaseTable:
     """
 
     urn: str
-    id: Optional[
-        str
-    ] = None  # is not None only for tables that came from Tableau metadata
+    id: Optional[str] = (
+        None  # is not None only for tables that came from Tableau metadata
+    )
     num_cols: Optional[int] = None
 
-    paths: Optional[
-        Set[str]
-    ] = None  # maintains all browse paths encountered for this table
+    paths: Optional[Set[str]] = (
+        None  # maintains all browse paths encountered for this table
+    )
 
-    parsed_columns: Optional[
-        Set[str]
-    ] = None  # maintains all columns encountered for this table during parsing SQL queries
+    parsed_columns: Optional[Set[str]] = (
+        None  # maintains all columns encountered for this table during parsing SQL queries
+    )
 
     def update_table(
         self,
@@ -2310,8 +2310,7 @@ class TableauSiteSource:
             c.EMBEDDED_DATA_SOURCE,
         ):
             logger.debug(
-                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is "
-                f"unsupported"
+                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is unsupported"
             )
             return None
 
@@ -2493,9 +2492,9 @@ class TableauSiteSource:
     def _enrich_database_tables_with_parsed_schemas(
         self, parsing_result: SqlParsingResult
     ) -> None:
-        in_tables_schemas: Dict[
-            str, Set[str]
-        ] = transform_parsing_result_to_in_tables_schemas(parsing_result)
+        in_tables_schemas: Dict[str, Set[str]] = (
+            transform_parsing_result_to_in_tables_schemas(parsing_result)
+        )
 
         if not in_tables_schemas:
             logger.info("Unable to extract table schema from parsing result")
@@ -3559,25 +3558,25 @@ class TableauSiteSource:
 
         generated_project_keys.add(project_key.guid())
 
-        parent_project_key: Optional[
-            Union[ProjectKey, SiteKey]
-        ] = None  # It is going
+        parent_project_key: Optional[Union[ProjectKey, SiteKey]] = (
+            None  # It is going
+        )
         # to be used as a parent container key for the current tableau project
 
         if project_.parent_id is not None:
             # Go to the parent project as we need to generate container first for parent
             parent_project_key = self.gen_project_key(project_.parent_id)
 
-            parent_tableau_project: Optional[
-                TableauProject
-            ] = self.tableau_project_registry.get(project_.parent_id)
+            parent_tableau_project: Optional[TableauProject] = (
+                self.tableau_project_registry.get(project_.parent_id)
+            )
 
             if (
                 parent_tableau_project is None
             ):  # It is not in project registry because of project_pattern
-                assert (
-                    project_.parent_name
-                ), f"project {project_.name} should not be null"
+                assert project_.parent_name, (
+                    f"project {project_.name} should not be null"
+                )
                 parent_tableau_project = TableauProject(
                     id=project_.parent_id,
                     name=project_.parent_name,
@@ -3605,7 +3604,7 @@ class TableauSiteSource:
             parent_container_key=parent_project_key,
         )
 
-        for id_, project in self.tableau_project_registry.items():
+        for project in self.tableau_project_registry.values():
             logger.debug(
                 f"project {project.name} and it's parent {project.parent_name} and parent id {project.parent_id}"
             )
@@ -3669,16 +3668,16 @@ class TableauSiteSource:
         if self.config.extract_usage_stats:
             with PerfTimer() as timer:
                 self._populate_usage_stat_registry()
-                self.report.extract_usage_stats_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.extract_usage_stats_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
 
         if self.config.permission_ingestion:
             with PerfTimer() as timer:
                 self._fetch_groups()
-                self.report.fetch_groups_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.fetch_groups_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
 
         # Populate the map of database names and database hostnames to be used later to map
         # databases to platform instances.
@@ -3691,9 +3690,9 @@ class TableauSiteSource:
 
         with PerfTimer() as timer:
             self._populate_projects_registry()
-            self.report.populate_projects_registry_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.populate_projects_registry_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )
 
         if self.config.add_site_container:
             yield from self.emit_site_container()
@@ -3701,23 +3700,23 @@ class TableauSiteSource:
 
         with PerfTimer() as timer:
             yield from self.emit_workbooks()
-            self.report.emit_workbooks_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.emit_workbooks_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )
 
         if self.sheet_ids:
             with PerfTimer() as timer:
                 yield from self.emit_sheets()
-                self.report.emit_sheets_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_sheets_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
 
         if self.dashboard_ids:
             with PerfTimer() as timer:
                 yield from self.emit_dashboards()
-                self.report.emit_dashboards_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_dashboards_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
 
         if self.embedded_datasource_ids_being_used:
             with PerfTimer() as timer:
@@ -3743,6 +3742,6 @@ class TableauSiteSource:
         if self.database_tables:
             with PerfTimer() as timer:
                 yield from self.emit_upstream_tables()
-                self.report.emit_upstream_tables_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_upstream_tables_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
     )
 
     # TODO: Remove `type:ignore` by refactoring config
-    profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field(  # type: ignore
+    profiling: Union[
+        UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+    ] = Field(  # type: ignore
         default=UnityCatalogGEProfilerConfig(),
         description="Data profiling configuration",
         discriminator="method",
@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
 
     @staticmethod
     def _create_metastore(
-        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
     ) -> Optional[Metastore]:
         if not obj.name:
             return None
@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[
-            Tuple[TableReference, str]
-        ] = FileBackedDict()
+        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+            FileBackedDict()
+        )
 
         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
                 query, table_info
             )
             for source_table in table_info.source_tables:
-                with self.report.usage_perf_report.aggregator_add_event_timer:
+                with (
+                    self.report.usage_perf_report.aggregator_add_event_timer
+                ):
                     self.usage_aggregator.aggregate_event(
                         resource=source_table,
                         start_time=query.start_time,
@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[ClickHouseJoinedAccessEvent]
     ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[ClickHouseTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
 
             resource = (
-                f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
+                f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                 f"{event.database}.{event.table}"
             )
 
@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[TrinoJoinedAccessEvent]
    ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[TrinoTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
@@ -89,7 +89,7 @@ def make_usage_workunit(
     top_sql_queries: Optional[List[str]] = None
     if query_freq is not None:
         if top_n_queries < len(query_freq):
-            logger.warn(
+            logger.warning(
                 f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
             )
             query_freq = query_freq[0:top_n_queries]
@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
                     ).add_asset(container_urn)
                     data_products_container[data_product_urn] = container_product
                 else:
-                    data_products_container[
-                        data_product_urn
-                    ] = data_products_container[data_product_urn].add_asset(
-                        container_urn
+                    data_products_container[data_product_urn] = (
+                        data_products_container[data_product_urn].add_asset(
+                            container_urn
+                        )
                     )
 
         mcps: List[
@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
     ) -> Optional[DatasetPropertiesClass]:
         assert dataset_properties_aspect
 
-        server_dataset_properties_aspect: Optional[
-            DatasetPropertiesClass
-        ] = graph.get_dataset_properties(entity_urn)
+        server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+            graph.get_dataset_properties(entity_urn)
+        )
         # No need to take any action if server properties is None or there is not customProperties in server properties
         if (
             server_dataset_properties_aspect is None
@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
         server_field_map: dict = {}
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect
@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
         ] = {}  # Map to cache server field objects, where fieldPath is key
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect
@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
             domain_aspect.domains.extend(mapped_domains.domains)
             if self.config.semantics == TransformerSemantics.PATCH:
                 # Try merging with server-side domains
-                patch_domain_aspect: Optional[
-                    DomainsClass
-                ] = AddDatasetDomain._merge_with_server_domains(
-                    self.ctx.graph, entity_urn, domain_aspect
+                patch_domain_aspect: Optional[DomainsClass] = (
+                    AddDatasetDomain._merge_with_server_domains(
+                        self.ctx.graph, entity_urn, domain_aspect
+                    )
                 )
                 return cast(Optional[Aspect], patch_domain_aspect)
             return cast(Optional[Aspect], domain_aspect)
@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
             else:
                 owner_type = get_owner_type(self.config.owner_type)
                 if owner_type == OwnershipTypeClass.CUSTOM:
-                    assert (
-                        self.config.owner_type_urn is not None
-                    ), "owner_type_urn must be set if owner_type is CUSTOM"
+                    assert self.config.owner_type_urn is not None, (
+                        "owner_type_urn must be set if owner_type is CUSTOM"
+                    )
 
                 owners.append(
                     OwnerClass(
@@ -92,9 +92,9 @@ class TagsToTermMapper(TagsToTermTransformer):
         in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags(
             entity_urn
         )
-        in_schema_metadata_aspect: Optional[
-            SchemaMetadataClass
-        ] = self.ctx.graph.get_schema_metadata(entity_urn)
+        in_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+            self.ctx.graph.get_schema_metadata(entity_urn)
+        )
 
         if in_global_tags_aspect is None and in_schema_metadata_aspect is None:
             return cast(Aspect, in_glossary_terms)
@@ -134,10 +134,10 @@ class TagsToTermMapper(TagsToTermTransformer):
         )
 
         if self.config.semantics == TransformerSemantics.PATCH:
-            patch_glossary_terms: Optional[
-                GlossaryTermsClass
-            ] = TagsToTermMapper._merge_with_server_glossary_terms(
-                self.ctx.graph, entity_urn, out_glossary_terms
+            patch_glossary_terms: Optional[GlossaryTermsClass] = (
+                TagsToTermMapper._merge_with_server_glossary_terms(
+                    self.ctx.graph, entity_urn, out_glossary_terms
+                )
             )
             return cast(Optional[Aspect], patch_glossary_terms)
         else:
@@ -61,17 +61,17 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
     def create(
         cls, output_dir: str, extras: Dict[str, str]
     ) -> "SnowflakeAssertionCompiler":
-        assert os.path.exists(
-            output_dir
-        ), f"Specified location {output_dir} does not exist."
+        assert os.path.exists(output_dir), (
+            f"Specified location {output_dir} does not exist."
+        )
 
-        assert os.path.isdir(
-            output_dir
-        ), f"Specified location {output_dir} is not a folder."
+        assert os.path.isdir(output_dir), (
+            f"Specified location {output_dir} is not a folder."
+        )
 
-        assert any(
-            x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras
-        ), "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        assert any(x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras), (
+            "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        )
 
         return SnowflakeAssertionCompiler(output_dir, extras)
 
@@ -232,6 +232,6 @@ def get_dmf_schedule(trigger: AssertionTrigger) -> str:
     elif isinstance(trigger.trigger, CronTrigger):
         return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}"
     elif isinstance(trigger.trigger, IntervalTrigger):
-        return f"{trigger.trigger.interval.seconds/60} MIN"
+        return f"{trigger.trigger.interval.seconds / 60} MIN"
     else:
         raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}")
@@ -163,9 +163,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
 
                 if "properties" not in writeable_dict["systemMetadata"]:
                     writeable_dict["systemMetadata"]["properties"] = {}
-                writeable_dict["systemMetadata"]["properties"][
-                    "sysVersion"
-                ] = new_version
+                writeable_dict["systemMetadata"]["properties"]["sysVersion"] = (
+                    new_version
+                )
             if needs_write:
                 self.duckdb_client.execute(
                     query="INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?, ?, ?)",
@@ -208,9 +208,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                     "lastObserved": writeable.systemMetadata.lastObserved
                 }
             else:
-                system_metadata[
-                    "lastObserved"
-                ] = writeable.systemMetadata.lastObserved
+                system_metadata["lastObserved"] = (
+                    writeable.systemMetadata.lastObserved
+                )
             self.duckdb_client.execute(
                 query="UPDATE metadata_aspect_v2 SET system_metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0",
                 parameters=[
@@ -497,9 +497,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             aspect_name = r[1]
             aspect_payload = json.loads(r[2])
             if typed:
-                assert (
-                    aspect_name in ASPECT_MAP
-                ), f"Missing aspect name {aspect_name} in the registry"
+                assert aspect_name in ASPECT_MAP, (
+                    f"Missing aspect name {aspect_name} in the registry"
+                )
                 try:
                     aspect_payload = ASPECT_MAP[aspect_name].from_obj(
                         post_json_transform(aspect_payload)
@@ -531,7 +531,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         for r in results.fetchall():
             urn = r[0]
             aspect_name = r[1]
-            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2])))  # type: ignore
+            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(
+                post_json_transform(json.loads(r[2]))
+            )  # type: ignore
             system_metadata = SystemMetadataClass.from_obj(json.loads(r[3]))
             mcp = MetadataChangeProposalWrapper(
                 entityUrn=urn,
@@ -9096,7 +9096,7 @@ class DataProcessInstanceInputClass(_Aspect):
 
     @property
     def inputs(self) -> List[str]:
-        """Input datasets to be consumed"""
+        """Input assets consumed"""
         return self._inner_dict.get('inputs')  # type: ignore
 
     @inputs.setter
@@ -12699,8 +12699,10 @@
             "Relationship": {
               "/*": {
                 "entityTypes": [
-                  "dataset"
+                  "dataset",
+                  "mlModel"
                 ],
+                "isLineage": true,
                 "name": "Consumes"
               }
             },
@@ -12720,7 +12722,7 @@
               "items": "string"
             },
             "name": "inputs",
-            "doc": "Input datasets to be consumed"
+            "doc": "Input assets consumed"
           }
         ],
         "doc": "Information about the inputs datasets of a Data process"
@@ -12883,6 +12885,8 @@
                 "dataset",
                 "mlModel"
               ],
+              "isLineage": true,
+              "isUpstream": false,
              "name": "Produces"
             }
           },
@@ -10,8 +10,10 @@
     "Relationship": {
       "/*": {
         "entityTypes": [
-          "dataset"
+          "dataset",
+          "mlModel"
         ],
+        "isLineage": true,
        "name": "Consumes"
       }
     },
@@ -29,7 +31,7 @@
       "items": "string"
     },
     "name": "inputs",
-    "doc": "Input datasets to be consumed",
+    "doc": "Input assets consumed",
     "Urn": "Urn",
     "urn_is_array": true
   }
@@ -13,6 +13,8 @@
         "dataset",
         "mlModel"
       ],
+      "isLineage": true,
+      "isUpstream": false,
       "name": "Produces"
     }
   },
@@ -2,10 +2,7 @@ import json
 import logging
 from typing import List
 
-from datahub.configuration.config_loader import (
-    list_referenced_env_variables,
-    resolve_env_variables,
-)
+from datahub.configuration.config_loader import EnvResolver
 from datahub.secret.secret_store import SecretStore
 
 logger = logging.getLogger(__name__)
@@ -42,18 +39,27 @@ def resolve_secrets(secret_names: List[str], secret_stores: List[SecretStore]) -
     return final_secret_values
 
 
-def resolve_recipe(recipe: str, secret_stores: List[SecretStore]) -> dict:
+def resolve_recipe(
+    recipe: str, secret_stores: List[SecretStore], strict_env_syntax: bool = True
+) -> dict:
+    # Note: the default for `strict_env_syntax` is normally False, but here we override
+    # it to be true. Particularly when fetching secrets from external secret stores, we
+    # want to be more careful about not over-fetching secrets.
+
     json_recipe_raw = json.loads(recipe)
 
     # 1. Extract all secrets needing resolved.
-    secrets_to_resolve = list_referenced_env_variables(json_recipe_raw)
+    secrets_to_resolve = EnvResolver.list_referenced_variables(
+        json_recipe_raw, strict_env_syntax=strict_env_syntax
+    )
 
     # 2. Resolve secret values
     secret_values_dict = resolve_secrets(list(secrets_to_resolve), secret_stores)
 
     # 3. Substitute secrets into recipe file
-    json_recipe_resolved = resolve_env_variables(
-        json_recipe_raw, environ=secret_values_dict
+    resolver = EnvResolver(
+        environ=secret_values_dict, strict_env_syntax=strict_env_syntax
     )
+    json_recipe_resolved = resolver.resolve(json_recipe_raw)
 
     return json_recipe_resolved
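For context, this refactor replaces the module-level list_referenced_env_variables/resolve_env_variables helpers with the EnvResolver class from datahub.configuration.config_loader. A minimal sketch of the new flow, reusing only the EnvResolver calls visible in the hunk above; the inline recipe and the plain dict of secret values are hypothetical stand-ins for real SecretStore lookups:

    import json

    from datahub.configuration.config_loader import EnvResolver

    recipe_raw = json.loads('{"source": {"config": {"password": "${DB_PASSWORD}"}}}')

    # 1. With strict_env_syntax=True, only explicit ${VAR}-style references are
    #    collected, which (per the note in the hunk) avoids over-fetching secrets.
    needed = EnvResolver.list_referenced_variables(recipe_raw, strict_env_syntax=True)

    # 2. Look up just those secrets (a plain dict stands in for SecretStore lookups).
    secret_values = {name: f"resolved-{name}" for name in needed}

    # 3. Substitute the resolved values back into the recipe.
    resolver = EnvResolver(environ=secret_values, strict_env_syntax=True)
    recipe_resolved = resolver.resolve(recipe_raw)
    print(recipe_resolved)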
@@ -9,8 +9,7 @@ from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
 class HasCustomPropertiesPatch(MetadataPatchProposal):
     @classmethod
     @abstractmethod
-    def _custom_properties_location(self) -> Tuple[str, PatchPath]:
-        ...
+    def _custom_properties_location(self) -> Tuple[str, PatchPath]: ...
 
     def add_custom_property(self, key: str, value: str) -> Self:
         """Add a custom property to the entity.