acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (168)
  1. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
  2. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/assertion/assertion_operator.py +3 -5
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  7. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  8. datahub/api/entities/dataset/dataset.py +2 -1
  9. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  10. datahub/cli/cli_utils.py +1 -1
  11. datahub/cli/docker_cli.py +6 -6
  12. datahub/cli/ingest_cli.py +25 -15
  13. datahub/cli/lite_cli.py +2 -2
  14. datahub/cli/migrate.py +3 -3
  15. datahub/cli/specific/assertions_cli.py +3 -3
  16. datahub/cli/timeline_cli.py +1 -1
  17. datahub/configuration/common.py +1 -2
  18. datahub/configuration/config_loader.py +73 -50
  19. datahub/configuration/git.py +2 -2
  20. datahub/configuration/time_window_config.py +10 -5
  21. datahub/emitter/mce_builder.py +4 -8
  22. datahub/emitter/mcp_patch_builder.py +1 -2
  23. datahub/entrypoints.py +6 -0
  24. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  25. datahub/ingestion/api/report.py +1 -2
  26. datahub/ingestion/api/source_helpers.py +1 -1
  27. datahub/ingestion/extractor/json_schema_util.py +3 -3
  28. datahub/ingestion/extractor/schema_util.py +3 -5
  29. datahub/ingestion/fs/s3_fs.py +3 -3
  30. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  31. datahub/ingestion/graph/client.py +4 -6
  32. datahub/ingestion/run/pipeline.py +8 -7
  33. datahub/ingestion/run/pipeline_config.py +3 -3
  34. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  35. datahub/ingestion/source/abs/source.py +19 -8
  36. datahub/ingestion/source/aws/glue.py +11 -11
  37. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  38. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  39. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  40. datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
  41. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  42. datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
  43. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  44. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
  45. datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
  46. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  47. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  48. datahub/ingestion/source/bigquery_v2/usage.py +3 -3
  49. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  50. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
  51. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  52. datahub/ingestion/source/csv_enricher.py +29 -29
  53. datahub/ingestion/source/datahub/config.py +4 -0
  54. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  55. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  56. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  57. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  58. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  59. datahub/ingestion/source/elastic_search.py +4 -4
  60. datahub/ingestion/source/gc/datahub_gc.py +1 -0
  61. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
  62. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  63. datahub/ingestion/source/ge_data_profiler.py +2 -5
  64. datahub/ingestion/source/ge_profiling_config.py +3 -3
  65. datahub/ingestion/source/iceberg/iceberg.py +3 -3
  66. datahub/ingestion/source/identity/azure_ad.py +3 -3
  67. datahub/ingestion/source/identity/okta.py +3 -3
  68. datahub/ingestion/source/kafka/kafka.py +11 -9
  69. datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  72. datahub/ingestion/source/looker/looker_common.py +19 -19
  73. datahub/ingestion/source/looker/looker_config.py +3 -3
  74. datahub/ingestion/source/looker/looker_source.py +25 -25
  75. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  76. datahub/ingestion/source/looker/looker_usage.py +5 -7
  77. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  78. datahub/ingestion/source/looker/lookml_source.py +13 -15
  79. datahub/ingestion/source/looker/view_upstream.py +5 -5
  80. datahub/ingestion/source/mlflow.py +4 -4
  81. datahub/ingestion/source/mode.py +5 -5
  82. datahub/ingestion/source/mongodb.py +6 -4
  83. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  84. datahub/ingestion/source/nifi.py +24 -26
  85. datahub/ingestion/source/openapi.py +9 -9
  86. datahub/ingestion/source/powerbi/config.py +12 -12
  87. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  88. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  89. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  90. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  91. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  92. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  93. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  94. datahub/ingestion/source/redshift/config.py +3 -3
  95. datahub/ingestion/source/redshift/redshift.py +12 -12
  96. datahub/ingestion/source/redshift/usage.py +8 -8
  97. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  98. datahub/ingestion/source/s3/source.py +1 -1
  99. datahub/ingestion/source/salesforce.py +26 -25
  100. datahub/ingestion/source/schema/json_schema.py +1 -1
  101. datahub/ingestion/source/sigma/sigma.py +3 -3
  102. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  103. datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
  104. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  105. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  106. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
  107. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
  108. datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
  109. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
  110. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
  112. datahub/ingestion/source/sql/athena.py +1 -3
  113. datahub/ingestion/source/sql/clickhouse.py +8 -14
  114. datahub/ingestion/source/sql/oracle.py +1 -3
  115. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  116. datahub/ingestion/source/sql/sql_types.py +0 -1
  117. datahub/ingestion/source/sql/teradata.py +16 -3
  118. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  119. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  120. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  121. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  122. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  123. datahub/ingestion/source/tableau/tableau.py +245 -101
  124. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  125. datahub/ingestion/source/unity/config.py +3 -1
  126. datahub/ingestion/source/unity/proxy.py +1 -1
  127. datahub/ingestion/source/unity/source.py +3 -3
  128. datahub/ingestion/source/unity/usage.py +3 -1
  129. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  130. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  131. datahub/ingestion/source/usage/usage_common.py +1 -1
  132. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  133. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  134. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  135. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  136. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  137. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  138. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  139. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  140. datahub/lite/duckdb_lite.py +12 -10
  141. datahub/metadata/_schema_classes.py +1 -1
  142. datahub/metadata/schema.avsc +6 -2
  143. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  144. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  145. datahub/secret/datahub_secrets_client.py +12 -21
  146. datahub/secret/secret_common.py +14 -8
  147. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  148. datahub/sql_parsing/schema_resolver.py +5 -10
  149. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  150. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  151. datahub/sql_parsing/sqlglot_utils.py +1 -1
  152. datahub/telemetry/stats.py +1 -2
  153. datahub/testing/mcp_diff.py +1 -1
  154. datahub/utilities/file_backed_collections.py +10 -10
  155. datahub/utilities/hive_schema_to_avro.py +2 -2
  156. datahub/utilities/logging_manager.py +2 -2
  157. datahub/utilities/lossy_collections.py +3 -3
  158. datahub/utilities/mapping.py +3 -3
  159. datahub/utilities/memory_footprint.py +3 -2
  160. datahub/utilities/serialized_lru_cache.py +3 -1
  161. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  162. datahub/utilities/sqllineage_patch.py +1 -1
  163. datahub/utilities/stats_collections.py +3 -1
  164. datahub/utilities/urns/_urn_base.py +28 -5
  165. datahub/utilities/urns/urn_iter.py +2 -2
  166. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
  167. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
  168. {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/usage/usage_common.py
@@ -89,7 +89,7 @@ def make_usage_workunit(
  top_sql_queries: Optional[List[str]] = None
  if query_freq is not None:
  if top_n_queries < len(query_freq):
- logger.warn(
+ logger.warning(
  f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
  )
  query_freq = query_freq[0:top_n_queries]
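
For context, Logger.warn is a deprecated alias of Logger.warning in Python's standard logging module, so this rename is behavior-preserving. A minimal sketch (the message and values here are illustrative):

import logging

logger = logging.getLogger(__name__)

# logger.warn() is a deprecated alias; logger.warning() is the supported
# spelling and logs at the same WARNING level.
logger.warning("Top N query limit exceeded; truncating to %d queries", 10)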

datahub/ingestion/transformer/add_dataset_dataproduct.py
@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
  ).add_asset(container_urn)
  data_products_container[data_product_urn] = container_product
  else:
- data_products_container[
- data_product_urn
- ] = data_products_container[data_product_urn].add_asset(
- container_urn
+ data_products_container[data_product_urn] = (
+ data_products_container[data_product_urn].add_asset(
+ container_urn
+ )
  )

  mcps: List[

datahub/ingestion/transformer/add_dataset_properties.py
@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
  ) -> Optional[DatasetPropertiesClass]:
  assert dataset_properties_aspect

- server_dataset_properties_aspect: Optional[
- DatasetPropertiesClass
- ] = graph.get_dataset_properties(entity_urn)
+ server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+ graph.get_dataset_properties(entity_urn)
+ )
  # No need to take any action if server properties is None or there is not customProperties in server properties
  if (
  server_dataset_properties_aspect is None

datahub/ingestion/transformer/add_dataset_schema_tags.py
@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
  server_field_map: dict = {}
  if self.config.semantics == TransformerSemantics.PATCH:
  assert self.ctx.graph
- server_schema_metadata_aspect: Optional[
- SchemaMetadataClass
- ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+ server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+ self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+ )
  if server_schema_metadata_aspect is not None:
  if not schema_metadata_aspect:
  schema_metadata_aspect = server_schema_metadata_aspect

datahub/ingestion/transformer/add_dataset_schema_terms.py
@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
  ] = {} # Map to cache server field objects, where fieldPath is key
  if self.config.semantics == TransformerSemantics.PATCH:
  assert self.ctx.graph
- server_schema_metadata_aspect: Optional[
- SchemaMetadataClass
- ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+ server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+ self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+ )
  if server_schema_metadata_aspect is not None:
  if not schema_metadata_aspect:
  schema_metadata_aspect = server_schema_metadata_aspect

datahub/ingestion/transformer/dataset_domain_based_on_tags.py
@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
  domain_aspect.domains.extend(mapped_domains.domains)
  if self.config.semantics == TransformerSemantics.PATCH:
  # Try merging with server-side domains
- patch_domain_aspect: Optional[
- DomainsClass
- ] = AddDatasetDomain._merge_with_server_domains(
- self.ctx.graph, entity_urn, domain_aspect
+ patch_domain_aspect: Optional[DomainsClass] = (
+ AddDatasetDomain._merge_with_server_domains(
+ self.ctx.graph, entity_urn, domain_aspect
+ )
  )
  return cast(Optional[Aspect], patch_domain_aspect)
  return cast(Optional[Aspect], domain_aspect)

datahub/ingestion/transformer/extract_ownership_from_tags.py
@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
  else:
  owner_type = get_owner_type(self.config.owner_type)
  if owner_type == OwnershipTypeClass.CUSTOM:
- assert (
- self.config.owner_type_urn is not None
- ), "owner_type_urn must be set if owner_type is CUSTOM"
+ assert self.config.owner_type_urn is not None, (
+ "owner_type_urn must be set if owner_type is CUSTOM"
+ )

  owners.append(
  OwnerClass(

datahub/ingestion/transformer/tags_to_terms.py
@@ -92,9 +92,9 @@ class TagsToTermMapper(TagsToTermTransformer):
  in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags(
  entity_urn
  )
- in_schema_metadata_aspect: Optional[
- SchemaMetadataClass
- ] = self.ctx.graph.get_schema_metadata(entity_urn)
+ in_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+ self.ctx.graph.get_schema_metadata(entity_urn)
+ )

  if in_global_tags_aspect is None and in_schema_metadata_aspect is None:
  return cast(Aspect, in_glossary_terms)
@@ -134,10 +134,10 @@ class TagsToTermMapper(TagsToTermTransformer):
  )

  if self.config.semantics == TransformerSemantics.PATCH:
- patch_glossary_terms: Optional[
- GlossaryTermsClass
- ] = TagsToTermMapper._merge_with_server_glossary_terms(
- self.ctx.graph, entity_urn, out_glossary_terms
+ patch_glossary_terms: Optional[GlossaryTermsClass] = (
+ TagsToTermMapper._merge_with_server_glossary_terms(
+ self.ctx.graph, entity_urn, out_glossary_terms
+ )
  )
  return cast(Optional[Aspect], patch_glossary_terms)
  else:

datahub/integrations/assertion/snowflake/compiler.py
@@ -61,17 +61,17 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
  def create(
  cls, output_dir: str, extras: Dict[str, str]
  ) -> "SnowflakeAssertionCompiler":
- assert os.path.exists(
- output_dir
- ), f"Specified location {output_dir} does not exist."
+ assert os.path.exists(output_dir), (
+ f"Specified location {output_dir} does not exist."
+ )

- assert os.path.isdir(
- output_dir
- ), f"Specified location {output_dir} is not a folder."
+ assert os.path.isdir(output_dir), (
+ f"Specified location {output_dir} is not a folder."
+ )

- assert any(
- x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras
- ), "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+ assert any(x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras), (
+ "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+ )

  return SnowflakeAssertionCompiler(output_dir, extras)

@@ -232,6 +232,6 @@ def get_dmf_schedule(trigger: AssertionTrigger) -> str:
  elif isinstance(trigger.trigger, CronTrigger):
  return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}"
  elif isinstance(trigger.trigger, IntervalTrigger):
- return f"{trigger.trigger.interval.seconds/60} MIN"
+ return f"{trigger.trigger.interval.seconds / 60} MIN"
  else:
  raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}")
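
As a worked example of the schedule string above (assuming the interval trigger carries a datetime.timedelta, which is how the expression reads; the value is made up):

from datetime import timedelta

# A 15-minute interval becomes a "<minutes> MIN" schedule string.
interval = timedelta(minutes=15)
print(f"{interval.seconds / 60} MIN")  # prints "15.0 MIN"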

datahub/lite/duckdb_lite.py
@@ -163,9 +163,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):

  if "properties" not in writeable_dict["systemMetadata"]:
  writeable_dict["systemMetadata"]["properties"] = {}
- writeable_dict["systemMetadata"]["properties"][
- "sysVersion"
- ] = new_version
+ writeable_dict["systemMetadata"]["properties"]["sysVersion"] = (
+ new_version
+ )
  if needs_write:
  self.duckdb_client.execute(
  query="INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?, ?, ?)",
@@ -208,9 +208,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
  "lastObserved": writeable.systemMetadata.lastObserved
  }
  else:
- system_metadata[
- "lastObserved"
- ] = writeable.systemMetadata.lastObserved
+ system_metadata["lastObserved"] = (
+ writeable.systemMetadata.lastObserved
+ )
  self.duckdb_client.execute(
  query="UPDATE metadata_aspect_v2 SET system_metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0",
  parameters=[
@@ -497,9 +497,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
  aspect_name = r[1]
  aspect_payload = json.loads(r[2])
  if typed:
- assert (
- aspect_name in ASPECT_MAP
- ), f"Missing aspect name {aspect_name} in the registry"
+ assert aspect_name in ASPECT_MAP, (
+ f"Missing aspect name {aspect_name} in the registry"
+ )
  try:
  aspect_payload = ASPECT_MAP[aspect_name].from_obj(
  post_json_transform(aspect_payload)
@@ -531,7 +531,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
  for r in results.fetchall():
  urn = r[0]
  aspect_name = r[1]
- aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2]))) # type: ignore
+ aspect_metadata = ASPECT_MAP[aspect_name].from_obj(
+ post_json_transform(json.loads(r[2]))
+ ) # type: ignore
  system_metadata = SystemMetadataClass.from_obj(json.loads(r[3]))
  mcp = MetadataChangeProposalWrapper(
  entityUrn=urn,

datahub/metadata/_schema_classes.py
@@ -9096,7 +9096,7 @@ class DataProcessInstanceInputClass(_Aspect):

  @property
  def inputs(self) -> List[str]:
- """Input datasets to be consumed"""
+ """Input assets consumed"""
  return self._inner_dict.get('inputs') # type: ignore

  @inputs.setter

datahub/metadata/schema.avsc
@@ -12699,8 +12699,10 @@
  "Relationship": {
  "/*": {
  "entityTypes": [
- "dataset"
+ "dataset",
+ "mlModel"
  ],
+ "isLineage": true,
  "name": "Consumes"
  }
  },
@@ -12720,7 +12722,7 @@
  "items": "string"
  },
  "name": "inputs",
- "doc": "Input datasets to be consumed"
+ "doc": "Input assets consumed"
  }
  ],
  "doc": "Information about the inputs datasets of a Data process"
@@ -12883,6 +12885,8 @@
  "dataset",
  "mlModel"
  ],
+ "isLineage": true,
+ "isUpstream": false,
  "name": "Produces"
  }
  },

datahub/metadata/schemas/DataProcessInstanceInput.avsc
@@ -10,8 +10,10 @@
  "Relationship": {
  "/*": {
  "entityTypes": [
- "dataset"
+ "dataset",
+ "mlModel"
  ],
+ "isLineage": true,
  "name": "Consumes"
  }
  },
@@ -29,7 +31,7 @@
  "items": "string"
  },
  "name": "inputs",
- "doc": "Input datasets to be consumed",
+ "doc": "Input assets consumed",
  "Urn": "Urn",
  "urn_is_array": true
  }

datahub/metadata/schemas/DataProcessInstanceOutput.avsc
@@ -13,6 +13,8 @@
  "dataset",
  "mlModel"
  ],
+ "isLineage": true,
+ "isUpstream": false,
  "name": "Produces"
  }
  },

datahub/secret/datahub_secrets_client.py
@@ -11,34 +11,25 @@ class DataHubSecretsClient:
  def __init__(self, graph: DataHubGraph):
  self.graph = graph

+ def _cleanup_secret_name(self, secret_names: List[str]) -> List[str]:
+ """Remove empty strings from the list of secret names."""
+ return [secret_name for secret_name in secret_names if secret_name]
+
  def get_secret_values(self, secret_names: List[str]) -> Dict[str, Optional[str]]:
  if len(secret_names) == 0:
  return {}

- request_json = {
- "query": """query getSecretValues($input: GetSecretValuesInput!) {\n
- getSecretValues(input: $input) {\n
- name\n
- value\n
- }\n
+ res_data = self.graph.execute_graphql(
+ query="""query getSecretValues($input: GetSecretValuesInput!) {
+ getSecretValues(input: $input) {
+ name
+ value
+ }
  }""",
- "variables": {"input": {"secrets": secret_names}},
- }
- # TODO: Use graph.execute_graphql() instead.
-
- # Fetch secrets using GraphQL API f
- response = self.graph._session.post(
- f"{self.graph.config.server}/api/graphql", json=request_json
+ variables={"input": {"secrets": self._cleanup_secret_name(secret_names)}},
  )
- response.raise_for_status()
-
- # Verify response
- res_data = response.json()
- if "errors" in res_data:
- raise Exception("Failed to retrieve secrets from DataHub.")
-
  # Convert list of name, value secret pairs into a dict and return
- secret_value_list = res_data["data"]["getSecretValues"]
+ secret_value_list = res_data["getSecretValues"]
  secret_value_dict = dict()
  for secret_value in secret_value_list:
  secret_value_dict[secret_value["name"]] = secret_value["value"]
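
A hedged usage sketch of the rewritten client; the server URL and secret names are placeholders, and the import paths are inferred from the file list above rather than confirmed by this diff:

from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.secret.datahub_secrets_client import DataHubSecretsClient

# Placeholder connection details; adjust for your deployment.
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
client = DataHubSecretsClient(graph)

# Empty names are now filtered out before the call, and the request goes through
# graph.execute_graphql() instead of a hand-rolled POST to /api/graphql.
secrets = client.get_secret_values(["SNOWFLAKE_PASSWORD", ""])
print(secrets)  # e.g. {"SNOWFLAKE_PASSWORD": "..."}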

datahub/secret/secret_common.py
@@ -2,10 +2,7 @@ import json
  import logging
  from typing import List

- from datahub.configuration.config_loader import (
- list_referenced_env_variables,
- resolve_env_variables,
- )
+ from datahub.configuration.config_loader import EnvResolver
  from datahub.secret.secret_store import SecretStore

  logger = logging.getLogger(__name__)
@@ -42,18 +39,27 @@ def resolve_secrets(secret_names: List[str], secret_stores: List[SecretStore]) -
  return final_secret_values


- def resolve_recipe(recipe: str, secret_stores: List[SecretStore]) -> dict:
+ def resolve_recipe(
+ recipe: str, secret_stores: List[SecretStore], strict_env_syntax: bool = True
+ ) -> dict:
+ # Note: the default for `strict_env_syntax` is normally False, but here we override
+ # it to be true. Particularly when fetching secrets from external secret stores, we
+ # want to be more careful about not over-fetching secrets.
+
  json_recipe_raw = json.loads(recipe)

  # 1. Extract all secrets needing resolved.
- secrets_to_resolve = list_referenced_env_variables(json_recipe_raw)
+ secrets_to_resolve = EnvResolver.list_referenced_variables(
+ json_recipe_raw, strict_env_syntax=strict_env_syntax
+ )

  # 2. Resolve secret values
  secret_values_dict = resolve_secrets(list(secrets_to_resolve), secret_stores)

  # 3. Substitute secrets into recipe file
- json_recipe_resolved = resolve_env_variables(
- json_recipe_raw, environ=secret_values_dict
+ resolver = EnvResolver(
+ environ=secret_values_dict, strict_env_syntax=strict_env_syntax
  )
+ json_recipe_resolved = resolver.resolve(json_recipe_raw)

  return json_recipe_resolved
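
A sketch of the new resolver flow, based only on the calls visible in this hunk; the recipe and secret values below are made up:

from datahub.configuration.config_loader import EnvResolver

recipe = {"source": {"type": "snowflake", "config": {"password": "${SNOWFLAKE_PASSWORD}"}}}

# strict_env_syntax=True is the override described in the hunk's comment: keep
# variable detection conservative so external secret stores are not over-fetched.
referenced = EnvResolver.list_referenced_variables(recipe, strict_env_syntax=True)

resolver = EnvResolver(environ={"SNOWFLAKE_PASSWORD": "hunter2"}, strict_env_syntax=True)
resolved = resolver.resolve(recipe)
# resolved["source"]["config"]["password"] == "hunter2"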

datahub/specific/aspect_helpers/custom_properties.py
@@ -9,8 +9,7 @@ from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
  class HasCustomPropertiesPatch(MetadataPatchProposal):
  @classmethod
  @abstractmethod
- def _custom_properties_location(self) -> Tuple[str, PatchPath]:
- ...
+ def _custom_properties_location(self) -> Tuple[str, PatchPath]: ...

  def add_custom_property(self, key: str, value: str) -> Self:
  """Add a custom property to the entity.

datahub/sql_parsing/schema_resolver.py
@@ -33,14 +33,11 @@ class GraphQLSchemaMetadata(TypedDict):

  class SchemaResolverInterface(Protocol):
  @property
- def platform(self) -> str:
- ...
+ def platform(self) -> str: ...

- def includes_temp_tables(self) -> bool:
- ...
+ def includes_temp_tables(self) -> bool: ...

- def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]:
- ...
+ def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: ...

  def __hash__(self) -> int:
  # Mainly to make lru_cache happy in methods that accept a schema resolver.
@@ -232,8 +229,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
  return {
  get_simple_field_path_from_v2_field_path(field["fieldPath"]): (
  # The actual types are more of a "nice to have".
- field["nativeDataType"]
- or "str"
+ field["nativeDataType"] or "str"
  )
  for field in schema["fields"]
  # TODO: We can't generate lineage to columns nested within structs yet.
@@ -289,8 +285,7 @@ def _convert_schema_field_list_to_info(
  return {
  get_simple_field_path_from_v2_field_path(col.fieldPath): (
  # The actual types are more of a "nice to have".
- col.nativeDataType
- or "str"
+ col.nativeDataType or "str"
  )
  for col in schema_fields
  # TODO: We can't generate lineage to columns nested within structs yet.

datahub/sql_parsing/sql_parsing_aggregator.py
@@ -284,6 +284,7 @@ class SqlAggregatorReport(Report):

  # Queries.
  num_queries_entities_generated: int = 0
+ num_queries_used_in_lineage: Optional[int] = None
  num_queries_skipped_due_to_filters: int = 0

  # Usage-related.
@@ -681,10 +682,10 @@ class SqlParsingAggregator(Closeable):
  query_id = self._known_lineage_query_id()

  # Generate CLL if schema of downstream is known
- column_lineage: List[
- ColumnLineageInfo
- ] = self._generate_identity_column_lineage(
- upstream_urn=upstream_urn, downstream_urn=downstream_urn
+ column_lineage: List[ColumnLineageInfo] = (
+ self._generate_identity_column_lineage(
+ upstream_urn=upstream_urn, downstream_urn=downstream_urn
+ )
  )

  # Register the query.
@@ -1043,9 +1044,9 @@ class SqlParsingAggregator(Closeable):
  temp_table_schemas: Dict[str, Optional[List[models.SchemaFieldClass]]] = {}
  for temp_table_urn, query_ids in self._temp_lineage_map[session_id].items():
  for query_id in query_ids:
- temp_table_schemas[
- temp_table_urn
- ] = self._inferred_temp_schemas.get(query_id)
+ temp_table_schemas[temp_table_urn] = (
+ self._inferred_temp_schemas.get(query_id)
+ )
  if temp_table_schemas:
  break

@@ -1072,9 +1073,9 @@ class SqlParsingAggregator(Closeable):
  schema_resolver=self._schema_resolver,
  )
  if parsed.debug_info.error:
- self.report.views_parse_failures[
- view_urn
- ] = f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+ self.report.views_parse_failures[view_urn] = (
+ f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+ )
  if parsed.debug_info.table_error:
  self.report.num_views_failed += 1
  return # we can't do anything with this query
@@ -1200,6 +1201,7 @@ class SqlParsingAggregator(Closeable):
  queries_generated: Set[QueryId] = set()

  yield from self._gen_lineage_mcps(queries_generated)
+ self.report.num_queries_used_in_lineage = len(queries_generated)
  yield from self._gen_usage_statistics_mcps()
  yield from self._gen_operation_mcps(queries_generated)
  yield from self._gen_remaining_queries(queries_generated)
@@ -1581,9 +1583,9 @@ class SqlParsingAggregator(Closeable):
  temp_query_lineage_info
  )
  else:
- temp_upstream_queries[
- upstream
- ] = temp_query_lineage_info
+ temp_upstream_queries[upstream] = (
+ temp_query_lineage_info
+ )

  # Compute merged upstreams.
  new_upstreams = OrderedSet[UrnStr]()
@@ -1663,9 +1665,9 @@ class SqlParsingAggregator(Closeable):
  composed_of_queries_truncated: LossyList[str] = LossyList()
  for query_id in composed_of_queries:
  composed_of_queries_truncated.append(query_id)
- self.report.queries_with_temp_upstreams[
- composite_query_id
- ] = composed_of_queries_truncated
+ self.report.queries_with_temp_upstreams[composite_query_id] = (
+ composed_of_queries_truncated
+ )

  merged_query_text = ";\n\n".join(
  [q.formatted_query_string for q in ordered_queries]

datahub/sql_parsing/sqlglot_lineage.py
@@ -442,9 +442,9 @@ def _create_table_ddl_cll(
  ) -> List[_ColumnLineageInfo]:
  column_lineage: List[_ColumnLineageInfo] = []

- assert (
- output_table is not None
- ), "output_table must be set for create DDL statements"
+ assert output_table is not None, (
+ "output_table must be set for create DDL statements"
+ )

  create_schema: sqlglot.exp.Schema = statement.this
  sqlglot_columns = create_schema.expressions

datahub/sql_parsing/sqlglot_utils.py
@@ -404,7 +404,7 @@ def detach_ctes(
  if new_statement == statement:
  if iteration > 1:
  logger.debug(
- f"Required {iteration+1} iterations to detach and eliminate all CTEs"
+ f"Required {iteration + 1} iterations to detach and eliminate all CTEs"
  )
  break
  statement = new_statement

datahub/telemetry/stats.py
@@ -5,8 +5,7 @@ from typing_extensions import Protocol


  class SupportsLT(Protocol):
- def __lt__(self, __other: Any) -> Any:
- ...
+ def __lt__(self, __other: Any) -> Any: ...


  _SupportsComparisonT = TypeVar("_SupportsComparisonT", bound=SupportsLT)

datahub/testing/mcp_diff.py
@@ -246,7 +246,7 @@ class MCPDiff:
  for urn in self.aspect_changes.keys() - self.urns_added - self.urns_removed:
  aspect_map = self.aspect_changes[urn]
  s.append(f"Urn changed, {urn}:")
- for aspect_name, aspect_diffs in aspect_map.items():
+ for aspect_diffs in aspect_map.values():
  for i, ga in aspect_diffs.aspects_added.items():
  s.append(self.report_aspect(ga, i, "added"))
  if verbose:

datahub/utilities/file_backed_collections.py
@@ -224,9 +224,9 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
  _use_sqlite_on_conflict: bool = field(repr=False, default=True)

  def __post_init__(self) -> None:
- assert (
- self.cache_eviction_batch_size > 0
- ), "cache_eviction_batch_size must be positive"
+ assert self.cache_eviction_batch_size > 0, (
+ "cache_eviction_batch_size must be positive"
+ )

  for reserved_column in ("key", "value", "rowid"):
  if reserved_column in self.extra_columns:
@@ -261,7 +261,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
  rowid INTEGER PRIMARY KEY AUTOINCREMENT,
  key TEXT UNIQUE,
  value BLOB
- {''.join(f', {column_name} BLOB' for column_name in self.extra_columns.keys())}
+ {"".join(f", {column_name} BLOB" for column_name in self.extra_columns.keys())}
  )"""
  )

@@ -316,12 +316,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
  f"""INSERT INTO {self.tablename} (
  key,
  value
- {''.join(f', {column_name}' for column_name in self.extra_columns.keys())}
+ {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
  )
- VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})
+ VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
  ON CONFLICT (key) DO UPDATE SET
  value = excluded.value
- {''.join(f', {column_name} = excluded.{column_name}' for column_name in self.extra_columns.keys())}
+ {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns.keys())}
  """,
  items_to_write,
  )
@@ -332,16 +332,16 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
  f"""INSERT INTO {self.tablename} (
  key,
  value
- {''.join(f', {column_name}' for column_name in self.extra_columns.keys())}
+ {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
  )
- VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})""",
+ VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
  item,
  )
  except sqlite3.IntegrityError:
  self._conn.execute(
  f"""UPDATE {self.tablename} SET
  value = ?
- {''.join(f', {column_name} = ?' for column_name in self.extra_columns.keys())}
+ {"".join(f", {column_name} = ?" for column_name in self.extra_columns.keys())}
  WHERE key = ?""",
  (*item[1:], item[0]),
  )

datahub/utilities/hive_schema_to_avro.py
@@ -142,10 +142,10 @@ class HiveColumnToAvroConverter:
  fields.append({"name": field_name, "type": field_type})

  if kwargs.get("ustruct_seqn") is not None:
- struct_name = f'__structn_{kwargs["ustruct_seqn"]}_{str(uuid.uuid4()).replace("-", "")}'
+ struct_name = f"__structn_{kwargs['ustruct_seqn']}_{str(uuid.uuid4()).replace('-', '')}"

  else:
- struct_name = f'__struct_{str(uuid.uuid4()).replace("-", "")}'
+ struct_name = f"__struct_{str(uuid.uuid4()).replace('-', '')}"
  return {
  "type": "record",
  "name": struct_name,

datahub/utilities/logging_manager.py
@@ -130,9 +130,9 @@ class _ColorLogFormatter(logging.Formatter):
  # Mimic our default format, but with color.
  message_fg = self.MESSAGE_COLORS.get(record.levelname)
  return (
- f'{click.style(f"[{self.formatTime(record, self.datefmt)}]", fg="green", dim=True)} '
+ f"{click.style(f'[{self.formatTime(record, self.datefmt)}]', fg='green', dim=True)} "
  f"{click.style(f'{record.levelname:8}', fg=message_fg)} "
- f'{click.style(f"{{{record.name}:{record.lineno}}}", fg="blue", dim=True)} - '
+ f"{click.style(f'{{{record.name}:{record.lineno}}}', fg='blue', dim=True)} - "
  f"{click.style(record.getMessage(), fg=message_fg)}"
  )