acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic; see the registry listing for details.
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +4 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/datahub_gc.py +1 -0
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +0 -1
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +245 -101
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/usage/usage_common.py
CHANGED

@@ -89,7 +89,7 @@ def make_usage_workunit(
     top_sql_queries: Optional[List[str]] = None
     if query_freq is not None:
         if top_n_queries < len(query_freq):
-            logger.warn(
+            logger.warning(
                 f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
             )
            query_freq = query_freq[0:top_n_queries]
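The only change here swaps logger.warn for logger.warning. This is a safe mechanical fix: warn() has been a deprecated alias of warning() in the standard library's logging module since Python 3.3, and calling it emits a DeprecationWarning. A standalone illustration (not from the diff):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    logger.warning("preferred spelling")  # supported API
    logger.warn("deprecated spelling")    # alias; emits DeprecationWarning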
datahub/ingestion/transformer/add_dataset_dataproduct.py
CHANGED

@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
                 ).add_asset(container_urn)
                 data_products_container[data_product_urn] = container_product
             else:
-                data_products_container[
-                    data_product_urn
-                ] = data_products_container[data_product_urn].add_asset(
-                    container_urn
+                data_products_container[data_product_urn] = (
+                    data_products_container[data_product_urn].add_asset(
+                        container_urn
+                    )
                 )
 
         mcps: List[
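Most hunks in this release follow the same mechanical pattern as the one above: instead of splitting a subscript or an assert across lines, the right-hand side (or the assert message) is wrapped in parentheses. This matches black's 2024 stable style; the diff itself does not name the formatter, so treat that attribution as an inference. A self-contained before/after sketch with hypothetical names:

    def compute(x: int) -> int:  # stand-in for a call long enough to force wrapping
        return x * 2

    results = {}
    some_long_key_name = "example"

    # Old wrapping: the subscript on the left-hand side was split.
    results[
        some_long_key_name
    ] = compute(21)

    # New wrapping: the subscript stays intact and the right-hand side is
    # parenthesized instead. Both forms are semantically identical.
    results[some_long_key_name] = (
        compute(21)
    )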
datahub/ingestion/transformer/add_dataset_properties.py
CHANGED

@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
     ) -> Optional[DatasetPropertiesClass]:
         assert dataset_properties_aspect
 
-        server_dataset_properties_aspect: Optional[
-            DatasetPropertiesClass
-        ] = graph.get_dataset_properties(entity_urn)
+        server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+            graph.get_dataset_properties(entity_urn)
+        )
         # No need to take any action if server properties is None or there is not customProperties in server properties
         if (
             server_dataset_properties_aspect is None
datahub/ingestion/transformer/add_dataset_schema_tags.py
CHANGED

@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
         server_field_map: dict = {}
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect
datahub/ingestion/transformer/add_dataset_schema_terms.py
CHANGED

@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
         ] = {}  # Map to cache server field objects, where fieldPath is key
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect
datahub/ingestion/transformer/dataset_domain_based_on_tags.py
CHANGED

@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
             domain_aspect.domains.extend(mapped_domains.domains)
             if self.config.semantics == TransformerSemantics.PATCH:
                 # Try merging with server-side domains
-                patch_domain_aspect: Optional[
-                    DomainsClass
-                ] = AddDatasetDomain._merge_with_server_domains(
-                    self.ctx.graph, entity_urn, domain_aspect
+                patch_domain_aspect: Optional[DomainsClass] = (
+                    AddDatasetDomain._merge_with_server_domains(
+                        self.ctx.graph, entity_urn, domain_aspect
+                    )
                 )
                 return cast(Optional[Aspect], patch_domain_aspect)
             return cast(Optional[Aspect], domain_aspect)
datahub/ingestion/transformer/extract_ownership_from_tags.py
CHANGED

@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
             else:
                 owner_type = get_owner_type(self.config.owner_type)
                 if owner_type == OwnershipTypeClass.CUSTOM:
-                    assert (
-                        self.config.owner_type_urn is not None
-                    ), "owner_type_urn must be set if owner_type is CUSTOM"
+                    assert self.config.owner_type_urn is not None, (
+                        "owner_type_urn must be set if owner_type is CUSTOM"
+                    )
 
                 owners.append(
                     OwnerClass(
datahub/ingestion/transformer/tags_to_terms.py
CHANGED

@@ -92,9 +92,9 @@ class TagsToTermMapper(TagsToTermTransformer):
         in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags(
             entity_urn
         )
-        in_schema_metadata_aspect: Optional[
-            SchemaMetadataClass
-        ] = self.ctx.graph.get_schema_metadata(entity_urn)
+        in_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+            self.ctx.graph.get_schema_metadata(entity_urn)
+        )
 
         if in_global_tags_aspect is None and in_schema_metadata_aspect is None:
             return cast(Aspect, in_glossary_terms)
@@ -134,10 +134,10 @@ class TagsToTermMapper(TagsToTermTransformer):
         )
 
         if self.config.semantics == TransformerSemantics.PATCH:
-            patch_glossary_terms: Optional[
-                GlossaryTermsClass
-            ] = TagsToTermMapper._merge_with_server_glossary_terms(
-                self.ctx.graph, entity_urn, out_glossary_terms
+            patch_glossary_terms: Optional[GlossaryTermsClass] = (
+                TagsToTermMapper._merge_with_server_glossary_terms(
+                    self.ctx.graph, entity_urn, out_glossary_terms
+                )
             )
             return cast(Optional[Aspect], patch_glossary_terms)
         else:
datahub/integrations/assertion/snowflake/compiler.py
CHANGED

@@ -61,17 +61,17 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
     def create(
         cls, output_dir: str, extras: Dict[str, str]
     ) -> "SnowflakeAssertionCompiler":
-        assert os.path.exists(
-            output_dir
-        ), f"Specified location {output_dir} does not exist."
+        assert os.path.exists(output_dir), (
+            f"Specified location {output_dir} does not exist."
+        )
 
-        assert os.path.isdir(
-            output_dir
-        ), f"Specified location {output_dir} is not a folder."
+        assert os.path.isdir(output_dir), (
+            f"Specified location {output_dir} is not a folder."
+        )
 
-        assert any(
-            x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras
-        ), "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        assert any(x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras), (
+            "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        )
 
         return SnowflakeAssertionCompiler(output_dir, extras)
 
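The rewritten asserts keep the whole condition on the assert line and parenthesize only the message. One reason this style is careful about parenthesis placement: wrapping the condition and the message together in a single pair of parentheses creates a two-element tuple, which is always truthy, so the assertion can never fail. A standalone illustration:

    value = None

    # New style: only the message is parenthesized; the assert works as intended.
    try:
        assert value is not None, (
            "value must be set"
        )
    except AssertionError as e:
        print(e)  # -> value must be set

    # Pitfall avoided: this asserts a non-empty tuple and silently always passes
    # (recent Python versions emit a SyntaxWarning here).
    assert (value is not None, "this message never fires")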
@@ -232,6 +232,6 @@ def get_dmf_schedule(trigger: AssertionTrigger) -> str:
     elif isinstance(trigger.trigger, CronTrigger):
         return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}"
     elif isinstance(trigger.trigger, IntervalTrigger):
-        return f"{trigger.trigger.interval.seconds/60} MIN"
+        return f"{trigger.trigger.interval.seconds / 60} MIN"
     else:
         raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}")
datahub/lite/duckdb_lite.py
CHANGED
@@ -163,9 +163,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
 
         if "properties" not in writeable_dict["systemMetadata"]:
             writeable_dict["systemMetadata"]["properties"] = {}
-        writeable_dict["systemMetadata"]["properties"][
-            "sysVersion"
-        ] = new_version
+        writeable_dict["systemMetadata"]["properties"]["sysVersion"] = (
+            new_version
+        )
         if needs_write:
             self.duckdb_client.execute(
                 query="INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?, ?, ?)",
@@ -208,9 +208,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 "lastObserved": writeable.systemMetadata.lastObserved
             }
         else:
-            system_metadata[
-                "lastObserved"
-            ] = writeable.systemMetadata.lastObserved
+            system_metadata["lastObserved"] = (
+                writeable.systemMetadata.lastObserved
+            )
         self.duckdb_client.execute(
             query="UPDATE metadata_aspect_v2 SET system_metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0",
             parameters=[
@@ -497,9 +497,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             aspect_name = r[1]
             aspect_payload = json.loads(r[2])
             if typed:
-                assert (
-                    aspect_name in ASPECT_MAP
-                ), f"Missing aspect name {aspect_name} in the registry"
+                assert aspect_name in ASPECT_MAP, (
+                    f"Missing aspect name {aspect_name} in the registry"
+                )
                 try:
                     aspect_payload = ASPECT_MAP[aspect_name].from_obj(
                         post_json_transform(aspect_payload)
@@ -531,7 +531,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         for r in results.fetchall():
             urn = r[0]
             aspect_name = r[1]
-            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2])))  # type: ignore
+            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(
+                post_json_transform(json.loads(r[2]))
+            )  # type: ignore
             system_metadata = SystemMetadataClass.from_obj(json.loads(r[3]))
             mcp = MetadataChangeProposalWrapper(
                 entityUrn=urn,
datahub/metadata/_schema_classes.py
CHANGED

@@ -9096,7 +9096,7 @@ class DataProcessInstanceInputClass(_Aspect):
 
     @property
     def inputs(self) -> List[str]:
-        """Input datasets to be consumed"""
+        """Input assets consumed"""
         return self._inner_dict.get('inputs')  # type: ignore
 
     @inputs.setter
datahub/metadata/schema.avsc
CHANGED
@@ -12699,8 +12699,10 @@
       "Relationship": {
         "/*": {
           "entityTypes": [
-            "dataset"
+            "dataset",
+            "mlModel"
           ],
+          "isLineage": true,
           "name": "Consumes"
         }
       },
@@ -12720,7 +12722,7 @@
         "items": "string"
       },
       "name": "inputs",
-      "doc": "Input datasets to be consumed"
+      "doc": "Input assets consumed"
     }
   ],
   "doc": "Information about the inputs datasets of a Data process"
@@ -12883,6 +12885,8 @@
             "dataset",
             "mlModel"
           ],
+          "isLineage": true,
+          "isUpstream": false,
           "name": "Produces"
         }
       },
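Taken together, the schema.avsc hunks widen the Consumes and Produces relationships from datasets alone to mlModel entities as well, and mark both edges as lineage. Assembled from the hunks above, the Produces annotation now reads as follows (the isUpstream: false flag plausibly indicates the edge points downstream, from the process instance to the produced asset; that interpretation is ours, not stated in the diff):

    "Relationship": {
      "/*": {
        "entityTypes": [
          "dataset",
          "mlModel"
        ],
        "isLineage": true,
        "isUpstream": false,
        "name": "Produces"
      }
    }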
datahub/metadata/schemas/DataProcessInstanceInput.avsc
CHANGED

@@ -10,8 +10,10 @@
       "Relationship": {
         "/*": {
           "entityTypes": [
-            "dataset"
+            "dataset",
+            "mlModel"
           ],
+          "isLineage": true,
           "name": "Consumes"
         }
       },
@@ -29,7 +31,7 @@
         "items": "string"
       },
       "name": "inputs",
-      "doc": "Input datasets to be consumed",
+      "doc": "Input assets consumed",
       "Urn": "Urn",
       "urn_is_array": true
     }
datahub/secret/datahub_secrets_client.py
CHANGED

@@ -11,34 +11,25 @@ class DataHubSecretsClient:
     def __init__(self, graph: DataHubGraph):
         self.graph = graph
 
+    def _cleanup_secret_name(self, secret_names: List[str]) -> List[str]:
+        """Remove empty strings from the list of secret names."""
+        return [secret_name for secret_name in secret_names if secret_name]
+
     def get_secret_values(self, secret_names: List[str]) -> Dict[str, Optional[str]]:
         if len(secret_names) == 0:
             return {}
 
-        request_json = {
-            "query": """query getSecretValues($input: GetSecretValuesInput!) {
-                getSecretValues(input: $input) {
-                    name
-                    value
-                }
+        res_data = self.graph.execute_graphql(
+            query="""query getSecretValues($input: GetSecretValuesInput!) {
+                getSecretValues(input: $input) {
+                    name
+                    value
+                }
             }""",
-            "variables": {"input": {"secrets": secret_names}},
-        }
-        # TODO: Use graph.execute_graphql() instead.
-
-        # Fetch secrets using GraphQL API f
-        response = self.graph._session.post(
-            f"{self.graph.config.server}/api/graphql", json=request_json
+            variables={"input": {"secrets": self._cleanup_secret_name(secret_names)}},
         )
-        response.raise_for_status()
-
-        # Verify response
-        res_data = response.json()
-        if "errors" in res_data:
-            raise Exception("Failed to retrieve secrets from DataHub.")
-
         # Convert list of name, value secret pairs into a dict and return
-        secret_value_list = res_data["data"]["getSecretValues"]
+        secret_value_list = res_data["getSecretValues"]
         secret_value_dict = dict()
         for secret_value in secret_value_list:
             secret_value_dict[secret_value["name"]] = secret_value["value"]
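The rewrite above drops the hand-rolled POST to /api/graphql in favor of the client's execute_graphql helper, which (in this client) handles error checking and unwraps the top-level data envelope, hence res_data["getSecretValues"] where the old code read res_data["data"]["getSecretValues"]. A minimal usage sketch, assuming a reachable DataHub instance (the server URL and secret name are placeholders):

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
    res = graph.execute_graphql(
        query="""query getSecretValues($input: GetSecretValuesInput!) {
            getSecretValues(input: $input) { name value }
        }""",
        variables={"input": {"secrets": ["MY_SECRET"]}},  # placeholder secret name
    )
    print(res["getSecretValues"])  # list of {name, value} pairs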
datahub/secret/secret_common.py
CHANGED
@@ -2,10 +2,7 @@ import json
 import logging
 from typing import List
 
-from datahub.configuration.config_loader import (
-    list_referenced_env_variables,
-    resolve_env_variables,
-)
+from datahub.configuration.config_loader import EnvResolver
 from datahub.secret.secret_store import SecretStore
 
 logger = logging.getLogger(__name__)
@@ -42,18 +39,27 @@ def resolve_secrets(secret_names: List[str], secret_stores: List[SecretStore]) -
     return final_secret_values
 
 
-def resolve_recipe(recipe: str, secret_stores: List[SecretStore]) -> dict:
+def resolve_recipe(
+    recipe: str, secret_stores: List[SecretStore], strict_env_syntax: bool = True
+) -> dict:
+    # Note: the default for `strict_env_syntax` is normally False, but here we override
+    # it to be true. Particularly when fetching secrets from external secret stores, we
+    # want to be more careful about not over-fetching secrets.
+
     json_recipe_raw = json.loads(recipe)
 
     # 1. Extract all secrets needing resolved.
-    secrets_to_resolve = list_referenced_env_variables(json_recipe_raw)
+    secrets_to_resolve = EnvResolver.list_referenced_variables(
+        json_recipe_raw, strict_env_syntax=strict_env_syntax
+    )
 
     # 2. Resolve secret values
     secret_values_dict = resolve_secrets(list(secrets_to_resolve), secret_stores)
 
     # 3. Substitute secrets into recipe file
-    json_recipe_resolved = resolve_env_variables(
-        json_recipe_raw, environ=secret_values_dict
+    resolver = EnvResolver(
+        environ=secret_values_dict, strict_env_syntax=strict_env_syntax
     )
+    json_recipe_resolved = resolver.resolve(json_recipe_raw)
 
     return json_recipe_resolved
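resolve_recipe now routes everything through the EnvResolver API from config_loader: enumerate the variable references in the recipe, fetch only those names from the secret stores, then substitute. A condensed sketch of the same three steps, with a made-up recipe and dummy secret values (the EnvResolver calls mirror the ones in the hunk above; the reading that strict mode limits matching to explicit ${VAR} references is our inference from the flag name and the diff's comment):

    import json

    from datahub.configuration.config_loader import EnvResolver

    recipe = json.loads('{"sink": {"config": {"token": "${DATAHUB_TOKEN}"}}}')

    # 1. Find referenced variables without resolving them.
    referenced = EnvResolver.list_referenced_variables(recipe, strict_env_syntax=True)

    # 2. Stand-in for fetching exactly these names from a secret store.
    secret_values = {name: "dummy-value" for name in referenced}

    # 3. Substitute, drawing only from the fetched values rather than os.environ.
    resolver = EnvResolver(environ=secret_values, strict_env_syntax=True)
    resolved = resolver.resolve(recipe)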
datahub/specific/aspect_helpers/custom_properties.py
CHANGED

@@ -9,8 +9,7 @@ from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
 class HasCustomPropertiesPatch(MetadataPatchProposal):
     @classmethod
     @abstractmethod
-    def _custom_properties_location(self) -> Tuple[str, PatchPath]:
-        ...
+    def _custom_properties_location(self) -> Tuple[str, PatchPath]: ...
 
     def add_custom_property(self, key: str, value: str) -> Self:
         """Add a custom property to the entity.
datahub/sql_parsing/schema_resolver.py
CHANGED

@@ -33,14 +33,11 @@ class GraphQLSchemaMetadata(TypedDict):
 
 class SchemaResolverInterface(Protocol):
     @property
-    def platform(self) -> str:
-        ...
+    def platform(self) -> str: ...
 
-    def includes_temp_tables(self) -> bool:
-        ...
+    def includes_temp_tables(self) -> bool: ...
 
-    def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]:
-        ...
+    def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: ...
 
     def __hash__(self) -> int:
         # Mainly to make lru_cache happy in methods that accept a schema resolver.
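The ellipsis bodies collapsed onto the def line are purely cosmetic: these are stub declarations on a typing.Protocol, which specifies a structural interface and carries no behavior. A standalone refresher on the pattern (names are hypothetical, for illustration only):

    from typing import Protocol

    class SupportsClose(Protocol):
        def close(self) -> None: ...  # declaration only; no implementation

    class File:
        def close(self) -> None:
            print("closed")

    def shutdown(resource: SupportsClose) -> None:
        resource.close()

    shutdown(File())  # File matches structurally; no inheritance required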
@@ -232,8 +229,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
         return {
             get_simple_field_path_from_v2_field_path(field["fieldPath"]): (
                 # The actual types are more of a "nice to have".
-                field["nativeDataType"]
-                or "str"
+                field["nativeDataType"] or "str"
             )
             for field in schema["fields"]
             # TODO: We can't generate lineage to columns nested within structs yet.
@@ -289,8 +285,7 @@ def _convert_schema_field_list_to_info(
     return {
         get_simple_field_path_from_v2_field_path(col.fieldPath): (
             # The actual types are more of a "nice to have".
-            col.nativeDataType
-            or "str"
+            col.nativeDataType or "str"
         )
         for col in schema_fields
         # TODO: We can't generate lineage to columns nested within structs yet.
datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED

@@ -284,6 +284,7 @@ class SqlAggregatorReport(Report):
 
     # Queries.
     num_queries_entities_generated: int = 0
+    num_queries_used_in_lineage: Optional[int] = None
     num_queries_skipped_due_to_filters: int = 0
 
     # Usage-related.
@@ -681,10 +682,10 @@ class SqlParsingAggregator(Closeable):
         query_id = self._known_lineage_query_id()
 
         # Generate CLL if schema of downstream is known
-        column_lineage: List[
-            ColumnLineageInfo
-        ] = self._generate_identity_column_lineage(
-            upstream_urn=upstream_urn, downstream_urn=downstream_urn
+        column_lineage: List[ColumnLineageInfo] = (
+            self._generate_identity_column_lineage(
+                upstream_urn=upstream_urn, downstream_urn=downstream_urn
+            )
         )
 
         # Register the query.
@@ -1043,9 +1044,9 @@ class SqlParsingAggregator(Closeable):
         temp_table_schemas: Dict[str, Optional[List[models.SchemaFieldClass]]] = {}
         for temp_table_urn, query_ids in self._temp_lineage_map[session_id].items():
             for query_id in query_ids:
-                temp_table_schemas[
-                    temp_table_urn
-                ] = self._inferred_temp_schemas.get(query_id)
+                temp_table_schemas[temp_table_urn] = (
+                    self._inferred_temp_schemas.get(query_id)
+                )
             if temp_table_schemas:
                 break
 
@@ -1072,9 +1073,9 @@ class SqlParsingAggregator(Closeable):
             schema_resolver=self._schema_resolver,
         )
         if parsed.debug_info.error:
-            self.report.views_parse_failures[
-                view_urn
-            ] = f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+            self.report.views_parse_failures[view_urn] = (
+                f"{parsed.debug_info.error} on query: {view_definition.view_definition[:100]}"
+            )
         if parsed.debug_info.table_error:
             self.report.num_views_failed += 1
             return  # we can't do anything with this query
@@ -1200,6 +1201,7 @@ class SqlParsingAggregator(Closeable):
         queries_generated: Set[QueryId] = set()
 
         yield from self._gen_lineage_mcps(queries_generated)
+        self.report.num_queries_used_in_lineage = len(queries_generated)
         yield from self._gen_usage_statistics_mcps()
         yield from self._gen_operation_mcps(queries_generated)
         yield from self._gen_remaining_queries(queries_generated)
@@ -1581,9 +1583,9 @@ class SqlParsingAggregator(Closeable):
                     temp_query_lineage_info
                 )
             else:
-                temp_upstream_queries[
-                    upstream
-                ] = temp_query_lineage_info
+                temp_upstream_queries[upstream] = (
+                    temp_query_lineage_info
+                )
 
         # Compute merged upstreams.
         new_upstreams = OrderedSet[UrnStr]()
@@ -1663,9 +1665,9 @@ class SqlParsingAggregator(Closeable):
         composed_of_queries_truncated: LossyList[str] = LossyList()
         for query_id in composed_of_queries:
             composed_of_queries_truncated.append(query_id)
-        self.report.queries_with_temp_upstreams[
-            composite_query_id
-        ] = composed_of_queries_truncated
+        self.report.queries_with_temp_upstreams[composite_query_id] = (
+            composed_of_queries_truncated
+        )
 
         merged_query_text = ";\n\n".join(
             [q.formatted_query_string for q in ordered_queries]
datahub/sql_parsing/sqlglot_lineage.py
CHANGED

@@ -442,9 +442,9 @@ def _create_table_ddl_cll(
 ) -> List[_ColumnLineageInfo]:
     column_lineage: List[_ColumnLineageInfo] = []
 
-    assert (
-        output_table is not None
-    ), "output_table must be set for create DDL statements"
+    assert output_table is not None, (
+        "output_table must be set for create DDL statements"
+    )
 
     create_schema: sqlglot.exp.Schema = statement.this
     sqlglot_columns = create_schema.expressions
datahub/sql_parsing/sqlglot_utils.py
CHANGED

@@ -404,7 +404,7 @@ def detach_ctes(
         if new_statement == statement:
             if iteration > 1:
                 logger.debug(
-                    f"Required {iteration+1} iterations to detach and eliminate all CTEs"
+                    f"Required {iteration + 1} iterations to detach and eliminate all CTEs"
                 )
             break
         statement = new_statement
datahub/telemetry/stats.py
CHANGED
datahub/testing/mcp_diff.py
CHANGED
@@ -246,7 +246,7 @@ class MCPDiff:
         for urn in self.aspect_changes.keys() - self.urns_added - self.urns_removed:
             aspect_map = self.aspect_changes[urn]
             s.append(f"Urn changed, {urn}:")
-            for aspect_name, aspect_diffs in aspect_map.items():
+            for aspect_diffs in aspect_map.values():
                 for i, ga in aspect_diffs.aspects_added.items():
                     s.append(self.report_aspect(ga, i, "added"))
                 if verbose:
datahub/utilities/file_backed_collections.py
CHANGED

@@ -224,9 +224,9 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
     _use_sqlite_on_conflict: bool = field(repr=False, default=True)
 
     def __post_init__(self) -> None:
-        assert (
-            self.cache_eviction_batch_size > 0
-        ), "cache_eviction_batch_size must be positive"
+        assert self.cache_eviction_batch_size > 0, (
+            "cache_eviction_batch_size must be positive"
+        )
 
         for reserved_column in ("key", "value", "rowid"):
             if reserved_column in self.extra_columns:
@@ -261,7 +261,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                 key TEXT UNIQUE,
                 value BLOB
-                {
+                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns.keys())}
             )"""
         )
 
@@ -316,12 +316,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 f"""INSERT INTO {self.tablename} (
                     key,
                     value
-                    {
+                    {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
                 )
-                VALUES ({
+                VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
                 ON CONFLICT (key) DO UPDATE SET
                     value = excluded.value
-                    {
+                    {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns.keys())}
                 """,
                 items_to_write,
             )
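For reference, the f-string template above renders to an ordinary SQLite upsert. With a table named data and a single hypothetical extra column last_modified, it produces roughly:

    INSERT INTO data (
        key,
        value
        , last_modified
    )
    VALUES (?, ?, ?)
    ON CONFLICT (key) DO UPDATE SET
        value = excluded.value
        , last_modified = excluded.last_modified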
@@ -332,16 +332,16 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                     f"""INSERT INTO {self.tablename} (
                         key,
                         value
-                        {
+                        {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
                     )
-                    VALUES ({
+                    VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
                     item,
                 )
             except sqlite3.IntegrityError:
                 self._conn.execute(
                     f"""UPDATE {self.tablename} SET
                         value = ?
-                        {
+                        {"".join(f", {column_name} = ?" for column_name in self.extra_columns.keys())}
                     WHERE key = ?""",
                     (*item[1:], item[0]),
                 )
datahub/utilities/hive_schema_to_avro.py
CHANGED

@@ -142,10 +142,10 @@ class HiveColumnToAvroConverter:
             fields.append({"name": field_name, "type": field_type})
 
         if kwargs.get("ustruct_seqn") is not None:
-            struct_name = f
+            struct_name = f"__structn_{kwargs['ustruct_seqn']}_{str(uuid.uuid4()).replace('-', '')}"
 
         else:
-            struct_name = f
+            struct_name = f"__struct_{str(uuid.uuid4()).replace('-', '')}"
         return {
             "type": "record",
             "name": struct_name,
datahub/utilities/logging_manager.py
CHANGED

@@ -130,9 +130,9 @@ class _ColorLogFormatter(logging.Formatter):
         # Mimic our default format, but with color.
         message_fg = self.MESSAGE_COLORS.get(record.levelname)
         return (
-            f
+            f"{click.style(f'[{self.formatTime(record, self.datefmt)}]', fg='green', dim=True)} "
             f"{click.style(f'{record.levelname:8}', fg=message_fg)} "
-            f
+            f"{click.style(f'{{{record.name}:{record.lineno}}}', fg='blue', dim=True)} - "
             f"{click.style(record.getMessage(), fg=message_fg)}"
         )
 