acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2335 -2337
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +157 -157
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
@@ -92,9 +92,9 @@ class PipelineConfig(ConfigModel):
  pipeline_name: Optional[str] = None
  failure_log: FailureLoggingConfig = FailureLoggingConfig()

- _raw_dict: Optional[
- dict
-
+ _raw_dict: Optional[dict] = (
+ None  # the raw dict that was parsed to construct this config
+ )

  @validator("run_id", pre=True, always=True)
  def run_id_should_be_semantic(

@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
  if field_level_metric.startswith("include_field_"):
  values.setdefault(field_level_metric, False)

- assert (
-
- )
+ assert max_num_fields_to_profile is None, (
+ f"{max_num_fields_to_profile_key} should be set to None"
+ )

  return values

@@ -508,7 +508,12 @@ class ABSSource(StatefulIngestionSourceBase):
  ):
  abs_path = self.create_abs_path(obj.name)
  logger.debug(f"Sampling file: {abs_path}")
- yield
+ yield (
+ abs_path,
+ obj.name,
+ obj.last_modified,
+ obj.size,
+ )
  except Exception as e:
  # This odd check if being done because boto does not have a proper exception to catch
  # The exception that appears in stacktrace cannot actually be caught without a lot more work

@@ -552,9 +557,12 @@ class ABSSource(StatefulIngestionSourceBase):
  if os.path.isfile(prefix):
  logger.debug(f"Scanning single local file: {prefix}")
  file_name = prefix
- yield
-
-
+ yield (
+ prefix,
+ file_name,
+ datetime.utcfromtimestamp(os.path.getmtime(prefix)),
+ os.path.getsize(prefix),
+ )
  else:
  logger.debug(f"Scanning files under local folder: {prefix}")
  for root, dirs, files in os.walk(prefix):

@@ -565,9 +573,12 @@ class ABSSource(StatefulIngestionSourceBase):
  full_path = PurePath(
  os.path.normpath(os.path.join(root, file))
  ).as_posix()
- yield
-
-
+ yield (
+ full_path,
+ file,
+ datetime.utcfromtimestamp(os.path.getmtime(full_path)),
+ os.path.getsize(full_path),
+ )

  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
  self.container_WU_creator = ContainerWUCreator(

@@ -613,7 +624,7 @@ class ABSSource(StatefulIngestionSourceBase):
  table_data.table_path
  ].timestamp = table_data.timestamp

- for
+ for _, table_data in table_dict.items():
  yield from self.ingest_table(table_data, path_spec)

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:

@@ -521,7 +521,7 @@ class GlueSource(StatefulIngestionSourceBase):
  # otherwise, a node represents a transformation
  else:
  node_urn = mce_builder.make_data_job_urn_with_flow(
- flow_urn, job_id=f
+ flow_urn, job_id=f"{node['NodeType']}-{node['Id']}"
  )

  return {

@@ -679,7 +679,7 @@ class GlueSource(StatefulIngestionSourceBase):
  )
  )

- return MetadataWorkUnit(id=f
+ return MetadataWorkUnit(id=f"{job_name}-{node['Id']}", mce=mce)

  def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
  logger.debug("Getting all databases")

@@ -750,13 +750,13 @@ class GlueSource(StatefulIngestionSourceBase):
  ) -> Optional[MetadataWorkUnit]:
  if self.source_config.emit_s3_lineage:
  # extract dataset properties aspect
- dataset_properties: Optional[
- DatasetPropertiesClass
-
+ dataset_properties: Optional[DatasetPropertiesClass] = (
+ mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+ )
  # extract dataset schema aspect
- schema_metadata: Optional[
- SchemaMetadataClass
-
+ schema_metadata: Optional[SchemaMetadataClass] = (
+ mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+ )

  if dataset_properties and "Location" in dataset_properties.customProperties:
  location = dataset_properties.customProperties["Location"]

@@ -765,9 +765,9 @@ class GlueSource(StatefulIngestionSourceBase):
  location, self.source_config.env
  )
  assert self.ctx.graph
- schema_metadata_for_s3: Optional[
-
-
+ schema_metadata_for_s3: Optional[SchemaMetadataClass] = (
+ self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+ )

  if self.source_config.glue_s3_lineage_direction == "upstream":
  fine_grained_lineages = None

@@ -40,7 +40,7 @@ def get_s3_tags(
  ]
  )
  except s3.meta.client.exceptions.ClientError:
- logger.
+ logger.warning(f"No tags found for bucket={bucket_name}")

  if use_s3_object_tags and key_name is not None:
  s3_client = aws_config.get_s3_client()

@@ -53,7 +53,7 @@ def get_s3_tags(
  else:
  # Unlike bucket tags, if an object does not have tags, it will just return an empty array
  # as opposed to an exception.
- logger.
+ logger.warning(f"No tags found for bucket={bucket_name} key={key_name}")
  if len(tags_to_add) == 0:
  return None
  if ctx.graph is not None:

@@ -65,7 +65,7 @@ def get_s3_tags(
  if current_tags:
  tags_to_add.extend([current_tag.tag for current_tag in current_tags.tags])
  else:
- logger.
+ logger.warning("Could not connect to DatahubApi. No current tags to maintain")
  # Remove duplicate tags
  tags_to_add = sorted(list(set(tags_to_add)))
  new_tags = GlobalTagsClass(

@@ -257,7 +257,7 @@ class FeatureGroupProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

  return MetadataWorkUnit(
- id=f
+ id=f"{feature_group_details['FeatureGroupName']}-{feature['FeatureName']}",
  mce=mce,
  )

@@ -212,7 +212,7 @@ class ModelProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=endpoint_snapshot)

  return MetadataWorkUnit(
- id=f
+ id=f"{endpoint_details['EndpointName']}",
  mce=mce,
  )

@@ -503,7 +503,7 @@ class ModelProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=model_snapshot)

  return MetadataWorkUnit(
- id=f
+ id=f"{model_details['ModelName']}",
  mce=mce,
  )

@@ -132,9 +132,9 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
  self.filters = BigQueryFilter(self.config, self.report)
  self.identifiers = BigQueryIdentifierBuilder(self.config, self.report)

- redundant_lineage_run_skip_handler: Optional[
-
-
+ redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
+ None
+ )
  if self.config.enable_stateful_lineage_ingestion:
  redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
  source=self,

@@ -37,9 +37,9 @@ class BigqueryTableIdentifier:

  # Note: this regex may get overwritten by the sharded_table_pattern config.
  # The class-level constant, however, will not be overwritten.
- _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[
-
-
+ _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[str] = (
+ _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+ )
  _BIGQUERY_WILDCARD_REGEX: ClassVar[str] = "((_(\\d+)?)\\*$)|\\*$"
  _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd"

@@ -137,9 +137,9 @@ class BigQueryCredential(ConfigModel):
  @root_validator(skip_on_failure=True)
  def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
  if values.get("client_x509_cert_url") is None:
- values[
- "
-
+ values["client_x509_cert_url"] = (
+ f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+ )
  return values

  def create_credential_temp_file(self) -> str:

@@ -611,9 +611,9 @@ class BigQueryV2Config(
  cls, v: Optional[List[str]], values: Dict
  ) -> Optional[List[str]]:
  if values.get("use_exported_bigquery_audit_metadata"):
- assert (
-
- )
+ assert v and len(v) > 0, (
+ "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+ )

  return v

@@ -87,9 +87,9 @@ class BigQueryPlatformResourceHelper:
  key=platform_resource_key, graph_client=self.graph
  )
  if platform_resource:
- self.platform_resource_cache[
-
-
+ self.platform_resource_cache[platform_resource_key.primary_key] = (
+ platform_resource
+ )
  return platform_resource
  return None

@@ -115,7 +115,11 @@ class BigQueryPlatformResourceHelper:
  and platform_resource.resource_info.value
  ):
  try:
- existing_info: Optional[BigQueryLabelInfo] =
+ existing_info: Optional[BigQueryLabelInfo] = (
+ platform_resource.resource_info.value.as_pydantic_object( # type: ignore
+ BigQueryLabelInfo
+ )
+ )
  except ValidationError as e:
  logger.error(
  f"Error converting existing value to BigQueryLabelInfo: {e}. Creating new one. Maybe this is because of a non backward compatible schema change."

@@ -311,8 +311,10 @@ class BigQuerySchemaGenerator:
  platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
  label, tag_urn, managed_by_datahub=False
  )
- label_info: BigQueryLabelInfo =
-
+ label_info: BigQueryLabelInfo = (
+ platform_resource.resource_info.value.as_pydantic_object( # type: ignore
+ BigQueryLabelInfo
+ )
  )
  tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -820,8 +822,10 @@ class BigQuerySchemaGenerator:
  platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
  label, tag_urn, managed_by_datahub=False
  )
- label_info: BigQueryLabelInfo =
-
+ label_info: BigQueryLabelInfo = (
+ platform_resource.resource_info.value.as_pydantic_object( # type: ignore
+ BigQueryLabelInfo
+ )
  )
  tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -860,8 +864,10 @@ class BigQuerySchemaGenerator:
  platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
  label, tag_urn, managed_by_datahub=False
  )
- label_info: BigQueryLabelInfo =
-
+ label_info: BigQueryLabelInfo = (
+ platform_resource.resource_info.value.as_pydantic_object( # type: ignore
+ BigQueryLabelInfo
+ )
  )
  tag_urn = TagUrn.from_string(label_info.datahub_urn)

@@ -1203,9 +1209,9 @@ class BigQuerySchemaGenerator:
  report=self.report,
  )

- self.report.metadata_extraction_sec[
-
-
+ self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = (
+ timer.elapsed_seconds(digits=2)
+ )

  def get_core_table_details(
  self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str

@@ -697,7 +697,7 @@ class BigqueryLineageExtractor:
  if parsed_queries[-1]:
  query = f"""create table `{destination_table.get_sanitized_table_ref().table_identifier.get_table_name()}` AS
  (
- {parsed_queries[-1].sql(dialect=
+ {parsed_queries[-1].sql(dialect="bigquery")}
  )"""
  else:
  query = e.query

@@ -809,11 +809,11 @@ class BigqueryLineageExtractor:
  upstream_lineage, temp_table_upstream
  )

- upstreams[
-
-
-
-
+ upstreams[ref_temp_table_upstream] = (
+ _merge_lineage_edge_columns(
+ upstreams.get(ref_temp_table_upstream),
+ collapsed_lineage,
+ )
  )
  else:
  upstreams[upstream_table_ref] = _merge_lineage_edge_columns(

@@ -1004,9 +1004,9 @@ class BigqueryLineageExtractor:
  dataset_urn
  )
  for gcs_dataset_urn in gcs_urns:
- schema_metadata_for_gcs: Optional[
-
-
+ schema_metadata_for_gcs: Optional[SchemaMetadataClass] = (
+ graph.get_schema_metadata(gcs_dataset_urn)
+ )
  if schema_metadata and schema_metadata_for_gcs:
  fine_grained_lineage = self.get_fine_grained_lineages_with_gcs(
  dataset_urn,

@@ -271,9 +271,9 @@ class BigQueryQueriesExtractor(Closeable):
  # Preprocessing stage that deduplicates the queries using query hash per usage bucket
  # Note: FileBackedDict is an ordered dictionary, so the order of execution of
  # queries is inherently maintained
- queries_deduped: FileBackedDict[
-
-
+ queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] = (
+ self.deduplicate_queries(queries)
+ )
  self.report.num_unique_queries = len(queries_deduped)
  logger.info(f"Found {self.report.num_unique_queries} unique queries")

@@ -763,9 +763,9 @@ class BigQueryUsageExtractor:
  )

  if event.query_event.default_dataset:
- custom_properties[
-
-
+ custom_properties["defaultDatabase"] = (
+ event.query_event.default_dataset
+ )
  if event.read_event:
  if event.read_event.readReason:
  custom_properties["readReason"] = event.read_event.readReason

@@ -107,10 +107,10 @@ class CassandraToSchemaFieldConverter:

  @staticmethod
  def get_column_type(cassandra_column_type: str) -> SchemaFieldDataType:
- type_class: Optional[
-
-
-
+ type_class: Optional[Type] = (
+ CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
+ cassandra_column_type
+ )
  )
  if type_class is None:
  logger.warning(

@@ -293,9 +293,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
  def _load_json_schema_with_resolved_references(
  self, schema: Schema, name: str, subject: str
  ) -> dict:
- imported_json_schemas: List[
-
-
+ imported_json_schemas: List[JsonSchemaWrapper] = (
+ self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+ )
  schema_dict = json.loads(schema.schema_str)
  reference_map = {}
  for imported_schema in imported_json_schemas:

@@ -332,9 +332,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
  )

  elif schema.schema_type == "PROTOBUF":
- imported_schemas: List[
-
-
+ imported_schemas: List[ProtobufSchema] = (
+ self.get_schemas_from_confluent_ref_protobuf(schema)
+ )
  base_name: str = topic.replace(".", "_")
  fields = protobuf_util.protobuf_schema_to_mce_fields(
  ProtobufSchema(

@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
  domain: Optional[str],
  description: Optional[str],
  ) -> Iterable[MetadataWorkUnit]:
- maybe_terms_wu: Optional[
-
-
-
-
+ maybe_terms_wu: Optional[MetadataWorkUnit] = (
+ self.get_resource_glossary_terms_work_unit(
+ entity_urn=entity_urn,
+ term_associations=term_associations,
+ )
  )
  if maybe_terms_wu:
  self.report.num_glossary_term_workunits_produced += 1

@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
  self.report.num_tag_workunits_produced += 1
  yield maybe_tags_wu

- maybe_owners_wu: Optional[
-
-
-
-
+ maybe_owners_wu: Optional[MetadataWorkUnit] = (
+ self.get_resource_owners_work_unit(
+ entity_urn=entity_urn,
+ owners=owners,
+ )
  )
  if maybe_owners_wu:
  self.report.num_owners_workunits_produced += 1
  yield maybe_owners_wu

- maybe_domain_wu: Optional[
-
-
-
-
+ maybe_domain_wu: Optional[MetadataWorkUnit] = (
+ self.get_resource_domain_work_unit(
+ entity_urn=entity_urn,
+ domain=domain,
+ )
  )
  if maybe_domain_wu:
  self.report.num_domain_workunits_produced += 1
  yield maybe_domain_wu

- maybe_description_wu: Optional[
-
-
-
-
+ maybe_description_wu: Optional[MetadataWorkUnit] = (
+ self.get_resource_description_work_unit(
+ entity_urn=entity_urn,
+ description=description,
+ )
  )
  if maybe_description_wu:
  self.report.num_description_workunits_produced += 1

@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
  needs_write: bool,
  ) -> Tuple[EditableSchemaMetadataClass, bool]:
  field_path: str = sub_resource_row.field_path
- term_associations: List[
-
-
+ term_associations: List[GlossaryTermAssociationClass] = (
+ sub_resource_row.term_associations
+ )
  tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
  description: Optional[str] = sub_resource_row.description
  has_terms: bool = len(term_associations) > 0

@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
  # Boolean field to tell whether we need to write an MCPW.
  needs_write = False

- current_editable_schema_metadata: Optional[
-
-
+ current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+ None
+ )
  if self.ctx.graph and not self.should_overwrite:
  # Fetch the current editable schema metadata
  current_editable_schema_metadata = self.ctx.graph.get_aspect(

@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
  entity_urn = row["resource"]
  entity_type = Urn.from_string(row["resource"]).get_type()

- term_associations: List[
-
-
+ term_associations: List[GlossaryTermAssociationClass] = (
+ self.maybe_extract_glossary_terms(row)
+ )
  tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
  owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)

@@ -152,7 +152,9 @@ class DataHubDatabaseReader:
  ) -> Iterable[Dict[str, Any]]:
  with self.engine.connect() as conn:
  if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
- with
+ with (
+ conn.begin()
+ ): # Transaction required for PostgreSQL server-side cursor
  # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
  # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
  conn = conn.execution_options(

@@ -222,7 +224,7 @@ class DataHubDatabaseReader:
  )
  except Exception as e:
  logger.warning(
- f
+ f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
  )
  self.report.num_database_parse_errors += 1
  self.report.database_parse_errors.setdefault(

@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """

  _DBT_FIELDS_BY_TYPE = {
  "models": f"""
- {
- {
- {
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  dependsOn
  materializedType
  """,
  "seeds": f"""
- {
- {
- {
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  """,
  "sources": f"""
- {
- {
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
  identifier
  sourceName
  sourceDescription

@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
  loader
  """,
  "snapshots": f"""
- {
- {
- {
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  parentsSources {{
  uniqueId
  }}

@@ -229,7 +229,7 @@ _DBT_FIELDS_BY_TYPE = {
  }}
  """,
  "tests": f"""
- {
+ {_DBT_GRAPHQL_COMMON_FIELDS}
  state
  columnName
  status

@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
  res = response.json()
  if "errors" in res:
  raise ValueError(
- f
+ f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
  )
  data = res["data"]
  except JSONDecodeError as e: