acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/METADATA +2461 -2463
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/RECORD +161 -161
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/delete_cli.py +16 -2
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/config.py +4 -0
- datahub/ingestion/source/fivetran/fivetran.py +15 -5
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +5 -3
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/query.py +77 -47
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/top_level.txt +0 -0
```diff
@@ -519,9 +519,9 @@ class DataHubGraph(DatahubRestEmitter):
         :return: Optionally, a map of aspect_name to aspect_value as a dictionary if present, aspect_value will be set to None if that aspect was not found. Returns None on HTTP status 404.
         :raises HttpError: if the HTTP response is not a 200
         """
-        assert len(aspects) == len(
-            aspect_types
-        )
+        assert len(aspects) == len(aspect_types), (
+            f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})"
+        )
 
         # TODO: generate aspects list from type classes
         response_json = self.get_entity_raw(entity_urn, aspects)
@@ -1576,9 +1576,7 @@ class DataHubGraph(DatahubRestEmitter):
                     ... assertionResult
                 }
             }
-        """ % (
-            self._assertion_result_shared()
-        )
+        """ % (self._assertion_result_shared())
 
         variables = {
             "assertionUrn": urn,
@@ -76,8 +76,9 @@ class LoggingCallback(WriteCallback):
         failure_metadata: dict,
     ) -> None:
         logger.error(
-            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}"
-
+            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
+            extra={"failure_metadata": failure_metadata},
+            exc_info=failure_exception,
         )
 
 
```
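The `LoggingCallback` hunk above is a behavioral change rather than a pure re-wrap: the failure metadata and the original exception are now attached to the log record instead of being dropped from the message. A minimal sketch of the same stdlib-logging pattern (the function name and arguments here are illustrative, not taken from the package):

```python
import logging

logger = logging.getLogger(__name__)


def log_write_failure(
    sink_name: str,
    workunit_id: str,
    failure_exception: BaseException,
    failure_metadata: dict,
) -> None:
    # exc_info attaches the traceback of the original exception to the log record;
    # extra= makes failure_metadata available to structured logging handlers.
    logger.error(
        f"{sink_name} failed to write record with workunit {workunit_id}",
        extra={"failure_metadata": failure_metadata},
        exc_info=failure_exception,
    )
```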
```diff
@@ -108,9 +109,9 @@ class DeadLetterQueueCallback(WriteCallback):
                 mcp.systemMetadata.properties = {}
             if "workunit_id" not in mcp.systemMetadata.properties:
                 # update the workunit id
-                mcp.systemMetadata.properties[
-                    "workunit_id"
-                ] = record_envelope.metadata["workunit_id"]
+                mcp.systemMetadata.properties["workunit_id"] = (
+                    record_envelope.metadata["workunit_id"]
+                )
             record_envelope.record = mcp
         self.file_sink.write_record_async(record_envelope, self.logging_callback)
 
@@ -700,7 +701,7 @@ class Pipeline:
             num_failures_sink = len(self.sink.get_report().failures)
             click.secho(
                 message_template.format(
-                    status=f"with at least {num_failures_source+num_failures_sink} failures"
+                    status=f"with at least {num_failures_source + num_failures_sink} failures"
                 ),
                 fg=self._get_text_color(
                     running=currently_running, failures=True, warnings=False
@@ -718,7 +719,7 @@ class Pipeline:
             num_warn_global = len(global_warnings)
             click.secho(
                 message_template.format(
-                    status=f"with at least {num_warn_source+num_warn_sink+num_warn_global} warnings"
+                    status=f"with at least {num_warn_source + num_warn_sink + num_warn_global} warnings"
                 ),
                 fg=self._get_text_color(
                     running=currently_running, failures=False, warnings=True
@@ -92,9 +92,9 @@ class PipelineConfig(ConfigModel):
     pipeline_name: Optional[str] = None
     failure_log: FailureLoggingConfig = FailureLoggingConfig()
 
-    _raw_dict: Optional[
-        dict
-    ] = None  # the raw dict that was parsed to construct this config
+    _raw_dict: Optional[dict] = (
+        None  # the raw dict that was parsed to construct this config
+    )
 
     @validator("run_id", pre=True, always=True)
     def run_id_should_be_semantic(
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
             if field_level_metric.startswith("include_field_"):
                 values.setdefault(field_level_metric, False)
 
-        assert (
-            max_num_fields_to_profile is None
-        ), f"{max_num_fields_to_profile_key} should be set to None"
+        assert max_num_fields_to_profile is None, (
+            f"{max_num_fields_to_profile_key} should be set to None"
+        )
 
         return values
@@ -508,7 +508,12 @@ class ABSSource(StatefulIngestionSourceBase):
                 ):
                     abs_path = self.create_abs_path(obj.name)
                     logger.debug(f"Sampling file: {abs_path}")
-                    yield abs_path, obj.name, obj.last_modified, obj.size
+                    yield (
+                        abs_path,
+                        obj.name,
+                        obj.last_modified,
+                        obj.size,
+                    )
         except Exception as e:
             # This odd check if being done because boto does not have a proper exception to catch
             # The exception that appears in stacktrace cannot actually be caught without a lot more work
@@ -552,9 +557,12 @@ class ABSSource(StatefulIngestionSourceBase):
         if os.path.isfile(prefix):
             logger.debug(f"Scanning single local file: {prefix}")
             file_name = prefix
-            yield prefix, file_name, datetime.utcfromtimestamp(
-                os.path.getmtime(prefix)
-            ), os.path.getsize(prefix)
+            yield (
+                prefix,
+                file_name,
+                datetime.utcfromtimestamp(os.path.getmtime(prefix)),
+                os.path.getsize(prefix),
+            )
         else:
             logger.debug(f"Scanning files under local folder: {prefix}")
             for root, dirs, files in os.walk(prefix):
@@ -565,9 +573,12 @@ class ABSSource(StatefulIngestionSourceBase):
                     full_path = PurePath(
                         os.path.normpath(os.path.join(root, file))
                     ).as_posix()
-                    yield full_path, file, datetime.utcfromtimestamp(
-                        os.path.getmtime(full_path)
-                    ), os.path.getsize(full_path)
+                    yield (
+                        full_path,
+                        file,
+                        datetime.utcfromtimestamp(os.path.getmtime(full_path)),
+                        os.path.getsize(full_path),
+                    )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.container_WU_creator = ContainerWUCreator(
@@ -613,7 +624,7 @@ class ABSSource(StatefulIngestionSourceBase):
                     table_data.table_path
                 ].timestamp = table_data.timestamp
 
-        for
+        for _, table_data in table_dict.items():
             yield from self.ingest_table(table_data, path_spec)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
@@ -521,7 +521,7 @@ class GlueSource(StatefulIngestionSourceBase):
         # otherwise, a node represents a transformation
         else:
             node_urn = mce_builder.make_data_job_urn_with_flow(
-                flow_urn, job_id=f
+                flow_urn, job_id=f"{node['NodeType']}-{node['Id']}"
             )
 
         return {
@@ -679,7 +679,7 @@ class GlueSource(StatefulIngestionSourceBase):
             )
         )
 
-        return MetadataWorkUnit(id=f
+        return MetadataWorkUnit(id=f"{job_name}-{node['Id']}", mce=mce)
 
     def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
         logger.debug("Getting all databases")
@@ -750,13 +750,13 @@ class GlueSource(StatefulIngestionSourceBase):
     ) -> Optional[MetadataWorkUnit]:
         if self.source_config.emit_s3_lineage:
             # extract dataset properties aspect
-            dataset_properties: Optional[
-                DatasetPropertiesClass
-            ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+            dataset_properties: Optional[DatasetPropertiesClass] = (
+                mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+            )
             # extract dataset schema aspect
-            schema_metadata: Optional[
-                SchemaMetadataClass
-            ] = mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+            schema_metadata: Optional[SchemaMetadataClass] = (
+                mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+            )
 
             if dataset_properties and "Location" in dataset_properties.customProperties:
                 location = dataset_properties.customProperties["Location"]
@@ -765,9 +765,9 @@ class GlueSource(StatefulIngestionSourceBase):
                     location, self.source_config.env
                 )
                 assert self.ctx.graph
-                schema_metadata_for_s3: Optional[
-                    SchemaMetadataClass
-                ] = self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+                schema_metadata_for_s3: Optional[SchemaMetadataClass] = (
+                    self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+                )
 
                 if self.source_config.glue_s3_lineage_direction == "upstream":
                     fine_grained_lineages = None
@@ -40,7 +40,7 @@ def get_s3_tags(
             ]
         )
     except s3.meta.client.exceptions.ClientError:
-        logger.
+        logger.warning(f"No tags found for bucket={bucket_name}")
 
     if use_s3_object_tags and key_name is not None:
         s3_client = aws_config.get_s3_client()
@@ -53,7 +53,7 @@ def get_s3_tags(
         else:
             # Unlike bucket tags, if an object does not have tags, it will just return an empty array
             # as opposed to an exception.
-            logger.
+            logger.warning(f"No tags found for bucket={bucket_name} key={key_name}")
     if len(tags_to_add) == 0:
         return None
     if ctx.graph is not None:
@@ -65,7 +65,7 @@ def get_s3_tags(
         if current_tags:
             tags_to_add.extend([current_tag.tag for current_tag in current_tags.tags])
         else:
-            logger.
+            logger.warning("Could not connect to DatahubApi. No current tags to maintain")
     # Remove duplicate tags
     tags_to_add = sorted(list(set(tags_to_add)))
     new_tags = GlobalTagsClass(
@@ -257,7 +257,7 @@ class FeatureGroupProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)
 
         return MetadataWorkUnit(
-            id=f
+            id=f"{feature_group_details['FeatureGroupName']}-{feature['FeatureName']}",
            mce=mce,
        )
 
@@ -212,7 +212,7 @@ class ModelProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=endpoint_snapshot)
 
         return MetadataWorkUnit(
-            id=f
+            id=f"{endpoint_details['EndpointName']}",
            mce=mce,
        )
 
@@ -503,7 +503,7 @@ class ModelProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=model_snapshot)
 
         return MetadataWorkUnit(
-            id=f
+            id=f"{model_details['ModelName']}",
            mce=mce,
        )
 
@@ -132,9 +132,9 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         self.filters = BigQueryFilter(self.config, self.report)
         self.identifiers = BigQueryIdentifierBuilder(self.config, self.report)
 
-        redundant_lineage_run_skip_handler: Optional[
-            RedundantLineageRunSkipHandler
-        ] = None
+        redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
+            None
+        )
         if self.config.enable_stateful_lineage_ingestion:
             redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
                 source=self,
@@ -37,9 +37,9 @@ class BigqueryTableIdentifier:
 
     # Note: this regex may get overwritten by the sharded_table_pattern config.
    # The class-level constant, however, will not be overwritten.
-    _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[
-        str
-    ] = _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+    _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[str] = (
+        _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+    )
     _BIGQUERY_WILDCARD_REGEX: ClassVar[str] = "((_(\\d+)?)\\*$)|\\*$"
     _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd"
 
```
```diff
@@ -137,9 +137,9 @@ class BigQueryCredential(ConfigModel):
     @root_validator(skip_on_failure=True)
     def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         if values.get("client_x509_cert_url") is None:
-            values[
-                "client_x509_cert_url"
-            ] = f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            values["client_x509_cert_url"] = (
+                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            )
         return values
 
     def create_credential_temp_file(self) -> str:
```
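For readers unfamiliar with the validator shown in the `BigQueryCredential` hunk: the `client_x509_cert_url` default is derived from `client_email` inside a pydantic root validator. A minimal, self-contained sketch of that pattern, assuming pydantic v1-style validators (the model below is simplified and is not the class from the package):

```python
from typing import Any, Dict, Optional

from pydantic import BaseModel, root_validator


class CredentialSketch(BaseModel):
    client_email: str
    client_x509_cert_url: Optional[str] = None

    @root_validator(skip_on_failure=True)
    def _default_cert_url(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # Derive the cert URL from client_email when the caller does not supply one,
        # mirroring the assignment in the hunk above.
        if values.get("client_x509_cert_url") is None:
            values["client_x509_cert_url"] = (
                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
            )
        return values


# Example: the derived URL is filled in automatically.
print(CredentialSketch(client_email="svc@project.iam.gserviceaccount.com").client_x509_cert_url)
```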
```diff
@@ -611,9 +611,9 @@ class BigQueryV2Config(
         cls, v: Optional[List[str]], values: Dict
     ) -> Optional[List[str]]:
         if values.get("use_exported_bigquery_audit_metadata"):
-            assert (
-                v and len(v) > 0
-            ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+            assert v and len(v) > 0, (
+                "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+            )
 
         return v
 
@@ -87,9 +87,9 @@ class BigQueryPlatformResourceHelper:
             key=platform_resource_key, graph_client=self.graph
         )
         if platform_resource:
-            self.platform_resource_cache[
-                platform_resource_key.primary_key
-            ] = platform_resource
+            self.platform_resource_cache[platform_resource_key.primary_key] = (
+                platform_resource
+            )
             return platform_resource
         return None
 
@@ -115,7 +115,11 @@ class BigQueryPlatformResourceHelper:
             and platform_resource.resource_info.value
         ):
             try:
-                existing_info: Optional[BigQueryLabelInfo] = platform_resource.resource_info.value.as_pydantic_object(BigQueryLabelInfo)  # type: ignore
+                existing_info: Optional[BigQueryLabelInfo] = (
+                    platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                        BigQueryLabelInfo
+                    )
+                )
             except ValidationError as e:
                 logger.error(
                     f"Error converting existing value to BigQueryLabelInfo: {e}. Creating new one. Maybe this is because of a non backward compatible schema change."
@@ -311,8 +311,10 @@ class BigQuerySchemaGenerator:
             platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                 label, tag_urn, managed_by_datahub=False
             )
-            label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
-                BigQueryLabelInfo
+            label_info: BigQueryLabelInfo = (
+                platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                    BigQueryLabelInfo
+                )
             )
             tag_urn = TagUrn.from_string(label_info.datahub_urn)
 
@@ -820,8 +822,10 @@ class BigQuerySchemaGenerator:
             platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                 label, tag_urn, managed_by_datahub=False
             )
-            label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
-                BigQueryLabelInfo
+            label_info: BigQueryLabelInfo = (
+                platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                    BigQueryLabelInfo
+                )
             )
             tag_urn = TagUrn.from_string(label_info.datahub_urn)
 
@@ -860,8 +864,10 @@ class BigQuerySchemaGenerator:
             platform_resource: PlatformResource = self.platform_resource_helper.generate_label_platform_resource(
                 label, tag_urn, managed_by_datahub=False
             )
-            label_info: BigQueryLabelInfo = platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
-                BigQueryLabelInfo
+            label_info: BigQueryLabelInfo = (
+                platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                    BigQueryLabelInfo
+                )
             )
             tag_urn = TagUrn.from_string(label_info.datahub_urn)
 
@@ -1203,9 +1209,9 @@ class BigQuerySchemaGenerator:
                 report=self.report,
             )
 
-            self.report.metadata_extraction_sec[
-                f"{project_id}.{dataset.name}"
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = (
+                timer.elapsed_seconds(digits=2)
+            )
 
     def get_core_table_details(
         self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str
@@ -697,7 +697,7 @@ class BigqueryLineageExtractor:
             if parsed_queries[-1]:
                 query = f"""create table `{destination_table.get_sanitized_table_ref().table_identifier.get_table_name()}` AS
                 (
-                    {parsed_queries[-1].sql(dialect=
+                    {parsed_queries[-1].sql(dialect="bigquery")}
                 )"""
             else:
                 query = e.query
@@ -809,11 +809,11 @@ class BigqueryLineageExtractor:
                         upstream_lineage, temp_table_upstream
                     )
 
-                    upstreams[
-                        ref_temp_table_upstream
-                    ] = _merge_lineage_edge_columns(
-                        upstreams.get(ref_temp_table_upstream),
-                        collapsed_lineage,
+                    upstreams[ref_temp_table_upstream] = (
+                        _merge_lineage_edge_columns(
+                            upstreams.get(ref_temp_table_upstream),
+                            collapsed_lineage,
+                        )
                     )
                 else:
                     upstreams[upstream_table_ref] = _merge_lineage_edge_columns(
@@ -1004,9 +1004,9 @@ class BigqueryLineageExtractor:
                     dataset_urn
                 )
                 for gcs_dataset_urn in gcs_urns:
-                    schema_metadata_for_gcs: Optional[
-                        SchemaMetadataClass
-                    ] = graph.get_schema_metadata(gcs_dataset_urn)
+                    schema_metadata_for_gcs: Optional[SchemaMetadataClass] = (
+                        graph.get_schema_metadata(gcs_dataset_urn)
+                    )
                     if schema_metadata and schema_metadata_for_gcs:
                         fine_grained_lineage = self.get_fine_grained_lineages_with_gcs(
                             dataset_urn,
@@ -271,9 +271,9 @@ class BigQueryQueriesExtractor(Closeable):
         # Preprocessing stage that deduplicates the queries using query hash per usage bucket
         # Note: FileBackedDict is an ordered dictionary, so the order of execution of
         # queries is inherently maintained
-        queries_deduped: FileBackedDict[
-            Dict[int, ObservedQuery]
-        ] = self.deduplicate_queries(queries)
+        queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] = (
+            self.deduplicate_queries(queries)
+        )
         self.report.num_unique_queries = len(queries_deduped)
         logger.info(f"Found {self.report.num_unique_queries} unique queries")
 
@@ -763,9 +763,9 @@ class BigQueryUsageExtractor:
         )
 
         if event.query_event.default_dataset:
-            custom_properties[
-                "defaultDatabase"
-            ] = event.query_event.default_dataset
+            custom_properties["defaultDatabase"] = (
+                event.query_event.default_dataset
+            )
         if event.read_event:
             if event.read_event.readReason:
                 custom_properties["readReason"] = event.read_event.readReason
@@ -107,10 +107,10 @@ class CassandraToSchemaFieldConverter:
 
     @staticmethod
     def get_column_type(cassandra_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-            Type
-        ] = CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
-            cassandra_column_type
+        type_class: Optional[Type] = (
+            CassandraToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                cassandra_column_type
+            )
         )
         if type_class is None:
             logger.warning(
@@ -293,9 +293,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
     def _load_json_schema_with_resolved_references(
         self, schema: Schema, name: str, subject: str
     ) -> dict:
-        imported_json_schemas: List[
-            JsonSchemaWrapper
-        ] = self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+        imported_json_schemas: List[JsonSchemaWrapper] = (
+            self.get_schemas_from_confluent_ref_json(schema, name=name, subject=subject)
+        )
         schema_dict = json.loads(schema.schema_str)
         reference_map = {}
         for imported_schema in imported_json_schemas:
@@ -332,9 +332,9 @@ class ConfluentSchemaRegistry(KafkaSchemaRegistryBase):
             )
 
         elif schema.schema_type == "PROTOBUF":
-            imported_schemas: List[
-                ProtobufSchema
-            ] = self.get_schemas_from_confluent_ref_protobuf(schema)
+            imported_schemas: List[ProtobufSchema] = (
+                self.get_schemas_from_confluent_ref_protobuf(schema)
+            )
             base_name: str = topic.replace(".", "_")
             fields = protobuf_util.protobuf_schema_to_mce_fields(
                 ProtobufSchema(
@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
         domain: Optional[str],
         description: Optional[str],
     ) -> Iterable[MetadataWorkUnit]:
-        maybe_terms_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_glossary_terms_work_unit(
-            entity_urn=entity_urn,
-            term_associations=term_associations,
+        maybe_terms_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_glossary_terms_work_unit(
+                entity_urn=entity_urn,
+                term_associations=term_associations,
+            )
         )
         if maybe_terms_wu:
             self.report.num_glossary_term_workunits_produced += 1
@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
             self.report.num_tag_workunits_produced += 1
             yield maybe_tags_wu
 
-        maybe_owners_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_owners_work_unit(
-            entity_urn=entity_urn,
-            owners=owners,
+        maybe_owners_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_owners_work_unit(
+                entity_urn=entity_urn,
+                owners=owners,
+            )
         )
         if maybe_owners_wu:
             self.report.num_owners_workunits_produced += 1
             yield maybe_owners_wu
 
-        maybe_domain_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_domain_work_unit(
-            entity_urn=entity_urn,
-            domain=domain,
+        maybe_domain_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_domain_work_unit(
+                entity_urn=entity_urn,
+                domain=domain,
+            )
         )
         if maybe_domain_wu:
             self.report.num_domain_workunits_produced += 1
             yield maybe_domain_wu
 
-        maybe_description_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_description_work_unit(
-            entity_urn=entity_urn,
-            description=description,
+        maybe_description_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_description_work_unit(
+                entity_urn=entity_urn,
+                description=description,
+            )
         )
         if maybe_description_wu:
             self.report.num_description_workunits_produced += 1
@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
         needs_write: bool,
     ) -> Tuple[EditableSchemaMetadataClass, bool]:
         field_path: str = sub_resource_row.field_path
-        term_associations: List[
-            GlossaryTermAssociationClass
-        ] = sub_resource_row.term_associations
+        term_associations: List[GlossaryTermAssociationClass] = (
+            sub_resource_row.term_associations
+        )
         tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
         description: Optional[str] = sub_resource_row.description
         has_terms: bool = len(term_associations) > 0
@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
         # Boolean field to tell whether we need to write an MCPW.
         needs_write = False
 
-        current_editable_schema_metadata: Optional[
-            EditableSchemaMetadataClass
-        ] = None
+        current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+            None
+        )
         if self.ctx.graph and not self.should_overwrite:
             # Fetch the current editable schema metadata
             current_editable_schema_metadata = self.ctx.graph.get_aspect(
@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
             entity_urn = row["resource"]
             entity_type = Urn.from_string(row["resource"]).get_type()
 
-            term_associations: List[
-                GlossaryTermAssociationClass
-            ] = self.maybe_extract_glossary_terms(row)
+            term_associations: List[GlossaryTermAssociationClass] = (
+                self.maybe_extract_glossary_terms(row)
+            )
             tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
             owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)
 
@@ -152,7 +152,9 @@ class DataHubDatabaseReader:
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
-                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                with (
+                    conn.begin()
+                ):  # Transaction required for PostgreSQL server-side cursor
                     # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
                     # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
@@ -222,7 +224,7 @@ class DataHubDatabaseReader:
                     )
             except Exception as e:
                 logger.warning(
-                    f
+                    f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
                 )
                 self.report.num_database_parse_errors += 1
                 self.report.database_parse_errors.setdefault(
```