acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +126 -85
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +4 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -74
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/run/pipeline.py
@@ -76,8 +76,9 @@ class LoggingCallback(WriteCallback):
         failure_metadata: dict,
     ) -> None:
         logger.error(
-            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}"
-
+            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
+            extra={"failure_metadata": failure_metadata},
+            exc_info=failure_exception,
         )
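The `LoggingCallback` change above stops folding the failure details into the message string and instead passes them to the logging call itself: with the standard-library logger, `exc_info` accepts an exception instance and attaches its traceback, while `extra` exposes structured fields to handlers and formatters. A minimal standalone sketch of the same call pattern (the function and names here are illustrative, not DataHub APIs):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def report_write_failure(workunit_id: str, exc: Exception, metadata: dict) -> None:
    # exc_info=exc attaches the exception's traceback to the log record;
    # extra={...} makes failure_metadata available as an attribute on the record.
    logger.error(
        f"failed to write record with workunit {workunit_id}",
        extra={"failure_metadata": metadata},
        exc_info=exc,
    )


report_write_failure("wu-1", ValueError("boom"), {"attempt": 1})
```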
datahub/ingestion/run/pipeline.py
@@ -108,9 +109,9 @@ class DeadLetterQueueCallback(WriteCallback):
                 mcp.systemMetadata.properties = {}
             if "workunit_id" not in mcp.systemMetadata.properties:
                 # update the workunit id
-                mcp.systemMetadata.properties[
-                    "workunit_id"
-                ] = record_envelope.metadata["workunit_id"]
+                mcp.systemMetadata.properties["workunit_id"] = (
+                    record_envelope.metadata["workunit_id"]
+                )
             record_envelope.record = mcp
         self.file_sink.write_record_async(record_envelope, self.logging_callback)

@@ -700,7 +701,7 @@ class Pipeline:
         num_failures_sink = len(self.sink.get_report().failures)
         click.secho(
             message_template.format(
-                status=f"with at least {num_failures_source+num_failures_sink} failures"
+                status=f"with at least {num_failures_source + num_failures_sink} failures"
             ),
             fg=self._get_text_color(
                 running=currently_running, failures=True, warnings=False

@@ -718,7 +719,7 @@ class Pipeline:
         num_warn_global = len(global_warnings)
         click.secho(
             message_template.format(
-                status=f"with at least {num_warn_source+num_warn_sink+num_warn_global} warnings"
+                status=f"with at least {num_warn_source + num_warn_sink + num_warn_global} warnings"
             ),
             fg=self._get_text_color(
                 running=currently_running, failures=False, warnings=True
datahub/ingestion/run/pipeline_config.py
@@ -92,9 +92,9 @@ class PipelineConfig(ConfigModel):
     pipeline_name: Optional[str] = None
     failure_log: FailureLoggingConfig = FailureLoggingConfig()

-    _raw_dict: Optional[
-        dict
-    ] = None  # the raw dict that was parsed to construct this config
+    _raw_dict: Optional[dict] = (
+        None  # the raw dict that was parsed to construct this config
+    )

     @validator("run_id", pre=True, always=True)
     def run_id_should_be_semantic(
datahub/ingestion/source/abs/datalake_profiler_config.py
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
             if field_level_metric.startswith("include_field_"):
                 values.setdefault(field_level_metric, False)

-        assert (
-            max_num_fields_to_profile is None
-        )
+        assert max_num_fields_to_profile is None, (
+            f"{max_num_fields_to_profile_key} should be set to None"
+        )

         return values
datahub/ingestion/source/abs/source.py
@@ -508,7 +508,12 @@ class ABSSource(StatefulIngestionSourceBase):
                 ):
                     abs_path = self.create_abs_path(obj.name)
                     logger.debug(f"Sampling file: {abs_path}")
-                    yield
+                    yield (
+                        abs_path,
+                        obj.name,
+                        obj.last_modified,
+                        obj.size,
+                    )
         except Exception as e:
             # This odd check if being done because boto does not have a proper exception to catch
             # The exception that appears in stacktrace cannot actually be caught without a lot more work

@@ -552,9 +557,12 @@ class ABSSource(StatefulIngestionSourceBase):
         if os.path.isfile(prefix):
             logger.debug(f"Scanning single local file: {prefix}")
             file_name = prefix
-            yield
-
-
+            yield (
+                prefix,
+                file_name,
+                datetime.utcfromtimestamp(os.path.getmtime(prefix)),
+                os.path.getsize(prefix),
+            )
         else:
             logger.debug(f"Scanning files under local folder: {prefix}")
             for root, dirs, files in os.walk(prefix):

@@ -565,9 +573,12 @@ class ABSSource(StatefulIngestionSourceBase):
                     full_path = PurePath(
                         os.path.normpath(os.path.join(root, file))
                     ).as_posix()
-                    yield
-
-
+                    yield (
+                        full_path,
+                        file,
+                        datetime.utcfromtimestamp(os.path.getmtime(full_path)),
+                        os.path.getsize(full_path),
+                    )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.container_WU_creator = ContainerWUCreator(

@@ -613,7 +624,7 @@ class ABSSource(StatefulIngestionSourceBase):
                     table_data.table_path
                 ].timestamp = table_data.timestamp

-            for
+            for _, table_data in table_dict.items():
                 yield from self.ingest_table(table_data, path_spec)

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
datahub/ingestion/source/aws/glue.py
@@ -248,6 +248,9 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
     "Enabled by default when stateful ingestion is turned on.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@capability(
+    SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
+)
 class GlueSource(StatefulIngestionSourceBase):
     """
     Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.

@@ -284,12 +287,22 @@ class GlueSource(StatefulIngestionSourceBase):
             "Action": [
                 "glue:GetDataflowGraph",
                 "glue:GetJobs",
+                "s3:GetObject",
             ],
             "Resource": "*"
         }
     ```

-
+    For profiling datasets, the following additional permissions are required:
+    ```json
+    {
+        "Effect": "Allow",
+        "Action": [
+            "glue:GetPartitions",
+        ],
+        "Resource": "*"
+    }
+    ```

     """
@@ -508,7 +521,7 @@ class GlueSource(StatefulIngestionSourceBase):
         # otherwise, a node represents a transformation
         else:
             node_urn = mce_builder.make_data_job_urn_with_flow(
-                flow_urn, job_id=f
+                flow_urn, job_id=f"{node['NodeType']}-{node['Id']}"
             )

         return {

@@ -666,7 +679,7 @@ class GlueSource(StatefulIngestionSourceBase):
             )
         )

-        return MetadataWorkUnit(id=f
+        return MetadataWorkUnit(id=f"{job_name}-{node['Id']}", mce=mce)

     def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
         logger.debug("Getting all databases")

@@ -737,13 +750,13 @@ class GlueSource(StatefulIngestionSourceBase):
     ) -> Optional[MetadataWorkUnit]:
         if self.source_config.emit_s3_lineage:
             # extract dataset properties aspect
-            dataset_properties: Optional[
-                DatasetPropertiesClass
-            ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+            dataset_properties: Optional[DatasetPropertiesClass] = (
+                mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+            )
             # extract dataset schema aspect
-            schema_metadata: Optional[
-                SchemaMetadataClass
-            ] = mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+            schema_metadata: Optional[SchemaMetadataClass] = (
+                mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+            )

             if dataset_properties and "Location" in dataset_properties.customProperties:
                 location = dataset_properties.customProperties["Location"]

@@ -752,9 +765,9 @@ class GlueSource(StatefulIngestionSourceBase):
                     location, self.source_config.env
                 )
                 assert self.ctx.graph
-                schema_metadata_for_s3: Optional[
-                    SchemaMetadataClass
-                ] = self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+                schema_metadata_for_s3: Optional[SchemaMetadataClass] = (
+                    self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+                )

                 if self.source_config.glue_s3_lineage_direction == "upstream":
                     fine_grained_lineages = None
@@ -1054,49 +1067,66 @@ class GlueSource(StatefulIngestionSourceBase):
             yield from self.gen_database_containers(database)

         for table in tables:
-            database_name = table["DatabaseName"]
             table_name = table["Name"]
-
-
-
-
-
-
-
+            try:
+                yield from self._gen_table_wu(table=table)
+            except KeyError as e:
+                self.report.report_failure(
+                    message="Failed to extract workunit for table",
+                    context=f"Table: {table_name}",
+                    exc=e,
+                )
+        if self.extract_transforms:
+            yield from self._transform_extraction()

-
-
-
-
-
-
+    def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]:
+        database_name = table["DatabaseName"]
+        table_name = table["Name"]
+        full_table_name = f"{database_name}.{table_name}"
+        self.report.report_table_scanned()
+        if not self.source_config.database_pattern.allowed(
+            database_name
+        ) or not self.source_config.table_pattern.allowed(full_table_name):
+            self.report.report_table_dropped(full_table_name)
+            return
+
+        dataset_urn = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=full_table_name,
+            env=self.env,
+            platform_instance=self.source_config.platform_instance,
+        )

-
-
+        mce = self._extract_record(dataset_urn, table, full_table_name)
+        yield MetadataWorkUnit(full_table_name, mce=mce)

-
-
-
-
-
-
+        # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
+        # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
+        ).as_workunit()

-
-
-
-
-
-
-
+        yield from self._get_domain_wu(
+            dataset_name=full_table_name,
+            entity_urn=dataset_urn,
+        )
+        yield from self.add_table_to_database_container(
+            dataset_urn=dataset_urn, db_name=database_name
+        )

-
-
-
+        wu = self.get_lineage_if_enabled(mce)
+        if wu:
+            yield wu

+        try:
             yield from self.get_profile_if_enabled(mce, database_name, table_name)
-
-
-
+        except KeyError as e:
+            self.report.report_failure(
+                message="Failed to extract profile for table",
+                context=f"Table: {dataset_urn}",
+                exc=e,
+            )

     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
datahub/ingestion/source/aws/s3_boto_utils.py
@@ -40,7 +40,7 @@ def get_s3_tags(
             ]
         )
     except s3.meta.client.exceptions.ClientError:
-        logger.
+        logger.warning(f"No tags found for bucket={bucket_name}")

     if use_s3_object_tags and key_name is not None:
         s3_client = aws_config.get_s3_client()

@@ -53,7 +53,7 @@ def get_s3_tags(
         else:
             # Unlike bucket tags, if an object does not have tags, it will just return an empty array
             # as opposed to an exception.
-            logger.
+            logger.warning(f"No tags found for bucket={bucket_name} key={key_name}")
     if len(tags_to_add) == 0:
         return None
     if ctx.graph is not None:

@@ -65,7 +65,7 @@ def get_s3_tags(
         if current_tags:
             tags_to_add.extend([current_tag.tag for current_tag in current_tags.tags])
         else:
-            logger.
+            logger.warning("Could not connect to DatahubApi. No current tags to maintain")
     # Remove duplicate tags
     tags_to_add = sorted(list(set(tags_to_add)))
     new_tags = GlobalTagsClass(
datahub/ingestion/source/aws/s3_util.py
@@ -1,6 +1,11 @@
 import logging
 import os
-from
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional
+
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
+

 S3_PREFIXES = ["s3://", "s3n://", "s3a://"]

@@ -68,3 +73,21 @@ def get_key_prefix(s3_uri: str) -> str:
             f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
         )
     return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1]
+
+
+def group_s3_objects_by_dirname(
+    s3_objects: Iterable["ObjectSummary"],
+) -> Dict[str, List["ObjectSummary"]]:
+    """
+    Groups S3 objects by their directory name.
+
+    If a s3_object in the root directory (i.e., s3://bucket/file.txt), it is grouped under '/'.
+    """
+    grouped_s3_objs = defaultdict(list)
+    for obj in s3_objects:
+        if "/" in obj.key:
+            dirname = obj.key.rsplit("/", 1)[0]
+        else:
+            dirname = "/"
+        grouped_s3_objs[dirname].append(obj)
+    return grouped_s3_objs
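Because the new `group_s3_objects_by_dirname` helper above is self-contained, a brief usage sketch may help; the bucket name and prefix are placeholders, and the import path follows the file list at the top of this diff:

```python
import boto3

from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname

# List objects lazily and bucket each ObjectSummary by its "directory" part.
s3 = boto3.resource("s3")
objects = s3.Bucket("my-bucket").objects.filter(Prefix="data/")

for dirname, objs in group_s3_objects_by_dirname(objects).items():
    print(f"{dirname}: {len(objs)} objects")
```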
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
@@ -257,7 +257,7 @@ class FeatureGroupProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

         return MetadataWorkUnit(
-            id=f
+            id=f"{feature_group_details['FeatureGroupName']}-{feature['FeatureName']}",
             mce=mce,
         )

datahub/ingestion/source/aws/sagemaker_processors/models.py
@@ -212,7 +212,7 @@ class ModelProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=endpoint_snapshot)

         return MetadataWorkUnit(
-            id=f
+            id=f"{endpoint_details['EndpointName']}",
             mce=mce,
         )

@@ -503,7 +503,7 @@ class ModelProcessor:
         mce = MetadataChangeEvent(proposedSnapshot=model_snapshot)

         return MetadataWorkUnit(
-            id=f
+            id=f"{model_details['ModelName']}",
             mce=mce,
         )
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -132,9 +132,9 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         self.filters = BigQueryFilter(self.config, self.report)
         self.identifiers = BigQueryIdentifierBuilder(self.config, self.report)

-        redundant_lineage_run_skip_handler: Optional[
-            RedundantLineageRunSkipHandler
-        ] = None
+        redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
+            None
+        )
         if self.config.enable_stateful_lineage_ingestion:
             redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
                 source=self,
@@ -253,14 +253,14 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         for project in projects:
             yield from self.bq_schema_extractor.get_project_workunits(project)

-        self.report.
-
-
-
-
-
-
+        with self.report.new_stage("*: View and Snapshot Lineage"):
+            yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
+                [p.id for p in projects],
+                self.bq_schema_extractor.view_refs_by_project,
+                self.bq_schema_extractor.view_definitions,
+                self.bq_schema_extractor.snapshot_refs_by_project,
+                self.bq_schema_extractor.snapshots_by_ref,
+            )

         if self.config.use_queries_v2:
             # if both usage and lineage are disabled then skip queries extractor piece
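The hunk above swaps the old `self.report.set_ingestion_stage(...)` call for a `with self.report.new_stage(...):` block (the old helper is removed from `BigQueryV2Report` at the end of this diff), so a stage is always closed even if the wrapped extraction raises. A minimal sketch of that context-manager pattern, assuming a stage only needs start/finish bookkeeping; this is an illustration, not the actual `ingestion_stage.py` implementation:

```python
import time
from contextlib import contextmanager


@contextmanager
def new_stage(stage: str):
    # Record the stage start, then always report its duration on exit,
    # even if the body raises.
    start = time.perf_counter()
    print(f"Stage started: {stage}")
    try:
        yield
    finally:
        print(f"Stage finished: {stage} ({time.perf_counter() - start:.2f}s)")


# Usage mirroring the call site above:
with new_stage("*: View and Snapshot Lineage"):
    pass  # run the lineage extraction here
```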
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -270,29 +270,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return

-            self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+                with BigQueryQueriesExtractor(
+                    connection=self.config.get_bigquery_client(),
+                    schema_api=self.bq_schema_extractor.schema_api,
+                    config=BigQueryQueriesExtractorConfig(
+                        window=self.config,
+                        user_email_pattern=self.config.usage.user_email_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_statistics,
+                        include_operations=self.config.usage.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        region_qualifiers=self.config.region_qualifiers,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=self.sql_parser_schema_resolver,
+                    discovered_tables=self.bq_schema_extractor.table_refs,
+                ) as queries_extractor:
+                    self.report.queries_extractor = queries_extractor.report
+                    yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
datahub/ingestion/source/bigquery_v2/bigquery_audit.py
@@ -37,9 +37,9 @@ class BigqueryTableIdentifier:

     # Note: this regex may get overwritten by the sharded_table_pattern config.
     # The class-level constant, however, will not be overwritten.
-    _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[
-        str
-    ] = _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+    _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[str] = (
+        _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+    )
     _BIGQUERY_WILDCARD_REGEX: ClassVar[str] = "((_(\\d+)?)\\*$)|\\*$"
     _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd"
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -137,9 +137,9 @@ class BigQueryCredential(ConfigModel):
     @root_validator(skip_on_failure=True)
     def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         if values.get("client_x509_cert_url") is None:
-            values[
-                "client_x509_cert_url"
-            ] = f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            values["client_x509_cert_url"] = (
+                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            )
         return values

     def create_credential_temp_file(self) -> str:
@@ -447,6 +447,14 @@ class BigQueryV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )

     @property
     def have_table_data_read_permission(self) -> bool:
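Both new flags default to `True` and only take effect when `use_queries_v2` is enabled; they are forwarded to `BigQueryQueriesExtractorConfig` in the `bigquery.py` hunk above. A hypothetical source-config fragment toggling them, written as the Python dict an ingestion recipe would deserialize into (the `project_ids` value and the surrounding recipe are placeholders):

```python
# Field names come from the diff above; the values are illustrative only.
bigquery_source_config = {
    "project_ids": ["my-gcp-project"],        # placeholder project
    "use_queries_v2": True,                   # required for the two flags below to matter
    "include_queries": True,                  # emit query entities on lineage edges
    "include_query_usage_statistics": False,  # skip query popularity statistics
}
```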
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -603,9 +611,9 @@ class BigQueryV2Config(
         cls, v: Optional[List[str]], values: Dict
     ) -> Optional[List[str]]:
         if values.get("use_exported_bigquery_audit_metadata"):
-            assert (
-                v and len(v) > 0
-            )
+            assert v and len(v) > 0, (
+                "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."
+            )

         return v
datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py
@@ -87,9 +87,9 @@ class BigQueryPlatformResourceHelper:
             key=platform_resource_key, graph_client=self.graph
         )
         if platform_resource:
-            self.platform_resource_cache[
-                platform_resource_key.primary_key
-            ] = platform_resource
+            self.platform_resource_cache[platform_resource_key.primary_key] = (
+                platform_resource
+            )
             return platform_resource
         return None

@@ -115,7 +115,11 @@ class BigQueryPlatformResourceHelper:
             and platform_resource.resource_info.value
         ):
             try:
-                existing_info: Optional[BigQueryLabelInfo] =
+                existing_info: Optional[BigQueryLabelInfo] = (
+                    platform_resource.resource_info.value.as_pydantic_object(  # type: ignore
+                        BigQueryLabelInfo
+                    )
+                )
             except ValidationError as e:
                 logger.error(
                     f"Error converting existing value to BigQueryLabelInfo: {e}. Creating new one. Maybe this is because of a non backward compatible schema change."
datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -190,6 +190,3 @@ class BigQueryV2Report(
     num_skipped_external_table_lineage: int = 0

     queries_extractor: Optional[BigQueryQueriesExtractorReport] = None
-
-    def set_ingestion_stage(self, project_id: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{project_id}: {stage}")