acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +126 -85
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +4 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -74
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/tableau/tableau_common.py
CHANGED

@@ -642,8 +642,11 @@ class TableauUpstreamReference:
 
     @classmethod
     def create(
-        cls, d:
+        cls, d: Dict, default_schema_map: Optional[Dict[str, str]] = None
     ) -> "TableauUpstreamReference":
+        if d is None:
+            raise ValueError("TableauUpstreamReference.create: d is None")
+
         # Values directly from `table` object from Tableau
         database_dict = (
             d.get(c.DATABASE) or {}
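The added guard makes a None table reference fail fast with a descriptive message instead of surfacing later as an AttributeError on `d.get(...)`. A hedged sketch of how a caller might handle it; the wrapper function and logging below are illustrative, not from the source:

import logging

logger = logging.getLogger(__name__)

def safe_upstream_reference(table):
    # Tableau's API can hand back a dangling/None table entry; skip it gracefully.
    try:
        return TableauUpstreamReference.create(table)
    except ValueError as e:
        logger.warning("Skipping unresolvable upstream table: %s", e)
        return None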
@@ -717,7 +720,7 @@ class TableauUpstreamReference:
         # schema
 
         # TODO: Validate the startswith check. Currently required for our integration tests
-        if full_name is None
+        if full_name is None:
             return None
 
         return full_name.replace("[", "").replace("]", "").split(".")
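For context, the `return` line kept below the fix is where the bracketed, MSSQL-style full name actually gets split into its parts. A worked example with an illustrative value:

full_name = "[my_db].[dbo].[orders]"
parts = full_name.replace("[", "").replace("]", "").split(".")
assert parts == ["my_db", "dbo", "orders"]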
datahub/ingestion/source/unity/config.py
CHANGED

@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
     )
 
     # TODO: Remove `type:ignore` by refactoring config
-    profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field(  # type: ignore
+    profiling: Union[
+        UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+    ] = Field(  # type: ignore
         default=UnityCatalogGEProfilerConfig(),
         description="Data profiling configuration",
         discriminator="method",
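The reflowed annotation above is a pydantic discriminated union: the `method` field of the incoming payload decides whether `profiling` is parsed as the GE config or the ANALYZE config. A minimal self-contained sketch of the pattern, using the pydantic v1-style API; the class and field names here are illustrative, not DataHub's:

from typing import Literal, Union

from pydantic import BaseModel, Field

class GEProfilingConfig(BaseModel):
    method: Literal["ge"] = "ge"

class AnalyzeProfilingConfig(BaseModel):
    method: Literal["analyze"] = "analyze"

class SourceConfig(BaseModel):
    profiling: Union[GEProfilingConfig, AnalyzeProfilingConfig] = Field(
        default=GEProfilingConfig(), discriminator="method"
    )

cfg = SourceConfig.parse_obj({"profiling": {"method": "analyze"}})
assert isinstance(cfg.profiling, AnalyzeProfilingConfig)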
datahub/ingestion/source/unity/proxy.py
CHANGED

@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
 
     @staticmethod
     def _create_metastore(
-        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
     ) -> Optional[Metastore]:
         if not obj.name:
             return None
datahub/ingestion/source/unity/source.py
CHANGED

@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[
-            Tuple[TableReference, str]
-        ] = FileBackedDict()
+        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+            FileBackedDict()
+        )
 
         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
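`FileBackedDict` (from `datahub.utilities.file_backed_collections`, also touched in this release) is a MutableMapping that spills its entries to a temporary SQLite file, which is why it is used here for potentially huge maps like view definitions. A hedged usage sketch; the keys and values are illustrative:

from datahub.utilities.file_backed_collections import FileBackedDict

view_definitions: FileBackedDict = FileBackedDict()  # backed by a temp SQLite file
view_definitions["view_a"] = ("table_ref_a", "SELECT 1")  # plain dict-style writes
definition = view_definitions["view_a"]  # reads go through an in-memory cache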
@@ -263,86 +263,86 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.report.report_ingestion_stage_start("Ingestion Setup")
-        wait_on_warehouse = None
-        if self.config.include_hive_metastore:
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Can take several minutes, so start now and wait later
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
-                )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
+        with self.report.new_stage("Ingestion Setup"):
+            wait_on_warehouse = None
+            if self.config.include_hive_metastore:
+                with self.report.new_stage("Start warehouse"):
+                    # Can take several minutes, so start now and wait later
+                    wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                    if wait_on_warehouse is None:
+                        self.report.report_failure(
+                            "initialization",
+                            f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                        )
+                        return
+                    else:
+                        # wait until warehouse is started
+                        wait_on_warehouse.result()
 
         if self.config.include_ownership:
-            self.report.report_ingestion_stage_start("Ingest service principals")
-            self.build_service_principal_map()
-            self.build_groups_map()
+            with self.report.new_stage("Ingest service principals"):
+                self.build_service_principal_map()
+                self.build_groups_map()
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Ingest notebooks")
-            yield from self.process_notebooks()
+            with self.report.new_stage("Ingest notebooks"):
+                yield from self.process_notebooks()
 
         yield from self.process_metastores()
 
         yield from self.get_view_lineage()
 
         if self.config.include_notebooks:
-            self.report.report_ingestion_stage_start("Notebook lineage")
-            for notebook in self.notebooks.values():
-                wu = self._gen_notebook_lineage(notebook)
-                if wu:
-                    yield wu
+            with self.report.new_stage("Notebook lineage"):
+                for notebook in self.notebooks.values():
+                    wu = self._gen_notebook_lineage(notebook)
+                    if wu:
+                        yield wu
 
         if self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("Ingest usage")
-            usage_extractor = UnityCatalogUsageExtractor(
-                config=self.config,
-                report=self.report,
-                proxy=self.unity_catalog_api_proxy,
-                table_urn_builder=self.gen_dataset_urn,
-                user_urn_builder=self.gen_user_urn,
-            )
-            yield from usage_extractor.get_usage_workunits(
-                self.table_refs | self.view_refs
-            )
-
-        if self.config.is_profiling_enabled():
-            self.report.report_ingestion_stage_start("Start warehouse")
-            # Need to start the warehouse again for profiling,
-            # as it may have been stopped after ingestion might take
-            # longer time to complete
-            wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
-            if wait_on_warehouse is None:
-                self.report.report_failure(
-                    "initialization",
-                    f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+            with self.report.new_stage("Ingest usage"):
+                usage_extractor = UnityCatalogUsageExtractor(
+                    config=self.config,
+                    report=self.report,
+                    proxy=self.unity_catalog_api_proxy,
+                    table_urn_builder=self.gen_dataset_urn,
+                    user_urn_builder=self.gen_user_urn,
+                )
+                yield from usage_extractor.get_usage_workunits(
+                    self.table_refs | self.view_refs
                 )
-                return
-            else:
-                # wait until warehouse is started
-                wait_on_warehouse.result()
 
-            self.report.report_ingestion_stage_start("Profiling")
-            if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
-                yield from UnityCatalogAnalyzeProfiler(
-                    self.config.profiling,
-                    self.report,
-                    self.unity_catalog_api_proxy,
-                    self.gen_dataset_urn,
-                ).get_workunits(self.table_refs)
-            elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
-                yield from UnityCatalogGEProfiler(
-                    sql_common_config=self.config,
-                    profiling_config=self.config.profiling,
-                    report=self.report,
-                ).get_workunits(list(self.tables.values()))
-            else:
-                raise ValueError("Unknown profiling config method")
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage("Start warehouse"):
+                # Need to start the warehouse again for profiling,
+                # as it may have been stopped after ingestion might take
+                # longer time to complete
+                wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
+                if wait_on_warehouse is None:
+                    self.report.report_failure(
+                        "initialization",
+                        f"SQL warehouse {self.config.profiling.warehouse_id} not found",
+                    )
+                    return
+                else:
+                    # wait until warehouse is started
+                    wait_on_warehouse.result()
+
+            with self.report.new_stage("Profiling"):
+                if isinstance(self.config.profiling, UnityCatalogAnalyzeProfilerConfig):
+                    yield from UnityCatalogAnalyzeProfiler(
+                        self.config.profiling,
+                        self.report,
+                        self.unity_catalog_api_proxy,
+                        self.gen_dataset_urn,
+                    ).get_workunits(self.table_refs)
+                elif isinstance(self.config.profiling, UnityCatalogGEProfilerConfig):
+                    yield from UnityCatalogGEProfiler(
+                        sql_common_config=self.config,
+                        profiling_config=self.config.profiling,
+                        report=self.report,
+                    ).get_workunits(list(self.tables.values()))
+                else:
+                    raise ValueError("Unknown profiling config method")
 
     def build_service_principal_map(self) -> None:
         try:
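Both the setup and the profiling paths above use a start-now-wait-later pattern: kick off the slow SQL-warehouse start-up early, do other work, and only block on `.result()` when the warehouse is actually needed. A generic sketch of the same idea with `concurrent.futures`; the `start_warehouse` stub is illustrative, while the real proxy returns a Databricks SDK wait handle with a similar `.result()` method:

import time
from concurrent.futures import ThreadPoolExecutor

def start_warehouse() -> str:
    time.sleep(0.1)  # stands in for a multi-minute remote start-up
    return "warehouse-ready"

with ThreadPoolExecutor(max_workers=1) as pool:
    wait_on_warehouse = pool.submit(start_warehouse)  # start now...
    # ... do unrelated ingestion work here while it spins up ...
    status = wait_on_warehouse.result()  # ...block only when it is needed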
@@ -462,11 +462,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.schemas.dropped(schema.id)
                 continue
 
-            self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}")
-            yield from self.gen_schema_containers(schema)
-            yield from self.process_tables(schema)
+            with self.report.new_stage(f"Ingest schema {schema.id}"):
+                yield from self.gen_schema_containers(schema)
+                yield from self.process_tables(schema)
 
-            self.report.schemas.processed(schema.id)
+                self.report.schemas.processed(schema.id)
 
     def process_tables(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         for table in self.unity_catalog_api_proxy.tables(schema=schema):
datahub/ingestion/source/unity/usage.py
CHANGED

@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
                     query, table_info
                 )
                 for source_table in table_info.source_tables:
-                    with self.report.usage_perf_report.aggregator_add_event_timer:
+                    with (
+                        self.report.usage_perf_report.aggregator_add_event_timer
+                    ):
                         self.usage_aggregator.aggregate_event(
                             resource=source_table,
                             start_time=query.start_time,
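The timer entered above is used as a re-entrant context manager, so time spent across many `aggregate_event` calls accumulates into a single perf counter. A hedged sketch of that accumulation pattern; this class is illustrative, not DataHub's `PerfTimer`:

import time
from contextlib import AbstractContextManager

class CumulativeTimer(AbstractContextManager):
    def __init__(self) -> None:
        self.total_seconds = 0.0
        self._start = 0.0

    def __enter__(self) -> "CumulativeTimer":
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc) -> None:
        self.total_seconds += time.perf_counter() - self._start

timer = CumulativeTimer()
for _ in range(3):
    with timer:  # each entry adds to the same running total
        time.sleep(0.01)
print(f"{timer.total_seconds:.3f}s across 3 events")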
datahub/ingestion/source/usage/clickhouse_usage.py
CHANGED

@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[ClickHouseJoinedAccessEvent]
     ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[ClickHouseTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
 
             resource = (
-                f"{self.config.platform_instance+'.' if self.config.platform_instance else ''}"
+                f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                 f"{event.database}.{event.table}"
             )
 
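The reflowed annotation describes the aggregation shape: usage events are bucketed by a floored timestamp, then grouped per table within each bucket. A small sketch of that two-level aggregation, with illustrative stand-ins for `get_time_bucket` and `AggregatedDataset`:

import collections
from datetime import datetime

def floor_to_day(ts: datetime) -> datetime:
    # stand-in for get_time_bucket(ts, bucket_duration)
    return ts.replace(hour=0, minute=0, second=0, microsecond=0)

events = [
    (datetime(2024, 1, 1, 9, 30), "db.tbl_a"),
    (datetime(2024, 1, 1, 17, 5), "db.tbl_a"),
    (datetime(2024, 1, 2, 8, 0), "db.tbl_b"),
]

datasets = collections.defaultdict(dict)  # bucket -> {resource -> aggregate}
for ts, resource in events:
    bucket = floor_to_day(ts)
    agg = datasets[bucket].setdefault(resource, {"count": 0})  # stand-in aggregate
    agg["count"] += 1

assert datasets[datetime(2024, 1, 1)]["db.tbl_a"]["count"] == 2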
datahub/ingestion/source/usage/starburst_trino_usage.py
CHANGED

@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[TrinoJoinedAccessEvent]
     ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[TrinoTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
datahub/ingestion/source/usage/usage_common.py
CHANGED

@@ -89,7 +89,7 @@ def make_usage_workunit(
     top_sql_queries: Optional[List[str]] = None
     if query_freq is not None:
         if top_n_queries < len(query_freq):
-            logger.warn(
+            logger.warning(
                 f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
             )
             query_freq = query_freq[0:top_n_queries]
datahub/ingestion/source_report/ingestion_stage.py
CHANGED

@@ -1,7 +1,7 @@
 import logging
+from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Optional
 
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -22,25 +22,29 @@ PROFILING = "Profiling"
 
 @dataclass
 class IngestionStageReport:
-    ingestion_stage: Optional[str] = None
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
 
-    _timer: Optional[PerfTimer] = field(
-        default=None, init=False, repr=False, compare=False
-    )
-
-    def report_ingestion_stage_start(self, stage: str) -> None:
-        if self._timer:
-            elapsed = round(self._timer.elapsed_seconds(), 2)
-            logger.info(
-                f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds",
-                stacklevel=2,
-            )
-            if self.ingestion_stage:
-                self.ingestion_stage_durations[self.ingestion_stage] = elapsed
-
-        else:
-            self._timer = PerfTimer()
-        self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
-        logger.info(f"Stage started: {self.ingestion_stage}")
+    def new_stage(self, stage: str) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self)
+
+
+@dataclass
+class IngestionStageContext(AbstractContextManager):
+    def __init__(self, stage: str, report: IngestionStageReport):
+        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        self._timer: PerfTimer = PerfTimer()
+        self._report = report
+
+    def __enter__(self) -> "IngestionStageContext":
+        logger.info(f"Stage started: {self._ingestion_stage}")
         self._timer.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        elapsed = self._timer.elapsed_seconds(digits=2)
+        logger.info(
+            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+            stacklevel=2,
+        )
+        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
+        return None
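With this rewrite, stage timing moves from paired imperative start calls to a context manager, so a stage's duration is recorded even when the block exits early via return or an exception. A usage sketch built only on the classes defined in this hunk; the printed value is illustrative:

report = IngestionStageReport()
with report.new_stage("Ingest schemas"):
    pass  # stage work goes here; elapsed seconds are recorded on __exit__

# ingestion_stage_durations now holds one timestamped entry, e.g.
# {"Ingest schemas at 2025-01-01 00:00:00+00:00": 0.0}
print(dict(report.ingestion_stage_durations))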
datahub/ingestion/transformer/add_dataset_dataproduct.py
CHANGED

@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
                 ).add_asset(container_urn)
                 data_products_container[data_product_urn] = container_product
             else:
-                data_products_container[
-                    data_product_urn
-                ] = data_products_container[data_product_urn].add_asset(
-                    container_urn
+                data_products_container[data_product_urn] = (
+                    data_products_container[data_product_urn].add_asset(
+                        container_urn
+                    )
                 )
 
         mcps: List[
datahub/ingestion/transformer/add_dataset_properties.py
CHANGED

@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
     ) -> Optional[DatasetPropertiesClass]:
         assert dataset_properties_aspect
 
-        server_dataset_properties_aspect: Optional[
-            DatasetPropertiesClass
-        ] = graph.get_dataset_properties(entity_urn)
+        server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+            graph.get_dataset_properties(entity_urn)
+        )
         # No need to take any action if server properties is None or there is not customProperties in server properties
         if (
             server_dataset_properties_aspect is None
datahub/ingestion/transformer/add_dataset_schema_tags.py
CHANGED

@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
         server_field_map: dict = {}
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect
datahub/ingestion/transformer/add_dataset_schema_terms.py
CHANGED

@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
         ] = {}  # Map to cache server field objects, where fieldPath is key
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect
datahub/ingestion/transformer/dataset_domain_based_on_tags.py
CHANGED

@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
             domain_aspect.domains.extend(mapped_domains.domains)
             if self.config.semantics == TransformerSemantics.PATCH:
                 # Try merging with server-side domains
-                patch_domain_aspect: Optional[
-                    DomainsClass
-                ] = AddDatasetDomain._merge_with_server_domains(
-                    self.ctx.graph, entity_urn, domain_aspect
+                patch_domain_aspect: Optional[DomainsClass] = (
+                    AddDatasetDomain._merge_with_server_domains(
+                        self.ctx.graph, entity_urn, domain_aspect
+                    )
                 )
                 return cast(Optional[Aspect], patch_domain_aspect)
         return cast(Optional[Aspect], domain_aspect)
datahub/ingestion/transformer/extract_ownership_from_tags.py
CHANGED

@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
             else:
                 owner_type = get_owner_type(self.config.owner_type)
                 if owner_type == OwnershipTypeClass.CUSTOM:
-                    assert (
-                        self.config.owner_type_urn is not None
-                    ), "owner_type_urn must be set if owner_type is CUSTOM"
+                    assert self.config.owner_type_urn is not None, (
+                        "owner_type_urn must be set if owner_type is CUSTOM"
+                    )
 
             owners.append(
                 OwnerClass(
datahub/ingestion/transformer/tags_to_terms.py
CHANGED

@@ -92,9 +92,9 @@ class TagsToTermMapper(TagsToTermTransformer):
         in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags(
             entity_urn
         )
-        in_schema_metadata_aspect: Optional[
-            SchemaMetadataClass
-        ] = self.ctx.graph.get_schema_metadata(entity_urn)
+        in_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+            self.ctx.graph.get_schema_metadata(entity_urn)
+        )
 
         if in_global_tags_aspect is None and in_schema_metadata_aspect is None:
             return cast(Aspect, in_glossary_terms)
@@ -134,10 +134,10 @@ class TagsToTermMapper(TagsToTermTransformer):
         )
 
         if self.config.semantics == TransformerSemantics.PATCH:
-            patch_glossary_terms: Optional[
-                GlossaryTermsClass
-            ] = TagsToTermMapper._merge_with_server_glossary_terms(
-                self.ctx.graph, entity_urn, out_glossary_terms
+            patch_glossary_terms: Optional[GlossaryTermsClass] = (
+                TagsToTermMapper._merge_with_server_glossary_terms(
+                    self.ctx.graph, entity_urn, out_glossary_terms
+                )
             )
             return cast(Optional[Aspect], patch_glossary_terms)
         else:
datahub/integrations/assertion/snowflake/compiler.py
CHANGED

@@ -61,17 +61,17 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
     def create(
         cls, output_dir: str, extras: Dict[str, str]
     ) -> "SnowflakeAssertionCompiler":
-        assert os.path.exists(
-            output_dir
-        ), f"Specified location {output_dir} does not exist."
+        assert os.path.exists(output_dir), (
+            f"Specified location {output_dir} does not exist."
+        )
 
-        assert os.path.isdir(
-            output_dir
-        ), f"Specified location {output_dir} is not a folder."
+        assert os.path.isdir(output_dir), (
+            f"Specified location {output_dir} is not a folder."
+        )
 
-        assert any(
-            x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras
-        ), "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        assert any(x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras), (
+            "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        )
 
         return SnowflakeAssertionCompiler(output_dir, extras)
 
@@ -232,6 +232,6 @@ def get_dmf_schedule(trigger: AssertionTrigger) -> str:
     elif isinstance(trigger.trigger, CronTrigger):
         return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}"
     elif isinstance(trigger.trigger, IntervalTrigger):
-        return f"{trigger.trigger.interval.seconds/60} MIN"
+        return f"{trigger.trigger.interval.seconds / 60} MIN"
     else:
         raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}")
datahub/lite/duckdb_lite.py
CHANGED
@@ -163,9 +163,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
 
         if "properties" not in writeable_dict["systemMetadata"]:
             writeable_dict["systemMetadata"]["properties"] = {}
-        writeable_dict["systemMetadata"]["properties"][
-            "sysVersion"
-        ] = new_version
+        writeable_dict["systemMetadata"]["properties"]["sysVersion"] = (
+            new_version
+        )
         if needs_write:
             self.duckdb_client.execute(
                 query="INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?, ?, ?)",
@@ -208,9 +208,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 "lastObserved": writeable.systemMetadata.lastObserved
             }
         else:
-            system_metadata[
-                "lastObserved"
-            ] = writeable.systemMetadata.lastObserved
+            system_metadata["lastObserved"] = (
+                writeable.systemMetadata.lastObserved
+            )
         self.duckdb_client.execute(
             query="UPDATE metadata_aspect_v2 SET system_metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0",
             parameters=[
@@ -497,9 +497,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             aspect_name = r[1]
             aspect_payload = json.loads(r[2])
             if typed:
-                assert (
-                    aspect_name in ASPECT_MAP
-                ), f"Missing aspect name {aspect_name} in the registry"
+                assert aspect_name in ASPECT_MAP, (
+                    f"Missing aspect name {aspect_name} in the registry"
+                )
                 try:
                     aspect_payload = ASPECT_MAP[aspect_name].from_obj(
                         post_json_transform(aspect_payload)
@@ -531,7 +531,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         for r in results.fetchall():
             urn = r[0]
             aspect_name = r[1]
-            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2])))  # type: ignore
+            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(
+                post_json_transform(json.loads(r[2]))
+            )  # type: ignore
             system_metadata = SystemMetadataClass.from_obj(json.loads(r[3]))
             mcp = MetadataChangeProposalWrapper(
                 entityUrn=urn,