acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff shows the content of publicly released package versions as published to their public registry and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2335 -2337
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +157 -157
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -281,9 +281,9 @@ class TableauConnectionConfig(ConfigModel):
         return authentication

     def make_tableau_client(self, site: str) -> Server:
-        authentication: Union[
-            TableauAuth, PersonalAccessTokenAuth
-        ] = self.get_tableau_auth(site)
+        authentication: Union[TableauAuth, PersonalAccessTokenAuth] = (
+            self.get_tableau_auth(site)
+        )
         try:
             server = Server(
                 self.connect_uri,

@@ -635,7 +635,7 @@ class TableauConfig(
         project_path_pattern = values.get("project_path_pattern")
         if project_pattern is None and project_path_pattern is None and projects:
             logger.warning(
-                "projects is deprecated, please use
+                "projects is deprecated, please use project_path_pattern instead."
             )
             logger.info("Initializing project_pattern from projects")
             values["project_pattern"] = AllowDenyPattern(

@@ -708,18 +708,18 @@ class DatabaseTable:
     """

     urn: str
-    id: Optional[
-        str
-    ] = None  # is not None only for tables that came from Tableau metadata
+    id: Optional[str] = (
+        None  # is not None only for tables that came from Tableau metadata
+    )
     num_cols: Optional[int] = None

-    paths: Optional[
-        Set[str]
-    ] = None  # maintains all browse paths encountered for this table
+    paths: Optional[Set[str]] = (
+        None  # maintains all browse paths encountered for this table
+    )

-    parsed_columns: Optional[
-        Set[str]
-    ] = None  # maintains all columns encountered for this table during parsing SQL queries
+    parsed_columns: Optional[Set[str]] = (
+        None  # maintains all columns encountered for this table during parsing SQL queries
+    )

     def update_table(
         self,

@@ -2310,8 +2310,7 @@ class TableauSiteSource:
             c.EMBEDDED_DATA_SOURCE,
         ):
             logger.debug(
-                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is "
-                f"unsupported"
+                f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is unsupported"
             )
             return None

@@ -2493,9 +2492,9 @@ class TableauSiteSource:
     def _enrich_database_tables_with_parsed_schemas(
         self, parsing_result: SqlParsingResult
     ) -> None:
-        in_tables_schemas: Dict[
-            str, Set[str]
-        ] = transform_parsing_result_to_in_tables_schemas(parsing_result)
+        in_tables_schemas: Dict[str, Set[str]] = (
+            transform_parsing_result_to_in_tables_schemas(parsing_result)
+        )

         if not in_tables_schemas:
             logger.info("Unable to extract table schema from parsing result")

@@ -3559,25 +3558,25 @@ class TableauSiteSource:

         generated_project_keys.add(project_key.guid())

-        parent_project_key: Optional[
-            Union[ProjectKey, SiteKey]
-        ] = None  # It is going
+        parent_project_key: Optional[Union[ProjectKey, SiteKey]] = (
+            None  # It is going
+        )
         # to be used as a parent container key for the current tableau project

         if project_.parent_id is not None:
             # Go to the parent project as we need to generate container first for parent
             parent_project_key = self.gen_project_key(project_.parent_id)

-            parent_tableau_project: Optional[
-                TableauProject
-            ] = self.tableau_project_registry.get(project_.parent_id)
+            parent_tableau_project: Optional[TableauProject] = (
+                self.tableau_project_registry.get(project_.parent_id)
+            )

             if (
                 parent_tableau_project is None
             ):  # It is not in project registry because of project_pattern
-                assert (
-                    project_.parent_name
-                ), f"project {project_.name} should not be null"
+                assert project_.parent_name, (
+                    f"project {project_.name} should not be null"
+                )
                 parent_tableau_project = TableauProject(
                     id=project_.parent_id,
                     name=project_.parent_name,

@@ -3605,7 +3604,7 @@ class TableauSiteSource:
                 parent_container_key=parent_project_key,
             )

-        for
+        for project in self.tableau_project_registry.values():
             logger.debug(
                 f"project {project.name} and it's parent {project.parent_name} and parent id {project.parent_id}"
             )

@@ -3669,16 +3668,16 @@ class TableauSiteSource:
         if self.config.extract_usage_stats:
             with PerfTimer() as timer:
                 self._populate_usage_stat_registry()
-                self.report.extract_usage_stats_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.extract_usage_stats_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         if self.config.permission_ingestion:
             with PerfTimer() as timer:
                 self._fetch_groups()
-                self.report.fetch_groups_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.fetch_groups_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         # Populate the map of database names and database hostnames to be used later to map
         # databases to platform instances.

@@ -3691,9 +3690,9 @@ class TableauSiteSource:

         with PerfTimer() as timer:
             self._populate_projects_registry()
-            self.report.populate_projects_registry_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.populate_projects_registry_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )

         if self.config.add_site_container:
             yield from self.emit_site_container()

@@ -3701,23 +3700,23 @@ class TableauSiteSource:

         with PerfTimer() as timer:
             yield from self.emit_workbooks()
-            self.report.emit_workbooks_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.emit_workbooks_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )

         if self.sheet_ids:
             with PerfTimer() as timer:
                 yield from self.emit_sheets()
-                self.report.emit_sheets_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_sheets_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         if self.dashboard_ids:
             with PerfTimer() as timer:
                 yield from self.emit_dashboards()
-                self.report.emit_dashboards_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_dashboards_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

         if self.embedded_datasource_ids_being_used:
             with PerfTimer() as timer:

@@ -3743,6 +3742,6 @@ class TableauSiteSource:
         if self.database_tables:
             with PerfTimer() as timer:
                 yield from self.emit_upstream_tables()
-                self.report.emit_upstream_tables_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_upstream_tables_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )

datahub/ingestion/source/unity/config.py
CHANGED

@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
     )

     # TODO: Remove `type:ignore` by refactoring config
-    profiling: Union[
+    profiling: Union[
+        UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+    ] = Field(  # type: ignore
         default=UnityCatalogGEProfilerConfig(),
         description="Data profiling configuration",
         discriminator="method",
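
The discriminator="method" argument visible in the context lines above is pydantic's tagged-union mechanism: each member of the Union declares a literal "method" field, and pydantic uses that field to decide which config class to instantiate. A minimal, self-contained sketch of the pattern (class and field names here are illustrative, not DataHub's actual profiler configs):

from typing import Literal, Union

from pydantic import BaseModel, Field


class GEProfilerConfig(BaseModel):
    method: Literal["ge"] = "ge"
    max_workers: int = 10


class AnalyzeProfilerConfig(BaseModel):
    method: Literal["analyze"] = "analyze"
    call_analyze: bool = True


class SourceConfig(BaseModel):
    # pydantic selects the member of the Union based on the "method" field
    profiling: Union[GEProfilerConfig, AnalyzeProfilerConfig] = Field(
        default=GEProfilerConfig(),
        discriminator="method",
    )


config = SourceConfig.parse_obj({"profiling": {"method": "analyze"}})
assert isinstance(config.profiling, AnalyzeProfilerConfig)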

datahub/ingestion/source/unity/proxy.py
CHANGED

@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):

     @staticmethod
     def _create_metastore(
-        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
     ) -> Optional[Metastore]:
         if not obj.name:
             return None

datahub/ingestion/source/unity/source.py
CHANGED

@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[
-            Tuple[TableReference, str]
-        ] = FileBackedDict()
+        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+            FileBackedDict()
+        )

         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()

datahub/ingestion/source/unity/usage.py
CHANGED

@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
                 query, table_info
             )
             for source_table in table_info.source_tables:
-                with
+                with (
+                    self.report.usage_perf_report.aggregator_add_event_timer
+                ):
                     self.usage_aggregator.aggregate_event(
                         resource=source_table,
                         start_time=query.start_time,

datahub/ingestion/source/usage/clickhouse_usage.py
CHANGED

@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[ClickHouseJoinedAccessEvent]
     ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[ClickHouseTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )

         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)

             resource = (
-                f
+                f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                 f"{event.database}.{event.table}"
             )

datahub/ingestion/source/usage/starburst_trino_usage.py
CHANGED

@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[TrinoJoinedAccessEvent]
     ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[TrinoTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )

         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)

datahub/ingestion/source/usage/usage_common.py
CHANGED

@@ -89,7 +89,7 @@ def make_usage_workunit(
     top_sql_queries: Optional[List[str]] = None
     if query_freq is not None:
         if top_n_queries < len(query_freq):
-            logger.
+            logger.warning(
                 f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
             )
             query_freq = query_freq[0:top_n_queries]

datahub/ingestion/transformer/add_dataset_dataproduct.py
CHANGED

@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
                 ).add_asset(container_urn)
                 data_products_container[data_product_urn] = container_product
             else:
-                data_products_container[
-                    data_product_urn
-                ] = data_products_container[data_product_urn].add_asset(
-                    container_urn
+                data_products_container[data_product_urn] = (
+                    data_products_container[data_product_urn].add_asset(
+                        container_urn
+                    )
                 )

         mcps: List[

datahub/ingestion/transformer/add_dataset_properties.py
CHANGED

@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
     ) -> Optional[DatasetPropertiesClass]:
         assert dataset_properties_aspect

-        server_dataset_properties_aspect: Optional[
-            DatasetPropertiesClass
-        ] = graph.get_dataset_properties(entity_urn)
+        server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+            graph.get_dataset_properties(entity_urn)
+        )
         # No need to take any action if server properties is None or there is not customProperties in server properties
         if (
             server_dataset_properties_aspect is None

datahub/ingestion/transformer/add_dataset_schema_tags.py
CHANGED

@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
         server_field_map: dict = {}
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect

datahub/ingestion/transformer/add_dataset_schema_terms.py
CHANGED

@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
         ] = {}  # Map to cache server field objects, where fieldPath is key
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect

datahub/ingestion/transformer/dataset_domain_based_on_tags.py
CHANGED

@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
             domain_aspect.domains.extend(mapped_domains.domains)
             if self.config.semantics == TransformerSemantics.PATCH:
                 # Try merging with server-side domains
-                patch_domain_aspect: Optional[
-                    DomainsClass
-                ] = AddDatasetDomain._merge_with_server_domains(
-                    self.ctx.graph, entity_urn, domain_aspect
+                patch_domain_aspect: Optional[DomainsClass] = (
+                    AddDatasetDomain._merge_with_server_domains(
+                        self.ctx.graph, entity_urn, domain_aspect
+                    )
                 )
                 return cast(Optional[Aspect], patch_domain_aspect)
             return cast(Optional[Aspect], domain_aspect)

datahub/ingestion/transformer/extract_ownership_from_tags.py
CHANGED

@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
             else:
                 owner_type = get_owner_type(self.config.owner_type)
                 if owner_type == OwnershipTypeClass.CUSTOM:
-                    assert (
-                        self.config.owner_type_urn is not None
-                    ), "owner_type_urn must be set if owner_type is CUSTOM"
+                    assert self.config.owner_type_urn is not None, (
+                        "owner_type_urn must be set if owner_type is CUSTOM"
+                    )

             owners.append(
                 OwnerClass(

datahub/ingestion/transformer/tags_to_terms.py
CHANGED

@@ -92,9 +92,9 @@ class TagsToTermMapper(TagsToTermTransformer):
         in_global_tags_aspect: Optional[GlobalTagsClass] = self.ctx.graph.get_tags(
             entity_urn
         )
-        in_schema_metadata_aspect: Optional[
-            SchemaMetadataClass
-        ] = self.ctx.graph.get_schema_metadata(entity_urn)
+        in_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+            self.ctx.graph.get_schema_metadata(entity_urn)
+        )

         if in_global_tags_aspect is None and in_schema_metadata_aspect is None:
             return cast(Aspect, in_glossary_terms)

@@ -134,10 +134,10 @@ class TagsToTermMapper(TagsToTermTransformer):
         )

         if self.config.semantics == TransformerSemantics.PATCH:
-            patch_glossary_terms: Optional[
-                GlossaryTermsClass
-            ] = TagsToTermMapper._merge_with_server_glossary_terms(
-                self.ctx.graph, entity_urn, out_glossary_terms
+            patch_glossary_terms: Optional[GlossaryTermsClass] = (
+                TagsToTermMapper._merge_with_server_glossary_terms(
+                    self.ctx.graph, entity_urn, out_glossary_terms
+                )
             )
             return cast(Optional[Aspect], patch_glossary_terms)
         else:

datahub/integrations/assertion/snowflake/compiler.py
CHANGED

@@ -61,17 +61,17 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
     def create(
         cls, output_dir: str, extras: Dict[str, str]
     ) -> "SnowflakeAssertionCompiler":
-        assert os.path.exists(
-            output_dir
-        ), f"Specified location {output_dir} does not exist."
+        assert os.path.exists(output_dir), (
+            f"Specified location {output_dir} does not exist."
+        )

-        assert os.path.isdir(
-            output_dir
-        ), f"Specified location {output_dir} is not a folder."
+        assert os.path.isdir(output_dir), (
+            f"Specified location {output_dir} is not a folder."
+        )

-        assert any(
-            x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras
-        ), "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        assert any(x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras), (
+            "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
+        )

         return SnowflakeAssertionCompiler(output_dir, extras)

@@ -232,6 +232,6 @@ def get_dmf_schedule(trigger: AssertionTrigger) -> str:
     elif isinstance(trigger.trigger, CronTrigger):
         return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}"
     elif isinstance(trigger.trigger, IntervalTrigger):
-        return f"{trigger.trigger.interval.seconds/60} MIN"
+        return f"{trigger.trigger.interval.seconds / 60} MIN"
     else:
         raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}")
datahub/lite/duckdb_lite.py
CHANGED

@@ -163,9 +163,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):

         if "properties" not in writeable_dict["systemMetadata"]:
             writeable_dict["systemMetadata"]["properties"] = {}
-        writeable_dict["systemMetadata"]["properties"][
-            "sysVersion"
-        ] = new_version
+        writeable_dict["systemMetadata"]["properties"]["sysVersion"] = (
+            new_version
+        )
         if needs_write:
             self.duckdb_client.execute(
                 query="INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?, ?, ?)",

@@ -208,9 +208,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 "lastObserved": writeable.systemMetadata.lastObserved
             }
         else:
-            system_metadata[
-                "lastObserved"
-            ] = writeable.systemMetadata.lastObserved
+            system_metadata["lastObserved"] = (
+                writeable.systemMetadata.lastObserved
+            )
         self.duckdb_client.execute(
             query="UPDATE metadata_aspect_v2 SET system_metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0",
             parameters=[

@@ -497,9 +497,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             aspect_name = r[1]
             aspect_payload = json.loads(r[2])
             if typed:
-                assert (
-                    aspect_name in ASPECT_MAP
-                ), f"Missing aspect name {aspect_name} in the registry"
+                assert aspect_name in ASPECT_MAP, (
+                    f"Missing aspect name {aspect_name} in the registry"
+                )
                 try:
                     aspect_payload = ASPECT_MAP[aspect_name].from_obj(
                         post_json_transform(aspect_payload)

@@ -531,7 +531,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         for r in results.fetchall():
            urn = r[0]
            aspect_name = r[1]
-            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(
+            aspect_metadata = ASPECT_MAP[aspect_name].from_obj(
+                post_json_transform(json.loads(r[2]))
+            )  # type: ignore
             system_metadata = SystemMetadataClass.from_obj(json.loads(r[3]))
             mcp = MetadataChangeProposalWrapper(
                 entityUrn=urn,
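
The hunks above write to the metadata_aspect_v2 table through parameterized statements: the SQL keeps ? placeholders and the JSON payloads are bound separately via the parameters argument. A minimal standalone sketch of that pattern with the duckdb Python API (the table layout here is simplified and is not DataHub Lite's actual schema):

import json

import duckdb

con = duckdb.connect(":memory:")
con.execute(
    "CREATE TABLE metadata_aspect_v2 "
    "(urn VARCHAR, aspect_name VARCHAR, version BIGINT, metadata VARCHAR)"
)

# Insert a row; the values never get spliced into the SQL string itself.
con.execute(
    "INSERT INTO metadata_aspect_v2 VALUES (?, ?, ?, ?)",
    ["urn:li:dataset:example", "status", 0, json.dumps({"removed": False})],
)

# Update the same row, again binding values through parameters.
con.execute(
    "UPDATE metadata_aspect_v2 SET metadata = ? WHERE urn = ? AND aspect_name = ? AND version = 0",
    [json.dumps({"removed": True}), "urn:li:dataset:example", "status"],
)

print(con.execute("SELECT * FROM metadata_aspect_v2").fetchall())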

datahub/metadata/_schema_classes.py
CHANGED

@@ -9096,7 +9096,7 @@ class DataProcessInstanceInputClass(_Aspect):

     @property
     def inputs(self) -> List[str]:
-        """Input
+        """Input assets consumed"""
         return self._inner_dict.get('inputs')  # type: ignore

     @inputs.setter
datahub/metadata/schema.avsc
CHANGED

@@ -12699,8 +12699,10 @@
           "Relationship": {
             "/*": {
               "entityTypes": [
-                "dataset"
+                "dataset",
+                "mlModel"
               ],
+              "isLineage": true,
               "name": "Consumes"
             }
           },

@@ -12720,7 +12722,7 @@
             "items": "string"
           },
           "name": "inputs",
-          "doc": "Input
+          "doc": "Input assets consumed"
         }
       ],
       "doc": "Information about the inputs datasets of a Data process"

@@ -12883,6 +12885,8 @@
                 "dataset",
                 "mlModel"
               ],
+              "isLineage": true,
+              "isUpstream": false,
               "name": "Produces"
             }
           },

datahub/metadata/schemas/DataProcessInstanceInput.avsc
CHANGED

@@ -10,8 +10,10 @@
         "Relationship": {
           "/*": {
             "entityTypes": [
-              "dataset"
+              "dataset",
+              "mlModel"
             ],
+            "isLineage": true,
             "name": "Consumes"
           }
         },

@@ -29,7 +31,7 @@
           "items": "string"
         },
         "name": "inputs",
-        "doc": "Input
+        "doc": "Input assets consumed",
         "Urn": "Urn",
         "urn_is_array": true
       }
datahub/secret/secret_common.py
CHANGED

@@ -2,10 +2,7 @@ import json
 import logging
 from typing import List

-from datahub.configuration.config_loader import (
-    list_referenced_env_variables,
-    resolve_env_variables,
-)
+from datahub.configuration.config_loader import EnvResolver
 from datahub.secret.secret_store import SecretStore

 logger = logging.getLogger(__name__)

@@ -42,18 +39,27 @@ def resolve_secrets(secret_names: List[str], secret_stores: List[SecretStore]) -
     return final_secret_values


-def resolve_recipe(recipe: str, secret_stores: List[SecretStore]) -> dict:
+def resolve_recipe(
+    recipe: str, secret_stores: List[SecretStore], strict_env_syntax: bool = True
+) -> dict:
+    # Note: the default for `strict_env_syntax` is normally False, but here we override
+    # it to be true. Particularly when fetching secrets from external secret stores, we
+    # want to be more careful about not over-fetching secrets.
+
     json_recipe_raw = json.loads(recipe)

     # 1. Extract all secrets needing resolved.
-    secrets_to_resolve = list_referenced_env_variables(json_recipe_raw)
+    secrets_to_resolve = EnvResolver.list_referenced_variables(
+        json_recipe_raw, strict_env_syntax=strict_env_syntax
+    )

     # 2. Resolve secret values
     secret_values_dict = resolve_secrets(list(secrets_to_resolve), secret_stores)

     # 3. Substitute secrets into recipe file
-    json_recipe_resolved = resolve_env_variables(
-        json_recipe_raw, environ=secret_values_dict
+    resolver = EnvResolver(
+        environ=secret_values_dict, strict_env_syntax=strict_env_syntax
     )
+    json_recipe_resolved = resolver.resolve(json_recipe_raw)

     return json_recipe_resolved
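
The rewritten resolve_recipe above follows a three-step flow: list the variables referenced in the recipe, fetch only those from the configured secret stores, and substitute the resolved values back into the recipe. A small standalone sketch of that flow (the helpers below are simplified stand-ins, not the actual EnvResolver API):

import json
import re
from typing import Dict, Set

_VAR_PATTERN = re.compile(r"\$\{(\w+)\}")  # strict syntax: only ${VAR} references count


def list_referenced_variables(obj: object) -> Set[str]:
    """Walk a parsed recipe and collect every ${VAR} reference."""
    if isinstance(obj, dict):
        return {v for item in obj.values() for v in list_referenced_variables(item)}
    if isinstance(obj, list):
        return {v for item in obj for v in list_referenced_variables(item)}
    if isinstance(obj, str):
        return set(_VAR_PATTERN.findall(obj))
    return set()


def resolve(obj: object, environ: Dict[str, str]) -> object:
    """Substitute ${VAR} occurrences using the resolved secret values."""
    if isinstance(obj, dict):
        return {k: resolve(v, environ) for k, v in obj.items()}
    if isinstance(obj, list):
        return [resolve(item, environ) for item in obj]
    if isinstance(obj, str):
        return _VAR_PATTERN.sub(lambda m: environ.get(m.group(1), m.group(0)), obj)
    return obj


recipe = json.loads('{"sink": {"config": {"token": "${DATAHUB_TOKEN}"}}}')
needed = list_referenced_variables(recipe)          # {"DATAHUB_TOKEN"}
secrets = {name: "dummy-value" for name in needed}  # stand-in for the secret stores
print(resolve(recipe, secrets))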

datahub/specific/aspect_helpers/custom_properties.py
CHANGED

@@ -9,8 +9,7 @@ from datahub.emitter.mcp_patch_builder import MetadataPatchProposal, PatchPath
 class HasCustomPropertiesPatch(MetadataPatchProposal):
     @classmethod
     @abstractmethod
-    def _custom_properties_location(self) -> Tuple[str, PatchPath]:
-        ...
+    def _custom_properties_location(self) -> Tuple[str, PatchPath]: ...

     def add_custom_property(self, key: str, value: str) -> Self:
         """Add a custom property to the entity.