acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two published versions.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/METADATA +2461 -2463
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/RECORD +161 -161
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/delete_cli.py +16 -2
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/config.py +4 -0
- datahub/ingestion/source/fivetran/fivetran.py +15 -5
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +5 -3
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/query.py +77 -47
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/clickhouse.py

```diff
@@ -218,9 +218,7 @@ def _get_all_table_comments_and_properties(self, connection, **kw):
              , comment
              , {properties_clause} AS properties
           FROM system.tables
-         WHERE name NOT LIKE '.inner%'""".format(
-            properties_clause=properties_clause
-        )
+         WHERE name NOT LIKE '.inner%'""".format(properties_clause=properties_clause)
     )
 
     all_table_comments: Dict[Tuple[str, str], Dict[str, Any]] = {}
@@ -268,7 +266,7 @@ def _get_table_or_view_names(self, relkind, connection, schema=None, **kw):
     info_cache = kw.get("info_cache")
     all_relations = self._get_all_relation_info(connection, info_cache=info_cache)
     relation_names = []
-    for
+    for _, relation in all_relations.items():
         if relation.database == schema and relation.relkind == relkind:
             relation_names.append(relation.relname)
     return relation_names
@@ -301,9 +299,7 @@ def _get_schema_column_info(self, connection, schema=None, **kw):
              , comment
           FROM system.columns
          WHERE {schema_clause}
-         ORDER BY database, table, position""".format(
-            schema_clause=schema_clause
-        )
+         ORDER BY database, table, position""".format(schema_clause=schema_clause)
         )
     )
 )
@@ -474,7 +470,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
         for db_row in engine.execute(text(all_tables_query)):
-            all_tables_set.add(f
+            all_tables_set.add(f"{db_row['database']}.{db_row['table_name']}")
 
         return all_tables_set
 
@@ -503,7 +499,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
         try:
             for db_row in engine.execute(text(query)):
-                dataset_name = f
+                dataset_name = f"{db_row['target_schema']}.{db_row['target_table']}"
                 if not self.config.database_pattern.allowed(
                     db_row["target_schema"]
                 ) or not self.config.table_pattern.allowed(dataset_name):
@@ -512,7 +508,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
                 # Target
                 target_path = (
-                    f
+                    f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                     f"{dataset_name}"
                 )
                 target = LineageItem(
@@ -525,7 +521,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
                 # Source
                 platform = LineageDatasetPlatform.CLICKHOUSE
-                path = f
+                path = f"{db_row['source_schema']}.{db_row['source_table']}"
 
                 sources = [
                     LineageDataset(
@@ -552,9 +548,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
                         target.dataset.path
                     ].upstreams = self._lineage_map[
                         target.dataset.path
-                    ].upstreams.union(
-                        target.upstreams
-                    )
+                    ].upstreams.union(target.upstreams)
 
                 else:
                     self._lineage_map[target.dataset.path] = target
```
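The restored f-strings above also show the naming convention these sources use: an optional `platform_instance` prefix folded into the dataset path via a conditional expression inside the f-string. A runnable sketch of just that rule (plain variables stand in for the config object):

```python
platform_instance = "prod"  # or None when no platform instance is configured
dataset_name = "analytics.events"

# Prefix the dataset name with "<platform_instance>." only when an instance
# is configured, mirroring the target_path construction in the hunks above.
target_path = (
    f"{platform_instance + '.' if platform_instance else ''}"
    f"{dataset_name}"
)
print(target_path)  # prod.analytics.events
```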
datahub/ingestion/source/sql/oracle.py

```diff
@@ -234,9 +234,7 @@ class OracleInspectorObjectWrapper:
             WHERE col.table_name = id.table_name
             AND col.column_name = id.column_name
             AND col.owner = id.owner
-            ) AS identity_options""".format(
-                dblink=dblink
-            )
+            ) AS identity_options""".format(dblink=dblink)
         else:
             identity_cols = "NULL as default_on_null, NULL as identity_options"
 
```
datahub/ingestion/source/sql/sql_generic_profiler.py

```diff
@@ -278,8 +278,7 @@ class GenericProfiler:
 
         if self.config.profiling.profile_table_size_limit is not None and (
             size_in_bytes is not None
-            and size_in_bytes / (2**30)
-            > self.config.profiling.profile_table_size_limit
+            and size_in_bytes / (2**30) > self.config.profiling.profile_table_size_limit
         ):
             self.report.profiling_skipped_size_limit[schema_name] += 1
             logger.debug(
```
datahub/ingestion/source/sql/teradata.py

```diff
@@ -599,7 +599,12 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_columns",
-            lambda self,
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            use_qvci=self.config.use_qvci,
+            **kw: optimized_get_columns(
                 self,
                 connection,
                 table_name,
@@ -613,7 +618,11 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_pk_constraint",
-            lambda self,
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            **kw: optimized_get_pk_constraint(
                 self, connection, table_name, schema, **kw
             ),
         )
@@ -621,7 +630,11 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_foreign_keys",
-            lambda self,
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            **kw: optimized_get_foreign_keys(
                 self, connection, table_name, schema, **kw
             ),
         )
```
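These Teradata hunks reformat the monkey-patching lambdas so each parameter sits on its own line; note that `use_qvci=self.config.use_qvci` is a lambda default, which freezes the config value at the moment the patch is installed. A minimal sketch of that binding behavior (the `Dialect` class and `optimized_get_columns` below are illustrative stand-ins, not the real SQLAlchemy objects):

```python
class Dialect:
    def get_columns(self, connection, table_name, schema=None, **kw):
        return f"slow lookup of {table_name}"


def optimized_get_columns(
    self, connection, table_name, schema=None, use_qvci=False, **kw
):
    # Stand-in for the optimized implementation.
    return f"fast lookup of {table_name} (use_qvci={use_qvci})"


use_qvci_from_config = True  # e.g. self.config.use_qvci at patch time

# The default argument is evaluated once, when the lambda is created, so the
# patched method keeps this value even if the config object changes later.
setattr(
    Dialect,
    "get_columns",
    lambda self, connection, table_name, schema=None, use_qvci=use_qvci_from_config, **kw: optimized_get_columns(
        self, connection, table_name, schema, use_qvci=use_qvci, **kw
    ),
)

print(Dialect().get_columns(None, "orders"))  # fast lookup of orders (use_qvci=True)
```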
datahub/ingestion/source/state/profiling_state_handler.py

```diff
@@ -41,9 +41,9 @@ class ProfilingHandler(StatefulIngestionUsecaseHandlerBase[ProfilingCheckpointSt
         run_id: str,
     ):
         self.state_provider = source.state_provider
-        self.stateful_ingestion_config: Optional[
-            ProfilingStatefulIngestionConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[ProfilingStatefulIngestionConfig] = (
+            config.stateful_ingestion
+        )
         self.pipeline_name = pipeline_name
         self.run_id = run_id
         self.checkpointing_enabled: bool = (
```
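This hunk is the first of many below with the same shape: a long annotated assignment that used to break inside the type subscript is now kept on one line, with the right-hand side wrapped in parentheses instead. This matches the style newer Black/Ruff formatter releases emit, which the pure-whitespace nature of these changes suggests is the driver of most of this release. A before/after sketch with a dummy type:

```python
from typing import Optional


class Config:  # minimal stand-in for the real ingestion config
    stateful_ingestion: Optional[str] = "enabled"


config = Config()

# Old style: the subscript on the annotation is split across lines.
stateful_ingestion_config: Optional[
    str
] = config.stateful_ingestion

# New style: the annotation stays intact and the value is parenthesized.
stateful_ingestion_config: Optional[str] = (
    config.stateful_ingestion
)
```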
datahub/ingestion/source/state/redundant_run_skip_handler.py

```diff
@@ -48,9 +48,9 @@ class RedundantRunSkipHandler(
     ):
         self.source = source
         self.state_provider = source.state_provider
-        self.stateful_ingestion_config: Optional[
-            StatefulIngestionConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[StatefulIngestionConfig] = (
+            config.stateful_ingestion
+        )
         self.pipeline_name = pipeline_name
         self.run_id = run_id
         self._job_id = self._init_job_id()
@@ -145,8 +145,7 @@ class RedundantRunSkipHandler(
         )
 
         logger.debug(
-            f"{self.job_id} : Last run start, end times:"
-            f"({last_run_time_window})"
+            f"{self.job_id} : Last run start, end times:({last_run_time_window})"
         )
 
         # If current run's time window is subset of last run's time window, then skip.
@@ -212,8 +211,7 @@ class RedundantRunSkipHandler(
         )
 
         self.log(
-            "Adjusted start, end times: "
-            f"({suggested_start_time}, {suggested_end_time})"
+            f"Adjusted start, end times: ({suggested_start_time}, {suggested_end_time})"
         )
         return (suggested_start_time, suggested_end_time)
 
```
datahub/ingestion/source/state/stale_entity_removal_handler.py

```diff
@@ -111,9 +111,9 @@ class StaleEntityRemovalHandler(
         self.state_type_class = state_type_class
         self.pipeline_name = pipeline_name
         self.run_id = run_id
-        self.stateful_ingestion_config: Optional[
-            StatefulStaleMetadataRemovalConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = (
+            config.stateful_ingestion
+        )
         self.checkpointing_enabled: bool = (
             True
             if (
```
datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py

```diff
@@ -70,20 +70,20 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
             self.orchestrator_name, pipeline_name, job_name
         )
 
-        latest_checkpoint: Optional[
-            DatahubIngestionCheckpointClass
-        ] = self.graph.get_latest_timeseries_value(
-            entity_urn=data_job_urn,
-            aspect_type=DatahubIngestionCheckpointClass,
-            filter_criteria_map={
-                "pipelineName": pipeline_name,
-            },
+        latest_checkpoint: Optional[DatahubIngestionCheckpointClass] = (
+            self.graph.get_latest_timeseries_value(
+                entity_urn=data_job_urn,
+                aspect_type=DatahubIngestionCheckpointClass,
+                filter_criteria_map={
+                    "pipelineName": pipeline_name,
+                },
+            )
         )
         if latest_checkpoint:
             logger.debug(
                 f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
                 f" job_name:'{job_name}' found with start_time:"
-                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
+                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
             )
             return latest_checkpoint
         else:
```
datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py

```diff
@@ -67,7 +67,7 @@ class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
             logger.debug(
                 f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
                 f" job_name:'{job_name}' found with start_time:"
-                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
+                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
             )
             return latest_checkpoint
         else:
```
datahub/ingestion/source/tableau/tableau.py

```diff
@@ -281,9 +281,9 @@ class TableauConnectionConfig(ConfigModel):
         return authentication
 
     def make_tableau_client(self, site: str) -> Server:
-        authentication: Union[
-            TableauAuth, PersonalAccessTokenAuth
-        ] = self.get_tableau_auth(site)
+        authentication: Union[TableauAuth, PersonalAccessTokenAuth] = (
+            self.get_tableau_auth(site)
+        )
         try:
             server = Server(
                 self.connect_uri,
@@ -635,7 +635,7 @@ class TableauConfig(
         project_path_pattern = values.get("project_path_pattern")
         if project_pattern is None and project_path_pattern is None and projects:
             logger.warning(
-                "projects is deprecated, please use
+                "projects is deprecated, please use project_path_pattern instead."
             )
             logger.info("Initializing project_pattern from projects")
             values["project_pattern"] = AllowDenyPattern(
@@ -708,18 +708,18 @@ class DatabaseTable:
     """
 
     urn: str
-    id: Optional[
-        str
-    ] = None  # is not None only for tables that came from Tableau metadata
+    id: Optional[str] = (
+        None  # is not None only for tables that came from Tableau metadata
+    )
     num_cols: Optional[int] = None
 
-    paths: Optional[
-        Set[str]
-    ] = None  # maintains all browse paths encountered for this table
+    paths: Optional[Set[str]] = (
+        None  # maintains all browse paths encountered for this table
+    )
 
-    parsed_columns: Optional[
-        Set[str]
-    ] = None  # maintains all columns encountered for this table during parsing SQL queries
+    parsed_columns: Optional[Set[str]] = (
+        None  # maintains all columns encountered for this table during parsing SQL queries
+    )
 
     def update_table(
         self,
@@ -2310,8 +2310,7 @@ class TableauSiteSource:
                 c.EMBEDDED_DATA_SOURCE,
             ):
                 logger.debug(
-                    f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is "
-                    f"unsupported"
+                    f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is unsupported"
                 )
                 return None
 
@@ -2493,9 +2492,9 @@ class TableauSiteSource:
     def _enrich_database_tables_with_parsed_schemas(
         self, parsing_result: SqlParsingResult
     ) -> None:
-        in_tables_schemas: Dict[
-            str, Set[str]
-        ] = transform_parsing_result_to_in_tables_schemas(parsing_result)
+        in_tables_schemas: Dict[str, Set[str]] = (
+            transform_parsing_result_to_in_tables_schemas(parsing_result)
+        )
 
         if not in_tables_schemas:
             logger.info("Unable to extract table schema from parsing result")
@@ -3559,25 +3558,25 @@ class TableauSiteSource:
 
         generated_project_keys.add(project_key.guid())
 
-        parent_project_key: Optional[
-            Union[ProjectKey, SiteKey]
-        ] = None  # It is going
+        parent_project_key: Optional[Union[ProjectKey, SiteKey]] = (
+            None  # It is going
+        )
         # to be used as a parent container key for the current tableau project
 
         if project_.parent_id is not None:
             # Go to the parent project as we need to generate container first for parent
             parent_project_key = self.gen_project_key(project_.parent_id)
 
-            parent_tableau_project: Optional[
-                TableauProject
-            ] = self.tableau_project_registry.get(project_.parent_id)
+            parent_tableau_project: Optional[TableauProject] = (
+                self.tableau_project_registry.get(project_.parent_id)
+            )
 
             if (
                 parent_tableau_project is None
             ):  # It is not in project registry because of project_pattern
-                assert (
-                    project_.
-                )
+                assert project_.parent_name, (
+                    f"project {project_.name} should not be null"
+                )
                 parent_tableau_project = TableauProject(
                     id=project_.parent_id,
                     name=project_.parent_name,
```
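The assert rewrite in the hunk above moves the parentheses from the condition to the message, which is how newer formatters lay out long asserts. It also avoids ever nudging a comma inside the condition's parentheses, the classic always-true `assert (cond, msg)` tuple bug. A small sketch with illustrative values (not the real Tableau project objects):

```python
parent_name = "Analytics"

# Older layout: the condition is wrapped in parentheses.
assert (
    parent_name is not None
), "project parent name should not be null"

# Newer layout: bare condition, parenthesized message.
assert parent_name is not None, (
    "project parent name should not be null"
)
```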
datahub/ingestion/source/tableau/tableau.py

```diff
@@ -3605,7 +3604,7 @@ class TableauSiteSource:
             parent_container_key=parent_project_key,
         )
 
-        for
+        for project in self.tableau_project_registry.values():
             logger.debug(
                 f"project {project.name} and it's parent {project.parent_name} and parent id {project.parent_id}"
             )
@@ -3669,16 +3668,16 @@ class TableauSiteSource:
         if self.config.extract_usage_stats:
             with PerfTimer() as timer:
                 self._populate_usage_stat_registry()
-                self.report.extract_usage_stats_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.extract_usage_stats_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
 
         if self.config.permission_ingestion:
             with PerfTimer() as timer:
                 self._fetch_groups()
-                self.report.fetch_groups_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.fetch_groups_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
 
         # Populate the map of database names and database hostnames to be used later to map
         # databases to platform instances.
@@ -3691,9 +3690,9 @@ class TableauSiteSource:
 
         with PerfTimer() as timer:
             self._populate_projects_registry()
-            self.report.populate_projects_registry_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.populate_projects_registry_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )
 
         if self.config.add_site_container:
             yield from self.emit_site_container()
@@ -3701,23 +3700,23 @@ class TableauSiteSource:
 
         with PerfTimer() as timer:
             yield from self.emit_workbooks()
-            self.report.emit_workbooks_timer[
-                self.site_content_url
-            ] = timer.elapsed_seconds(digits=2)
+            self.report.emit_workbooks_timer[self.site_content_url] = (
+                timer.elapsed_seconds(digits=2)
+            )
 
         if self.sheet_ids:
             with PerfTimer() as timer:
                 yield from self.emit_sheets()
-                self.report.emit_sheets_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_sheets_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
 
         if self.dashboard_ids:
             with PerfTimer() as timer:
                 yield from self.emit_dashboards()
-                self.report.emit_dashboards_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_dashboards_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
 
         if self.embedded_datasource_ids_being_used:
             with PerfTimer() as timer:
@@ -3743,6 +3742,6 @@ class TableauSiteSource:
         if self.database_tables:
             with PerfTimer() as timer:
                 yield from self.emit_upstream_tables()
-                self.report.emit_upstream_tables_timer[
-                    self.site_content_url
-                ] = timer.elapsed_seconds(digits=2)
+                self.report.emit_upstream_tables_timer[self.site_content_url] = (
+                    timer.elapsed_seconds(digits=2)
+                )
```
datahub/ingestion/source/unity/config.py

```diff
@@ -254,7 +254,9 @@ class UnityCatalogSourceConfig(
     )
 
     # TODO: Remove `type:ignore` by refactoring config
-    profiling: Union[
+    profiling: Union[
+        UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig
+    ] = Field(  # type: ignore
         default=UnityCatalogGEProfilerConfig(),
         description="Data profiling configuration",
         discriminator="method",
```
datahub/ingestion/source/unity/proxy.py

```diff
@@ -363,7 +363,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
 
     @staticmethod
     def _create_metastore(
-        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo]
+        obj: Union[GetMetastoreSummaryResponse, MetastoreInfo],
     ) -> Optional[Metastore]:
         if not obj.name:
             return None
```
datahub/ingestion/source/unity/source.py

```diff
@@ -205,9 +205,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[
-            Tuple[TableReference, str]
-        ] = FileBackedDict()
+        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
+            FileBackedDict()
+        )
 
         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
```
datahub/ingestion/source/unity/usage.py

```diff
@@ -103,7 +103,9 @@ class UnityCatalogUsageExtractor:
                 query, table_info
             )
             for source_table in table_info.source_tables:
-                with
+                with (
+                    self.report.usage_perf_report.aggregator_add_event_timer
+                ):
                     self.usage_aggregator.aggregate_event(
                         resource=source_table,
                         start_time=query.start_time,
```
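The `with (...)` form above is the formatter parenthesizing a long context-manager expression so it can wrap without a backslash continuation. A minimal runnable sketch, using a hypothetical timer since the real `usage_perf_report` type is not shown in this diff:

```python
from contextlib import contextmanager


@contextmanager
def aggregator_add_event_timer():  # hypothetical stand-in for the report's timer
    print("timer start")
    yield
    print("timer stop")


# Parenthesizing the (single) context-manager expression lets the formatter
# break the line cleanly.
with (
    aggregator_add_event_timer()
):
    print("aggregate one usage event")
```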
datahub/ingestion/source/usage/clickhouse_usage.py

```diff
@@ -213,15 +213,15 @@ class ClickHouseUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[ClickHouseJoinedAccessEvent]
     ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[ClickHouseTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
 
             resource = (
-                f
+                f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                 f"{event.database}.{event.table}"
             )
 
```
datahub/ingestion/source/usage/starburst_trino_usage.py

```diff
@@ -235,9 +235,9 @@ class TrinoUsageSource(Source):
     def _aggregate_access_events(
         self, events: List[TrinoJoinedAccessEvent]
     ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
-        datasets: Dict[
-            datetime, Dict[TrinoTableRef, AggregatedDataset]
-        ] = collections.defaultdict(dict)
+        datasets: Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]] = (
+            collections.defaultdict(dict)
+        )
 
         for event in events:
             floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
```
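Both usage sources aggregate access events into a nested mapping keyed by a floored timestamp, built on `collections.defaultdict(dict)`. A condensed sketch of that bucketing shape, with plain strings and counts standing in for the table-ref and `AggregatedDataset` types:

```python
import collections
from datetime import datetime, timedelta
from typing import Dict


def get_time_bucket(ts: datetime, bucket: timedelta) -> datetime:
    # Simplified stand-in: floor a timestamp to the start of its bucket.
    return datetime.min + ((ts - datetime.min) // bucket) * bucket


datasets: Dict[datetime, Dict[str, int]] = collections.defaultdict(dict)

events = [
    (datetime(2024, 1, 1, 10, 15), "db.orders"),
    (datetime(2024, 1, 1, 10, 45), "db.orders"),
    (datetime(2024, 1, 1, 11, 5), "db.users"),
]
for ts, table in events:
    floored_ts = get_time_bucket(ts, timedelta(hours=1))
    datasets[floored_ts][table] = datasets[floored_ts].get(table, 0) + 1

print(dict(datasets))
# {datetime(2024, 1, 1, 10, 0): {'db.orders': 2},
#  datetime(2024, 1, 1, 11, 0): {'db.users': 1}}
```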
datahub/ingestion/source/usage/usage_common.py

```diff
@@ -89,7 +89,7 @@ def make_usage_workunit(
     top_sql_queries: Optional[List[str]] = None
     if query_freq is not None:
         if top_n_queries < len(query_freq):
-            logger.
+            logger.warning(
                 f"Top N query limit exceeded on {str(resource)}. Max number of queries {top_n_queries} < {len(query_freq)}. Truncating top queries to {top_n_queries}."
             )
             query_freq = query_freq[0:top_n_queries]
```
datahub/ingestion/transformer/add_dataset_dataproduct.py

```diff
@@ -80,10 +80,10 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
                 ).add_asset(container_urn)
                 data_products_container[data_product_urn] = container_product
             else:
-                data_products_container[
-                    data_product_urn
-                ] = data_products_container[data_product_urn].add_asset(
-                    container_urn
+                data_products_container[data_product_urn] = (
+                    data_products_container[data_product_urn].add_asset(
+                        container_urn
+                    )
                 )
 
         mcps: List[
```
datahub/ingestion/transformer/add_dataset_properties.py

```diff
@@ -61,9 +61,9 @@ class AddDatasetProperties(DatasetPropertiesTransformer):
     ) -> Optional[DatasetPropertiesClass]:
         assert dataset_properties_aspect
 
-        server_dataset_properties_aspect: Optional[
-            DatasetPropertiesClass
-        ] = graph.get_dataset_properties(entity_urn)
+        server_dataset_properties_aspect: Optional[DatasetPropertiesClass] = (
+            graph.get_dataset_properties(entity_urn)
+        )
         # No need to take any action if server properties is None or there is not customProperties in server properties
         if (
             server_dataset_properties_aspect is None
```
datahub/ingestion/transformer/add_dataset_schema_tags.py

```diff
@@ -89,9 +89,9 @@ class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer):
         server_field_map: dict = {}
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect
```
datahub/ingestion/transformer/add_dataset_schema_terms.py

```diff
@@ -108,9 +108,9 @@ class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer):
         ] = {}  # Map to cache server field objects, where fieldPath is key
         if self.config.semantics == TransformerSemantics.PATCH:
             assert self.ctx.graph
-            server_schema_metadata_aspect: Optional[
-                SchemaMetadataClass
-            ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            server_schema_metadata_aspect: Optional[SchemaMetadataClass] = (
+                self.ctx.graph.get_schema_metadata(entity_urn=entity_urn)
+            )
             if server_schema_metadata_aspect is not None:
                 if not schema_metadata_aspect:
                     schema_metadata_aspect = server_schema_metadata_aspect
```
datahub/ingestion/transformer/dataset_domain_based_on_tags.py

```diff
@@ -60,10 +60,10 @@ class DatasetTagDomainMapper(DatasetDomainTransformer):
             domain_aspect.domains.extend(mapped_domains.domains)
             if self.config.semantics == TransformerSemantics.PATCH:
                 # Try merging with server-side domains
-                patch_domain_aspect: Optional[
-                    DomainsClass
-                ] = AddDatasetDomain._merge_with_server_domains(
-                    self.ctx.graph, entity_urn, domain_aspect
+                patch_domain_aspect: Optional[DomainsClass] = (
+                    AddDatasetDomain._merge_with_server_domains(
+                        self.ctx.graph, entity_urn, domain_aspect
+                    )
                 )
                 return cast(Optional[Aspect], patch_domain_aspect)
             return cast(Optional[Aspect], domain_aspect)
```
datahub/ingestion/transformer/extract_ownership_from_tags.py

```diff
@@ -141,9 +141,9 @@ class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer):
         else:
             owner_type = get_owner_type(self.config.owner_type)
             if owner_type == OwnershipTypeClass.CUSTOM:
-                assert (
-                    self.config.owner_type_urn is not None
-                )
+                assert self.config.owner_type_urn is not None, (
+                    "owner_type_urn must be set if owner_type is CUSTOM"
+                )
 
                 owners.append(
                     OwnerClass(
```
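Several transformer hunks above share one PATCH-semantics shape: fetch the server-side aspect through the graph client, merge it with the locally computed aspect, and emit the result. A condensed sketch of that flow; `fetch_server_aspect` and `merge` are hypothetical helpers, whereas the real transformers call `DataHubGraph` getters such as `get_schema_metadata` and `get_dataset_properties`:

```python
from typing import Dict, Optional


def fetch_server_aspect(entity_urn: str) -> Optional[Dict[str, str]]:
    # Hypothetical stand-in for a DataHubGraph lookup.
    return {"owner": "server-team"} if entity_urn.endswith("known") else None


def merge(server: Dict[str, str], local: Dict[str, str]) -> Dict[str, str]:
    # PATCH semantics: start from server values, overlay local edits.
    return {**server, **local}


def transform(entity_urn: str, local_aspect: Dict[str, str]) -> Dict[str, str]:
    server_aspect = fetch_server_aspect(entity_urn)
    if server_aspect is None:
        return local_aspect  # nothing on the server yet; emit local as-is
    return merge(server_aspect, local_aspect)


print(transform("urn:li:dataset:known", {"domain": "sales"}))
# {'owner': 'server-team', 'domain': 'sales'}
```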
|