acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2335 -2337
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +157 -157
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
@@ -477,9 +477,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                 upstream_dataset_urns
                 and dataset_urn not in self.dataset_upstream_urn_mapping
             ):
-                self.dataset_upstream_urn_mapping[
-                    dataset_urn
-                ] = upstream_dataset_urns
+                self.dataset_upstream_urn_mapping[dataset_urn] = (
+                    upstream_dataset_urns
+                )

             element_input_fields = [
                 InputFieldClass(
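Most of the hunks in this release are formatting-only: what appears to be a newer Black/Ruff layout that keeps a long subscripted assignment target on one line and parenthesizes the right-hand side instead. A minimal, hypothetical sketch of the before/after layout (the mapping, key, and value names below are illustrative, not from the source):

# Hypothetical illustration of the assignment reflow applied throughout this diff.
mapping = {}
key, value = "dataset_urn", ["upstream_urn_1", "upstream_urn_2"]

# Older layout: the subscript on the left-hand side is split across lines.
mapping[
    key
] = value

# Newer layout (as in the rc8 hunks): the right-hand side is parenthesized instead.
mapping[key] = (
    value
)

assert mapping[key] == value  # behaviour is identical; only the layout changes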
@@ -126,9 +126,9 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for workspace_dict in response_dict[Constant.ENTRIES]:
-                self.workspaces[
-                    workspace_dict[Constant.WORKSPACEID]
-                ] = Workspace.parse_obj(workspace_dict)
+                self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
+                    Workspace.parse_obj(workspace_dict)
+                )
             if response_dict[Constant.NEXTPAGE]:
                 url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
             else:
@@ -147,9 +147,9 @@ class SigmaAPI:
             response.raise_for_status()
             response_dict = response.json()
             for user_dict in response_dict[Constant.ENTRIES]:
-                users[
-                    user_dict[Constant.MEMBERID]
-                ] = f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+                users[user_dict[Constant.MEMBERID]] = (
+                    f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+                )
             if response_dict[Constant.NEXTPAGE]:
                 url = f"{members_url}&page={response_dict[Constant.NEXTPAGE]}"
             else:
@@ -327,10 +327,12 @@ class SigmaAPI:
             response.raise_for_status()
             for i, element_dict in enumerate(response.json()[Constant.ENTRIES]):
                 if not element_dict.get(Constant.NAME):
-                    element_dict[Constant.NAME] = f"Element {i+1} of Page '{page.name}'"
-                element_dict[
-                    Constant.URL
-                ] = f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+                    element_dict[Constant.NAME] = (
+                        f"Element {i + 1} of Page '{page.name}'"
+                    )
+                element_dict[Constant.URL] = (
+                    f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+                )
                 element = Element.parse_obj(element_dict)
                 if (
                     self.config.extract_lineage
@@ -384,18 +384,20 @@ class SnowflakeV2Config(
             assert all(
                 consumer.platform_instance != share_details.platform_instance
                 for consumer in share_details.consumers
-            ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+            ), (
+                "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+            )

             databases_included_in_share.append(shared_db)
             databases_created_from_share.extend(share_details.consumers)

         for db_from_share in databases_created_from_share:
-            assert (
-                db_from_share not in databases_included_in_share
-            ), "Database included in a share can not be present as consumer in any share."
-            assert (
-                databases_created_from_share.count(db_from_share) == 1
-            ), "Same database can not be present as consumer in more than one share."
+            assert db_from_share not in databases_included_in_share, (
+                "Database included in a share can not be present as consumer in any share."
+            )
+            assert databases_created_from_share.count(db_from_share) == 1, (
+                "Same database can not be present as consumer in more than one share."
+            )

         return shares

@@ -250,9 +250,9 @@ class SnowflakeConnectionConfig(ConfigModel):
         if self.private_key is not None:
             pkey_bytes = self.private_key.replace("\\n", "\n").encode()
         else:
-            assert (
-                self.private_key_path
-            ), "missing required private key path to read key from"
+            assert self.private_key_path, (
+                "missing required private key path to read key from"
+            )
             with open(self.private_key_path, "rb") as key:
                 pkey_bytes = key.read()

@@ -284,9 +284,9 @@ class SnowflakeConnectionConfig(ConfigModel):
             return self.options

     def get_oauth_connection(self) -> NativeSnowflakeConnection:
-        assert (
-            self.oauth_config
-        ), "oauth_config should be provided if using oauth based authentication"
+        assert self.oauth_config, (
+            "oauth_config should be provided if using oauth based authentication"
+        )
         generator = OAuthTokenGenerator(
             client_id=self.oauth_config.client_id,
             authority_url=self.oauth_config.authority_url,
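The assert statements get the same reflow treatment: the condition stays on the assert line and the message moves into parentheses on the following lines. A small sketch of the two layouts, using an illustrative value rather than the real config object:

# Hypothetical sketch of the assert reflow shown in the hunks above.
private_key_path = "/tmp/rsa_key.p8"  # illustrative value only

# Older layout: condition wrapped in parentheses, message trailing on the closing line.
assert (
    private_key_path
), "missing required private key path to read key from"

# Newer layout: condition inline, message wrapped in parentheses.
assert private_key_path, (
    "missing required private key path to read key from"
)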
@@ -623,7 +623,7 @@ fingerprinted_queries as (
         query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
         AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
         AND execution_status = 'SUCCESS'
-        AND {users_filter or
+        AND {users_filter or "TRUE"}
     )
     , deduplicated_queries as (
         SELECT
@@ -651,7 +651,7 @@ fingerprinted_queries as (
     WHERE
         query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
         AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND {users_filter or
+        AND {users_filter or "TRUE"}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
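Both query templates splice an optional user filter into the WHERE clause; when no filter is configured, the expression falls back to the literal TRUE so the predicate becomes a no-op. A minimal sketch of that fallback pattern (the helper function and filter string below are illustrative, not from the source):

# Hypothetical sketch of the optional-filter fallback used in the queries above.
def users_clause(users_filter: str = "") -> str:
    # When no filter is configured, collapse the predicate to a no-op TRUE.
    return "AND " + (users_filter or "TRUE")

assert users_clause("user_name NOT ILIKE 'SYSTEM%'") == "AND user_name NOT ILIKE 'SYSTEM%'"
assert users_clause() == "AND TRUE"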
@@ -142,9 +142,9 @@ class _SnowflakeTagCache:
         )

         # self._table_tags[<database_name>][<schema_name>][<table_name>] = list of tags applied to table
-        self._table_tags: Dict[
-            str, Dict[str, Dict[str, List[SnowflakeTag]]]
-        ] = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+        self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+        )

         # self._column_tags[<database_name>][<schema_name>][<table_name>][<column_name>] = list of tags applied to column
         self._column_tags: Dict[
@@ -194,9 +194,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             config, self.data_dictionary, self.report
         )
         self.profiler: Optional[SnowflakeProfiler] = profiler
-        self.snowsight_url_builder: Optional[
-            SnowsightUrlBuilder
-        ] = snowsight_url_builder
+        self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
+            snowsight_url_builder
+        )

         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []
@@ -267,9 +267,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             )
             return None
         else:
-            ischema_databases: List[
-                SnowflakeDatabase
-            ] = self.get_databases_from_ischema(databases)
+            ischema_databases: List[SnowflakeDatabase] = (
+                self.get_databases_from_ischema(databases)
+            )

             if len(ischema_databases) == 0:
                 self.structured_reporter.failure(
@@ -38,9 +38,9 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         table_name: Optional[str],
     ) -> List[SnowflakeTag]:
         if db_name not in self.tag_cache:
-            self.tag_cache[
-                db_name
-            ] = self.data_dictionary.get_tags_for_database_without_propagation(db_name)
+            self.tag_cache[db_name] = (
+                self.data_dictionary.get_tags_for_database_without_propagation(db_name)
+            )

         if domain == SnowflakeObjectDomain.DATABASE:
             return self.tag_cache[db_name].get_database_tags(db_name)
@@ -130,10 +130,10 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         temp_column_tags: Dict[str, List[SnowflakeTag]] = {}
         if self.config.extract_tags == TagOption.without_lineage:
             if db_name not in self.tag_cache:
-                self.tag_cache[
-                    db_name
-                ] = self.data_dictionary.get_tags_for_database_without_propagation(
-                    db_name
+                self.tag_cache[db_name] = (
+                    self.data_dictionary.get_tags_for_database_without_propagation(
+                        db_name
+                    )
                 )
             temp_column_tags = self.tag_cache[db_name].get_column_tags_for_table(
                 table_name, schema_name, db_name
@@ -549,9 +549,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         ):
             # NOTE: Generated emails may be incorrect, as email may be different than
             # username@email_domain
-            event_dict[
-                "EMAIL"
-            ] = f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
+            event_dict["EMAIL"] = (
+                f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
+            )

         if not event_dict["EMAIL"]:
             self.report.rows_missing_email += 1
@@ -21,8 +21,7 @@ from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Repor
 class SnowflakeStructuredReportMixin(abc.ABC):
     @property
     @abc.abstractmethod
-    def structured_reporter(self) -> SourceReport:
-        ...
+    def structured_reporter(self) -> SourceReport: ...


 class SnowsightUrlBuilder:
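The only change here is collapsing the Ellipsis body of the abstract property onto the signature line. A self-contained sketch of that stub pattern (the Reporter and ConsoleReporter classes are illustrative stand-ins, and str replaces the real SourceReport type):

# Hypothetical sketch of an abstract property stub with an inline "..." body.
import abc


class Reporter(abc.ABC):
    @property
    @abc.abstractmethod
    def structured_reporter(self) -> str: ...  # subclasses must supply a report object


class ConsoleReporter(Reporter):
    @property
    def structured_reporter(self) -> str:
        return "console"


print(ConsoleReporter().structured_reporter)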
@@ -211,9 +211,9 @@ class SnowflakeV2Source(

         self.usage_extractor: Optional[SnowflakeUsageExtractor] = None
         if self.config.include_usage_stats or self.config.include_operational_stats:
-            redundant_usage_run_skip_handler: Optional[
-                RedundantUsageRunSkipHandler
-            ] = None
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
             if self.config.enable_stateful_usage_ingestion:
                 redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                     source=self,
@@ -296,7 +296,16 @@ class SnowflakeV2Source(

         _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict()
         privileges: List[SnowflakePrivilege] = []
-        capabilities: List[SourceCapability] = [
+        capabilities: List[SourceCapability] = [
+            c.capability
+            for c in SnowflakeV2Source.get_capabilities()  # type: ignore
+            if c.capability
+            not in (
+                SourceCapability.PLATFORM_INSTANCE,
+                SourceCapability.DOMAINS,
+                SourceCapability.DELETION_DETECTION,
+            )
+        ]

         cur = conn.query("select current_role()")
         current_role = [row["CURRENT_ROLE()"] for row in cur][0]
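Unlike most hunks, this one reshapes behaviour-carrying code: the connection test builds its capability list from the source's declared capabilities, dropping the ones that cannot be checked against Snowflake privileges. A rough, hypothetical sketch of that filtering (the enum below is an illustrative subset, not the real SourceCapability):

# Hypothetical sketch of filtering declared capabilities before a connection test.
from enum import Enum


class SourceCapability(Enum):  # illustrative subset only
    CONTAINERS = "containers"
    LINEAGE_COARSE = "lineage_coarse"
    PLATFORM_INSTANCE = "platform_instance"
    DOMAINS = "domains"
    DELETION_DETECTION = "deletion_detection"


declared = list(SourceCapability)
capabilities = [
    c
    for c in declared
    if c
    not in (
        SourceCapability.PLATFORM_INSTANCE,
        SourceCapability.DOMAINS,
        SourceCapability.DELETION_DETECTION,
    )
]
print(capabilities)  # only the capabilities the privilege check can exercise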
@@ -104,9 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
         return "\n".join([r for r in res])

     @typing.no_type_check
-    def _get_column_type(
-        self, type_: Union[str, Dict[str, Any]]
-    ) -> TypeEngine:  # noqa: C901
+    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:  # noqa: C901
         """Derives the data type of the Athena column.

         This method is overwritten to extend the behavior of PyAthena.
@@ -218,9 +218,7 @@ def _get_all_table_comments_and_properties(self, connection, **kw):
             , comment
             , {properties_clause} AS properties
         FROM system.tables
-        WHERE name NOT LIKE '.inner%'""".format(
-            properties_clause=properties_clause
-        )
+        WHERE name NOT LIKE '.inner%'""".format(properties_clause=properties_clause)
     )

     all_table_comments: Dict[Tuple[str, str], Dict[str, Any]] = {}
@@ -268,7 +266,7 @@ def _get_table_or_view_names(self, relkind, connection, schema=None, **kw):
     info_cache = kw.get("info_cache")
     all_relations = self._get_all_relation_info(connection, info_cache=info_cache)
     relation_names = []
-    for
+    for _, relation in all_relations.items():
         if relation.database == schema and relation.relkind == relkind:
             relation_names.append(relation.relname)
     return relation_names
@@ -301,9 +299,7 @@ def _get_schema_column_info(self, connection, schema=None, **kw):
             , comment
             FROM system.columns
             WHERE {schema_clause}
-            ORDER BY database, table, position""".format(
-                schema_clause=schema_clause
-            )
+            ORDER BY database, table, position""".format(schema_clause=schema_clause)
         )
     )
     )
@@ -474,7 +470,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
         for db_row in engine.execute(text(all_tables_query)):
-            all_tables_set.add(f
+            all_tables_set.add(f"{db_row['database']}.{db_row['table_name']}")

         return all_tables_set

@@ -503,7 +499,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):

         try:
             for db_row in engine.execute(text(query)):
-                dataset_name = f
+                dataset_name = f"{db_row['target_schema']}.{db_row['target_table']}"
                 if not self.config.database_pattern.allowed(
                     db_row["target_schema"]
                 ) or not self.config.table_pattern.allowed(dataset_name):
@@ -512,7 +508,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):

                 # Target
                 target_path = (
-                    f
+                    f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                     f"{dataset_name}"
                 )
                 target = LineageItem(
@@ -525,7 +521,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):

                 # Source
                 platform = LineageDatasetPlatform.CLICKHOUSE
-                path = f
+                path = f"{db_row['source_schema']}.{db_row['source_table']}"

                 sources = [
                     LineageDataset(
@@ -552,9 +548,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
                         target.dataset.path
                     ].upstreams = self._lineage_map[
                         target.dataset.path
-                    ].upstreams.union(
-                        target.upstreams
-                    )
+                    ].upstreams.union(target.upstreams)

                 else:
                     self._lineage_map[target.dataset.path] = target
@@ -234,9 +234,7 @@ class OracleInspectorObjectWrapper:
                 WHERE col.table_name = id.table_name
                 AND col.column_name = id.column_name
                 AND col.owner = id.owner
-                ) AS identity_options""".format(
-                dblink=dblink
-            )
+                ) AS identity_options""".format(dblink=dblink)
         else:
             identity_cols = "NULL as default_on_null, NULL as identity_options"

@@ -278,8 +278,7 @@ class GenericProfiler:

         if self.config.profiling.profile_table_size_limit is not None and (
             size_in_bytes is not None
-            and size_in_bytes / (2**30)
-            > self.config.profiling.profile_table_size_limit
+            and size_in_bytes / (2**30) > self.config.profiling.profile_table_size_limit
         ):
             self.report.profiling_skipped_size_limit[schema_name] += 1
             logger.debug(
@@ -599,7 +599,12 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_columns",
-            lambda self,
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            use_qvci=self.config.use_qvci,
+            **kw: optimized_get_columns(
                 self,
                 connection,
                 table_name,
@@ -613,7 +618,11 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_pk_constraint",
-            lambda self,
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            **kw: optimized_get_pk_constraint(
                 self, connection, table_name, schema, **kw
             ),
         )
@@ -621,7 +630,11 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_foreign_keys",
-            lambda self,
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            **kw: optimized_get_foreign_keys(
                 self, connection, table_name, schema, **kw
             ),
         )
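These three hunks only re-wrap the lambda parameter lists; the underlying technique is monkey-patching SQLAlchemy dialect methods at runtime with setattr so that optimized implementations take over. A simplified, hypothetical sketch of that technique (FakeDialect and optimized_get_columns are stand-ins, not the real Teradata classes):

# Hypothetical sketch of patching a dialect method via setattr + lambda.
class FakeDialect:  # stand-in for a SQLAlchemy dialect such as TeradataDialect
    def get_columns(self, connection, table_name, schema=None, **kw):
        return []


def optimized_get_columns(dialect, connection, table_name, schema=None, use_qvci=False, **kw):
    # Pretend this version batches metadata queries instead of issuing one per table.
    return [{"name": "id", "qvci": use_qvci}]


setattr(  # noqa: B010 - mirrors the style used in the source above
    FakeDialect,
    "get_columns",
    lambda self,
    connection,
    table_name,
    schema=None,
    use_qvci=True,
    **kw: optimized_get_columns(
        self, connection, table_name, schema, use_qvci=use_qvci, **kw
    ),
)

print(FakeDialect().get_columns(connection=None, table_name="orders"))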
@@ -41,9 +41,9 @@ class ProfilingHandler(StatefulIngestionUsecaseHandlerBase[ProfilingCheckpointSt
         run_id: str,
     ):
         self.state_provider = source.state_provider
-        self.stateful_ingestion_config: Optional[
-            ProfilingStatefulIngestionConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[ProfilingStatefulIngestionConfig] = (
+            config.stateful_ingestion
+        )
         self.pipeline_name = pipeline_name
         self.run_id = run_id
         self.checkpointing_enabled: bool = (
@@ -48,9 +48,9 @@ class RedundantRunSkipHandler(
     ):
         self.source = source
         self.state_provider = source.state_provider
-        self.stateful_ingestion_config: Optional[
-            StatefulIngestionConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[StatefulIngestionConfig] = (
+            config.stateful_ingestion
+        )
         self.pipeline_name = pipeline_name
         self.run_id = run_id
         self._job_id = self._init_job_id()
@@ -145,8 +145,7 @@ class RedundantRunSkipHandler(
         )

         logger.debug(
-            f"{self.job_id} : Last run start, end times:"
-            f"({last_run_time_window})"
+            f"{self.job_id} : Last run start, end times:({last_run_time_window})"
         )

         # If current run's time window is subset of last run's time window, then skip.
@@ -212,8 +211,7 @@ class RedundantRunSkipHandler(
         )

         self.log(
-            "Adjusted start, end times: "
-            f"({suggested_start_time}, {suggested_end_time})"
+            f"Adjusted start, end times: ({suggested_start_time}, {suggested_end_time})"
        )
         return (suggested_start_time, suggested_end_time)

@@ -111,9 +111,9 @@ class StaleEntityRemovalHandler(
         self.state_type_class = state_type_class
         self.pipeline_name = pipeline_name
         self.run_id = run_id
-        self.stateful_ingestion_config: Optional[
-            StatefulStaleMetadataRemovalConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = (
+            config.stateful_ingestion
+        )
         self.checkpointing_enabled: bool = (
             True
             if (
@@ -70,20 +70,20 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
             self.orchestrator_name, pipeline_name, job_name
         )

-        latest_checkpoint: Optional[
-            DatahubIngestionCheckpointClass
-        ] = self.graph.get_latest_timeseries_value(
-            entity_urn=data_job_urn,
-            aspect_type=DatahubIngestionCheckpointClass,
-            filter_criteria_map={
-                "pipelineName": pipeline_name,
-            },
+        latest_checkpoint: Optional[DatahubIngestionCheckpointClass] = (
+            self.graph.get_latest_timeseries_value(
+                entity_urn=data_job_urn,
+                aspect_type=DatahubIngestionCheckpointClass,
+                filter_criteria_map={
+                    "pipelineName": pipeline_name,
+                },
+            )
         )
         if latest_checkpoint:
             logger.debug(
                 f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
                 f" job_name:'{job_name}' found with start_time:"
-                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
+                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
             )
             return latest_checkpoint
         else:
@@ -67,7 +67,7 @@ class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
             logger.debug(
                 f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
                 f" job_name:'{job_name}' found with start_time:"
-                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
+                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
             )
             return latest_checkpoint
         else: