acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/datajob/datajob.py +7 -4
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/errors.py +4 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +33 -8
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/run/pipeline.py +9 -6
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/ge_data_profiler.py +27 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +13 -1
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sigma/config.py +74 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/ingestion/source/sql/hive_metastore.py +5 -5
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +31 -6
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/ingestion/source/vertexai/vertexai.py +316 -4
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
- datahub/metadata/_urns/urn_defs.py +1819 -1763
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +17296 -16883
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +142 -4
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/entity_client.py +8 -0
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +6 -3
- datahub/sdk/mlmodel.py +301 -0
- datahub/sdk/mlmodelgroup.py +233 -0
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/specific/dataset.py +12 -0
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +18 -14
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/testing/mcp_diff.py +15 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +350 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/sigma/sigma_api.py
+++ b/datahub/ingestion/source/sigma/sigma_api.py
@@ -95,22 +95,22 @@ class SigmaAPI:
         return get_response

     def get_workspace(self, workspace_id: str) -> Optional[Workspace]:
+        if workspace_id in self.workspaces:
+            return self.workspaces[workspace_id]
+
         logger.debug(f"Fetching workspace metadata with id '{workspace_id}'")
         try:
-            … (11 removed lines not rendered in the source view)
-            workspace = Workspace.parse_obj(response.json())
-            self.workspaces[workspace.workspaceId] = workspace
-            return workspace
+            response = self._get_api_call(
+                f"{self.config.api_url}/workspaces/{workspace_id}"
+            )
+            if response.status_code == 403:
+                logger.debug(f"Workspace {workspace_id} not accessible.")
+                self.report.non_accessible_workspaces_count += 1
+                return None
+            response.raise_for_status()
+            workspace = Workspace.parse_obj(response.json())
+            self.workspaces[workspace.workspaceId] = workspace
+            return workspace
         except Exception as e:
             self._log_http_error(
                 message=f"Unable to fetch workspace '{workspace_id}'. Exception: {e}"
@@ -187,7 +187,9 @@ class SigmaAPI:
     @functools.lru_cache
     def _get_files_metadata(self, file_type: str) -> Dict[str, File]:
         logger.debug(f"Fetching file metadata with type {file_type}.")
-        file_url = url = f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+        file_url = url = (
+            f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+        )
         try:
             files_metadata: Dict[str, File] = {}
             while True:
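A note on the `@functools.lru_cache` visible in this hunk: on an instance method the cache key includes `self`, so each `SigmaAPI` instance fetches a given file type at most once per run, at the cost of the cache holding a strong reference to the instance. A minimal standalone sketch of the pattern (illustrative, not code from the package):

import functools


class Client:
    @functools.lru_cache  # cache key is (self, file_type)
    def get_files(self, file_type: str) -> dict:
        print(f"expensive API call for {file_type}")
        return {}


c = Client()
c.get_files("dataset")  # performs the call
c.get_files("dataset")  # served from the cache; nothing printed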
@@ -225,31 +227,50 @@ class SigmaAPI:
                 for dataset_dict in response_dict[Constant.ENTRIES]:
                     dataset = SigmaDataset.parse_obj(dataset_dict)

-                    if dataset.datasetId in dataset_files_metadata:
-                        … (18 removed lines not rendered in the source view)
+                    if dataset.datasetId not in dataset_files_metadata:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) (missing file metadata)"
+                        )
+                        continue
+
+                    dataset.workspaceId = dataset_files_metadata[
+                        dataset.datasetId
+                    ].workspaceId
+
+                    dataset.path = dataset_files_metadata[dataset.datasetId].path
+                    dataset.badge = dataset_files_metadata[dataset.datasetId].badge
+
+                    workspace = None
+                    if dataset.workspaceId:
+                        workspace = self.get_workspace(dataset.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.datasets.processed(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                            datasets.append(dataset)
+                        else:
+                            self.report.datasets.dropped(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for dataset we can consider it as shared entity
+                        self.report.datasets_without_workspace += 1
+                        self.report.datasets.processed(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
+                        datasets.append(dataset)
+                    else:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )

                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{dataset_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            … (1 removed line not rendered in the source view)
+
             return datasets
         except Exception as e:
             self._log_http_error(
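The gating above turns on `workspace_pattern.allowed(...)`. `AllowDenyPattern` is DataHub's standard regex include/exclude config (its import from `datahub.configuration.common` appears later in this diff); a small sketch with made-up workspace names, assuming its usual `allow`/`deny` fields:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical pattern: keep Finance workspaces, drop anything ending in "Sandbox".
pattern = AllowDenyPattern(allow=["Finance.*"], deny=[".*Sandbox"])

for name in ["Finance", "Finance Sandbox", "Marketing"]:
    print(name, "->", pattern.allowed(name))
# Finance -> True, Finance Sandbox -> False, Marketing -> False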
@@ -381,34 +402,54 @@ class SigmaAPI:
                 for workbook_dict in response_dict[Constant.ENTRIES]:
                     workbook = Workbook.parse_obj(workbook_dict)

-                    if workbook.workbookId in workbook_files_metadata:
-                        … (removed lines not rendered in the source view)
-                            workbook.workbookId
-                        … (removed lines not rendered in the source view)
+                    if workbook.workbookId not in workbook_files_metadata:
+                        # Due to a bug in the Sigma API, it seems like the /files endpoint does not
+                        # return file metadata when the user has access via admin permissions. In
+                        # those cases, the user associated with the token needs to be manually added
+                        # to the workspace.
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) (missing file metadata; path: {workbook.path}; likely need to manually add user to workspace)"
+                        )
+                        continue
+
+                    workbook.workspaceId = workbook_files_metadata[
+                        workbook.workbookId
+                    ].workspaceId
+
+                    workbook.badge = workbook_files_metadata[workbook.workbookId].badge
+
+                    workspace = None
+                    if workbook.workspaceId:
+                        workspace = self.get_workspace(workbook.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.workbooks.processed(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                            workbook.pages = self.get_workbook_pages(workbook)
+                            workbooks.append(workbook)
+                        else:
+                            self.report.workbooks.dropped(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for workbook we can consider it as shared entity
+                        self.report.workbooks_without_workspace += 1
+                        self.report.workbooks.processed(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
+                        workbook.pages = self.get_workbook_pages(workbook)
+                        workbooks.append(workbook)
+                    else:
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )

                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{workbook_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            self.report.number_of_workbooks = len(workbooks)
             return workbooks
         except Exception as e:
             self._log_http_error(
--- a/datahub/ingestion/source/slack/slack.py
+++ b/datahub/ingestion/source/slack/slack.py
@@ -1,6 +1,5 @@
 import json
 import logging
-import textwrap
 from dataclasses import dataclass
 from typing import Any, Dict, Iterable, List, Optional, Tuple

@@ -613,6 +612,10 @@ class SlackSource(StatefulIngestionSourceBase):
             ),
         )

+    @retry(
+        wait=wait_exponential(multiplier=2, min=4, max=60),
+        before_sleep=before_sleep_log(logger, logging.ERROR, True),
+    )
     def get_user_to_be_updated(
         self,
     ) -> Iterable[Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
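The `retry`, `wait_exponential`, and `before_sleep_log` names match the tenacity library's API; the import is outside this hunk, so the following is a sketch of how such a policy behaves rather than a claim about this module:

import logging

from tenacity import before_sleep_log, retry, wait_exponential

logger = logging.getLogger(__name__)


@retry(
    wait=wait_exponential(multiplier=2, min=4, max=60),
    before_sleep=before_sleep_log(logger, logging.ERROR, exc_info=True),
)
def list_users_page() -> None:
    # With no stop condition, any exception triggers another attempt after an
    # exponential backoff clamped to the 4s..60s range, and the failure is
    # logged (with traceback) before each sleep.
    ...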
@@ -634,56 +637,5 @@ class SlackSource(StatefulIngestionSourceBase):
             if user_obj.email is not None:
                 yield (user_obj, editable_properties)

-    @retry(
-        wait=wait_exponential(multiplier=2, min=4, max=60),
-        before_sleep=before_sleep_log(logger, logging.ERROR, True),
-    )
-    def get_user_to_be_updated_oss(self) -> Iterable[CorpUser]:
-        graphql_query = textwrap.dedent(
-            """
-            query listUsers($input: ListUsersInput!) {
-                listUsers(input: $input) {
-                    total
-                    users {
-                        urn
-                        editableProperties {
-                            email
-                            slack
-                        }
-                    }
-                }
-            }
-            """
-        )
-        start = 0
-        count = 10
-        total = count
-
-        assert self.ctx.graph is not None
-
-        while start < total:
-            variables = {"input": {"start": start, "count": count}}
-            response = self.ctx.graph.execute_graphql(
-                query=graphql_query, variables=variables
-            )
-            list_users = response.get("listUsers", {})
-            total = list_users.get("total", 0)
-            users = list_users.get("users", [])
-            for user in users:
-                user_obj = CorpUser()
-                editable_properties = user.get("editableProperties", {})
-                user_obj.urn = user.get("urn")
-                if user_obj.urn is None:
-                    continue
-                if editable_properties is not None:
-                    user_obj.email = editable_properties.get("email")
-                if user_obj.email is None:
-                    urn_id = Urn.from_string(user_obj.urn).get_entity_id_as_string()
-                    if "@" in urn_id:
-                        user_obj.email = urn_id
-                if user_obj.email is not None:
-                    yield user_obj
-            start += count
-
     def get_report(self) -> SourceReport:
         return self.report
--- a/datahub/ingestion/source/snowflake/snowflake_config.py
+++ b/datahub/ingestion/source/snowflake/snowflake_config.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Set

 import pydantic
-from pydantic import Field, SecretStr, root_validator, validator
+from pydantic import Field, root_validator, validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -301,6 +301,7 @@ class SnowflakeV2Config(
         default=AllowDenyPattern.allow_all(),
         description=(
             "List of regex patterns for structured properties to include in ingestion."
+            " Applied to tags with form `<database>.<schema>.<tag_name>`."
             " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
         ),
     )
@@ -384,17 +385,6 @@ class SnowflakeV2Config(

         return values

-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        return SnowflakeConnectionConfig.get_sql_alchemy_url(
-            self, database=database, username=username, password=password, role=role
-        )
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
--- a/datahub/ingestion/source/snowflake/snowflake_connection.py
+++ b/datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -1,4 +1,5 @@
 import logging
+import threading
 from typing import Any, Dict, Optional

 import pydantic
@@ -27,7 +28,7 @@ from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthIdentityProvider,
 )
 from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator
-from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.utilities.config_clean import (
     remove_protocol,
     remove_suffix,
@@ -192,23 +193,11 @@ class SnowflakeConnectionConfig(ConfigModel):
                 "but should be set when using use_certificate false for oauth_config"
             )

-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[pydantic.SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        if username is None:
-            username = self.username
-        if password is None:
-            password = self.password
-        if role is None:
-            role = self.role
+    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         return make_sqlalchemy_uri(
             self.scheme,
-            username,
-            password.get_secret_value() if password else None,
+            self.username,
+            self.password.get_secret_value() if self.password else None,
             self.account_id,
             f'"{database}"' if database is not None else database,
             uri_opts={
@@ -217,7 +206,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 for (key, value) in {
                     "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type),
                     "warehouse": self.warehouse,
-                    "role": role,
+                    "role": self.role,
                     "application": _APPLICATION_NAME,
                 }.items()
                 if value
@@ -402,13 +391,30 @@ class SnowflakeConnection(Closeable):
     def __init__(self, connection: NativeSnowflakeConnection):
         self._connection = connection

+        self._query_num_lock = threading.Lock()
+        self._query_num = 1
+
     def native_connection(self) -> NativeSnowflakeConnection:
         return self._connection

+    def get_query_no(self) -> int:
+        with self._query_num_lock:
+            no = self._query_num
+            self._query_num += 1
+            return no
+
     def query(self, query: str) -> Any:
         try:
-            … (1 removed line not rendered in the source view)
+            # We often run multiple queries in parallel across multiple threads,
+            # so we need to number them to help with log readability.
+            query_num = self.get_query_no()
+            logger.info(f"Query #{query_num}: {query}", stacklevel=2)
             resp = self._connection.cursor(DictCursor).execute(query)
+            if resp is not None and resp.rowcount is not None:
+                logger.info(
+                    f"Query #{query_num} got {resp.rowcount} row(s) back from Snowflake",
+                    stacklevel=2,
+                )
             return resp

         except Exception as e:
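The lock-guarded counter added here exists because `self._query_num += 1` is a read-modify-write sequence; two threads calling `query()` concurrently could otherwise read the same value and log duplicate query numbers. The pattern in isolation (a sketch, not the package's code):

import threading


class QueryNumberer:
    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._next = 1

    def get_query_no(self) -> int:
        # Holding the lock makes read-increment-return atomic, so concurrent
        # callers always receive distinct, monotonically increasing numbers.
        with self._lock:
            no = self._next
            self._next += 1
            return no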
--- a/datahub/ingestion/source/snowflake/snowflake_profiler.py
+++ b/datahub/ingestion/source/snowflake/snowflake_profiler.py
@@ -135,12 +135,7 @@ class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
     ) -> "DatahubGEProfiler":
         assert db_name

-        url = self.config.get_sql_alchemy_url(
-            database=db_name,
-            username=self.config.username,
-            password=self.config.password,
-            role=self.config.role,
-        )
+        url = self.config.get_sql_alchemy_url(database=db_name)

         logger.debug(f"sql_alchemy_url={url}")

--- a/datahub/ingestion/source/snowflake/snowflake_queries.py
+++ b/datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -515,7 +515,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
             # here
             query_id=get_query_fingerprint(
-                res["query_text"], self.identifiers.platform, fast=True
+                res["query_text"],
+                self.identifiers.platform,
+                fast=True,
+                secondary_id=res["query_secondary_fingerprint"],
             ),
             query_text=res["query_text"],
             upstreams=upstreams,
@@ -654,7 +657,17 @@ WITH
 fingerprinted_queries as (
     SELECT *,
         -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END as query_secondary_fingerprint
     FROM
         snowflake.account_usage.query_history
     WHERE
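In plain terms: when a query's text carries a `-- Hex query metadata:` comment, the new CASE arm extracts the `project_id` and `context` fields from that JSON blob and hashes them into `query_secondary_fingerprint`, so repeated runs of the same Hex cell deduplicate together. A rough Python equivalent of the two REGEXP_SUBSTR extractions (illustrative only; the actual hashing happens in Snowflake's HASH):

import re
from typing import Optional, Tuple


def extract_hex_metadata(query_text: str) -> Optional[Tuple[str, str]]:
    # Mirrors the SQL above: only Hex-tagged queries get a secondary id.
    if "-- Hex query metadata:" not in query_text:
        return None
    project = re.search(r'"project_id"\s*:\s*"([^"]+)"', query_text)
    context = re.search(r'"context"\s*:\s*"([^"]+)"', query_text)
    # The SQL feeds these two values to HASH(...) and casts the result to VARCHAR.
    return (
        project.group(1) if project else "",
        context.group(1) if context else "",
    )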
@@ -670,11 +683,11 @@ fingerprinted_queries as (
         {time_bucket_size},
         CONVERT_TIMEZONE('UTC', start_time)
     ) AS bucket_start_time,
-    COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
+    COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
 FROM
     fingerprinted_queries
 QUALIFY
-    ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
+    ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
 )
 , raw_access_history AS (
     SELECT
@@ -714,6 +727,7 @@ fingerprinted_queries as (
     q.bucket_start_time,
     q.query_id,
     q.query_fingerprint,
+    q.query_secondary_fingerprint,
     q.query_count,
     q.session_id AS "SESSION_ID",
     q.start_time AS "QUERY_START_TIME",
--- a/datahub/ingestion/source/snowflake/snowflake_query.py
+++ b/datahub/ingestion/source/snowflake/snowflake_query.py
@@ -71,14 +71,6 @@ class SnowflakeQuery:
     def current_warehouse() -> str:
         return "select CURRENT_WAREHOUSE()"

-    @staticmethod
-    def current_database() -> str:
-        return "select CURRENT_DATABASE()"
-
-    @staticmethod
-    def current_schema() -> str:
-        return "select CURRENT_SCHEMA()"
-
     @staticmethod
     def show_databases() -> str:
         return "show databases"
@@ -107,8 +99,8 @@ class SnowflakeQuery:
         order by database_name"""

     @staticmethod
-    def schemas_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def schemas_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
         SELECT schema_name AS "SCHEMA_NAME",
                created AS "CREATED",
@@ -119,8 +111,8 @@ class SnowflakeQuery:
         order by schema_name"""

     @staticmethod
-    def tables_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def tables_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
         SELECT table_catalog AS "TABLE_CATALOG",
                table_schema AS "TABLE_SCHEMA",
@@ -142,8 +134,8 @@ class SnowflakeQuery:
         order by table_schema, table_name"""

     @staticmethod
-    def tables_for_schema(schema_name: str, db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def tables_for_schema(schema_name: str, db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
         SELECT table_catalog AS "TABLE_CATALOG",
                table_schema AS "TABLE_SCHEMA",
@@ -165,8 +157,8 @@ class SnowflakeQuery:
         order by table_schema, table_name"""

     @staticmethod
-    def procedures_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def procedures_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
         SELECT procedure_catalog AS "PROCEDURE_CATALOG",
                procedure_schema AS "PROCEDURE_SCHEMA",
@@ -382,26 +374,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         ORDER BY query_start_time DESC
         ;"""

-    @staticmethod
-    def view_dependencies() -> str:
-        return """
-        SELECT
-          concat(
-            referenced_database, '.', referenced_schema,
-            '.', referenced_object_name
-          ) AS "VIEW_UPSTREAM",
-          referenced_object_domain as "REFERENCED_OBJECT_DOMAIN",
-          concat(
-            referencing_database, '.', referencing_schema,
-            '.', referencing_object_name
-          ) AS "DOWNSTREAM_VIEW",
-          referencing_object_domain AS "REFERENCING_OBJECT_DOMAIN"
-        FROM
-          snowflake.account_usage.object_dependencies
-        WHERE
-          referencing_object_domain in ('VIEW', 'MATERIALIZED VIEW')
-        """
-
     # Note on use of `upstreams_deny_pattern` to ignore temporary tables:
     # Snowflake access history may include temporary tables in DIRECT_OBJECTS_ACCESSED and
     # OBJECTS_MODIFIED->columns->directSources. We do not need these temporary tables and filter these in the query.
@@ -425,32 +397,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
             upstreams_deny_pattern,
         )

-    @staticmethod
-    def view_dependencies_v2() -> str:
-        return """
-        SELECT
-          ARRAY_UNIQUE_AGG(
-            OBJECT_CONSTRUCT(
-              'upstream_object_name', concat(
-                referenced_database, '.', referenced_schema,
-                '.', referenced_object_name
-              ),
-              'upstream_object_domain', referenced_object_domain
-            )
-          ) as "UPSTREAM_TABLES",
-          concat(
-            referencing_database, '.', referencing_schema,
-            '.', referencing_object_name
-          ) AS "DOWNSTREAM_TABLE_NAME",
-          ANY_VALUE(referencing_object_domain) AS "DOWNSTREAM_TABLE_DOMAIN"
-        FROM
-          snowflake.account_usage.object_dependencies
-        WHERE
-          referencing_object_domain in ('VIEW', 'MATERIALIZED VIEW')
-        GROUP BY
-          DOWNSTREAM_TABLE_NAME
-        """
-
     @staticmethod
     def show_external_tables() -> str:
         return "show external tables in account"
@@ -1000,4 +946,4 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         from_clause = (
             f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
         )
-        return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
+        return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
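The one-character fix above quotes the database identifier. Snowflake case-folds unquoted identifiers to uppercase and rejects most special characters in them, so a database with a mixed-case or hyphenated name would previously mis-resolve or fail; for illustration, with a hypothetical name:

db_name = "My-Analytics"

# Before: the hyphen makes this statement invalid SQL in Snowflake.
f"""SHOW STREAMS IN DATABASE {db_name} LIMIT 10;"""

# After: double quotes make it a delimited identifier, preserving case
# and allowing special characters.
f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT 10;"""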
--- a/datahub/ingestion/source/snowflake/snowflake_tag.py
+++ b/datahub/ingestion/source/snowflake/snowflake_tag.py
@@ -23,6 +23,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.structured import (
     StructuredPropertyDefinition,
 )
+from datahub.metadata.schema_classes import ChangeTypeClass
 from datahub.metadata.urns import (
     ContainerUrn,
     DatasetUrn,
@@ -81,7 +82,7 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
     def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
         for tag in self.data_dictionary.get_all_tags():
             if not self.config.structured_property_pattern.allowed(
-                tag.…
+                tag._id_prefix_as_str()
             ):
                 continue
             if self.config.extract_tags_as_structured_properties:
@@ -111,6 +112,8 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
             yield MetadataChangeProposalWrapper(
                 entityUrn=urn,
                 aspect=aspect,
+                changeType=ChangeTypeClass.CREATE,
+                headers={"If-None-Match": "*"},
             ).as_workunit()

     def _get_tags_on_object_with_propagation(
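The CREATE change type combined with the `If-None-Match: *` header makes this a conditional create: under standard HTTP precondition semantics (RFC 7232), `If-None-Match: *` succeeds only when no representation exists yet, so an already-present aspect is left untouched rather than overwritten. A generic illustration of the header, not a DataHub endpoint:

import requests

# Placeholder URL; any server honoring RFC 7232 preconditions behaves the same.
resp = requests.put(
    "https://example.com/api/resource",
    json={"value": 1},
    headers={"If-None-Match": "*"},  # create only if the resource does not exist
)
# 2xx on first creation; 412 Precondition Failed if it already exists,
# leaving the stored resource unchanged.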
--- a/datahub/ingestion/source/sql/clickhouse.py
+++ b/datahub/ingestion/source/sql/clickhouse.py
@@ -35,13 +35,14 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
 )
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
@@ -145,7 +145,11 @@ class ClickHouseConfig(
     )
     include_materialized_views: Optional[bool] = Field(default=True, description="")

-    def get_sql_alchemy_url(…
+    def get_sql_alchemy_url(
+        self,
+        uri_opts: Optional[Dict[str, Any]] = None,
+        current_db: Optional[str] = None,
+    ) -> str:
         url = make_url(
             super().get_sql_alchemy_url(uri_opts=self.uri_opts, current_db=current_db)
         )
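Several sources in this release import `make_sqlalchemy_uri` from the new `datahub/ingestion/source/sql/sqlalchemy_uri.py` module instead of `sql_config` (the paired +36/-34 counts in the file list above). Judging from the Snowflake call site earlier in this diff, it assembles a `scheme://user:password@host/database?opts` URL; a hedged sketch with made-up values, argument order inferred from that call site:

from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri

uri = make_sqlalchemy_uri(
    "clickhouse",        # scheme
    "ingest_user",       # username (hypothetical)
    "s3cret",            # plain-text password (hypothetical)
    "localhost:8123",    # host/port
    "analytics",         # database
    uri_opts={"protocol": "http"},  # extra query params (hypothetical)
)
# Expected to yield something like:
# clickhouse://ingest_user:s3cret@localhost:8123/analytics?protocol=http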
--- a/datahub/ingestion/source/sql/druid.py
+++ b/datahub/ingestion/source/sql/druid.py
@@ -1,4 +1,6 @@
 # This import verifies that the dependencies are available.
+from typing import Any, Dict, Optional
+
 import pydruid  # noqa: F401
 from pydantic.fields import Field
 from pydruid.db.sqlalchemy import DruidDialect
@@ -38,8 +40,11 @@ class DruidConfig(BasicSQLAlchemyConfig):
         description="regex patterns for schemas to filter in ingestion.",
     )

-    def get_sql_alchemy_url(…
-        … (1 removed line not rendered in the source view)
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        base_url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
+        return f"{base_url}/druid/v2/sql/"

     """
     The pydruid library already formats the table name correctly, so we do not
|