acryl-datahub 1.1.0rc4__py3-none-any.whl → 1.1.0.1rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/METADATA +2609 -2607
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/RECORD +87 -70
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +9 -8
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/delete_cli.py +4 -4
- datahub/cli/ingest_cli.py +9 -1
- datahub/emitter/mce_builder.py +3 -1
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/graph/client.py +3 -3
- datahub/ingestion/source/apply/datahub_apply.py +4 -4
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
- datahub/ingestion/source/data_lake_common/object_store.py +644 -0
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +30 -11
- datahub/ingestion/source/gcs/gcs_source.py +22 -7
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/hex/query_fetcher.py +9 -3
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/s3/source.py +65 -6
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
- datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
- datahub/ingestion/source/sql/athena.py +1 -0
- datahub/ingestion/source/sql/hive.py +2 -3
- datahub/ingestion/source/sql/sql_common.py +98 -34
- datahub/ingestion/source/sql/sql_types.py +5 -2
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +117 -0
- datahub/ingestion/source/unity/source.py +167 -15
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/metadata/_internal_schema_classes.py +667 -522
- datahub/metadata/_urns/urn_defs.py +1804 -1748
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/schema.avsc +17358 -17584
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +1 -0
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
- datahub/metadata/schemas/MLModelKey.avsc +1 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +342 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +681 -82
- datahub/sdk/main_client.py +27 -8
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/sdk_v2_helpers.py +18 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1rc6.dist-info}/top_level.txt +0 -0
@@ -59,17 +59,21 @@ def request_call(
     username: Optional[str] = None,
     password: Optional[str] = None,
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> requests.Response:
     headers = {"accept": "application/json"}
     if username is not None and password is not None:
         return requests.get(
-            url, headers=headers, auth=HTTPBasicAuth(username, password)
+            url,
+            headers=headers,
+            auth=HTTPBasicAuth(username, password),
+            verify=verify_ssl,
         )
     elif token is not None:
         headers["Authorization"] = f"{token}"
-        return requests.get(url, proxies=proxies, headers=headers)
+        return requests.get(url, proxies=proxies, headers=headers, verify=verify_ssl)
     else:
-        return requests.get(url, headers=headers)
+        return requests.get(url, headers=headers, verify=verify_ssl)


 def get_swag_json(
@@ -79,10 +83,16 @@ def get_swag_json(
     password: Optional[str] = None,
     swagger_file: str = "",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> Dict:
     tot_url = url + swagger_file
     response = request_call(
-        url=tot_url, token=token, username=username, password=password, proxies=proxies
+        url=tot_url,
+        token=token,
+        username=username,
+        password=password,
+        proxies=proxies,
+        verify_ssl=verify_ssl,
     )

     if response.status_code != 200:
@@ -127,37 +137,45 @@ def get_endpoints(sw_dict: dict) -> dict:
     check_sw_version(sw_dict)

     for p_k, p_o in sw_dict["paths"].items():
-        [31 removed lines: the previous "get"-only endpoint handling; full text not captured in this rendering]
+        for method, method_spec in p_o.items():
+            # skip non-method keys like "parameters"
+            if method.lower() not in [
+                "get",
+                "post",
+                "put",
+                "delete",
+                "patch",
+                "options",
+                "head",
+            ]:
+                continue
+
+            responses = method_spec.get("responses", {})
+            base_res = responses.get("200") or responses.get(200)
+            if not base_res:
+                # if there is no 200 response, we skip this method
+                continue
+
+            # if the description is not present, we will use the summary
+            # if both are not present, we will use an empty string
+            desc = method_spec.get("description") or method_spec.get("summary", "")
+
+            # if the tags are not present, we will use an empty list
+            tags = method_spec.get("tags", [])
+
+            url_details[p_k] = {
+                "description": desc,
+                "tags": tags,
+                "method": method.upper(),
+            }
+
+            example_data = check_for_api_example_data(base_res, p_k)
+            if example_data:
+                url_details[p_k]["data"] = example_data
+
+            # checking whether there are defined parameters to execute the call...
+            if "parameters" in p_o[method]:
+                url_details[p_k]["parameters"] = p_o[method]["parameters"]

     return dict(sorted(url_details.items()))

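The reworked loop walks every HTTP method under each path instead of only `get`. A minimal sketch of what this produces, assuming the `get_endpoints` shown above (datahub/ingestion/source/openapi_parser.py per the file list); the pet-store style spec is hypothetical:

```python
# Minimal sketch, assuming the reworked get_endpoints above; the spec is hypothetical.
from datahub.ingestion.source.openapi_parser import get_endpoints

sw_dict = {
    "openapi": "3.0.0",
    "paths": {
        "/pets": {
            "get": {
                "summary": "List pets",
                "tags": ["pets"],
                "responses": {"200": {"description": "ok"}},
            },
            "post": {
                # no 200 response -> this method is skipped
                "responses": {"201": {"description": "created"}},
            },
            # path-level "parameters" is not an HTTP method -> skipped
            "parameters": [],
        }
    },
}

endpoints = get_endpoints(sw_dict)
# Expected shape for "/pets", taken from the GET method:
# {"description": "List pets", "tags": ["pets"], "method": "GET"}
print(endpoints["/pets"])
```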
@@ -358,6 +376,7 @@ def get_tok(
     tok_url: str = "",
     method: str = "post",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> str:
     """
     Trying to post username/password to get auth.

@@ -368,7 +387,7 @@ def get_tok(
         # this will make a POST call with username and password
         data = {"username": username, "password": password, "maxDuration": True}
         # url2post = url + "api/authenticate/"
-        response = requests.post(url4req, proxies=proxies, json=data)
+        response = requests.post(url4req, proxies=proxies, json=data, verify=verify_ssl)
         if response.status_code == 200:
             cont = json.loads(response.content)
             if "token" in cont:  # other authentication scheme

@@ -377,7 +396,7 @@ def get_tok(
                 token = f"Bearer {cont['tokens']['access']}"
     elif method == "get":
         # this will make a GET call with username and password
-        response = requests.get(url4req)
+        response = requests.get(url4req, verify=verify_ssl)
        if response.status_code == 200:
            cont = json.loads(response.content)
            token = cont["token"]
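All three helpers (`request_call`, `get_swag_json`, `get_tok`) now accept a `verify_ssl` flag that is forwarded to the underlying `requests` calls. A hedged usage sketch; the host and credentials below are placeholders, not taken from the package:

```python
# Hedged sketch: fetch an OpenAPI spec from a self-signed endpoint by disabling
# TLS verification via the new verify_ssl flag. Host and credentials are hypothetical.
from datahub.ingestion.source.openapi_parser import get_swag_json

spec = get_swag_json(
    url="https://openapi.internal.example.com/",  # hypothetical endpoint
    username="svc_user",
    password="not-a-real-password",
    swagger_file="openapi.json",
    verify_ssl=False,  # forwarded to requests.get(..., verify=False)
)
print(len(spec.get("paths", {})), "paths discovered")
```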
@@ -7,7 +7,7 @@ import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse

 import smart_open.compression as so_compression

@@ -43,6 +43,9 @@ from datahub.ingestion.source.aws.s3_util import (
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -197,12 +200,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any

     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
             config_option: config.dict().get(config_option)
             for config_option in config_options_to_report
@@ -605,6 +655,19 @@ class S3Source(StatefulIngestionSourceBase):
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )

+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
@@ -674,6 +737,7 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -1082,11 +1146,6 @@ class S3Source(StatefulIngestionSourceBase):
         )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
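The `object_store_adapter` set up in `__init__` is what the new `get_external_url` delegates to when filling `externalUrl` on `DatasetProperties`. A hedged sketch of the factory wiring, limited to the calls actually visible in the hunks above (`create_object_store_adapter`, `register_customization`, `get_external_url`); anything beyond that is an assumption:

```python
# Hedged sketch of the object store adapter wiring used by S3Source above.
# Only calls visible in the diff are used; behaviour beyond them is an assumption.
from datahub.ingestion.source.data_lake_common.object_store import (
    create_object_store_adapter,
)

# Plain S3: the region feeds external (console) URL generation.
s3_adapter = create_object_store_adapter("s3", aws_region="eu-west-1")

# GCS reached through the S3-compatible endpoint (storage.googleapis.com):
# keep emitting s3:// style paths while generating GCS external URLs.
gcs_adapter = create_object_store_adapter("gcs")
gcs_adapter.register_customization(
    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
)

# During ingestion, S3Source.get_external_url simply does:
#     return self.object_store_adapter.get_external_url(table_data)
# where table_data is the TableData built for each discovered table.
```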
@@ -22,6 +22,7 @@ from datahub.ingestion.api.incremental_properties_helper import (
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
+from datahub.ingestion.source.snowflake.constants import SnowflakeEdition
 from datahub.ingestion.source.snowflake.snowflake_connection import (
     SnowflakeConnectionConfig,
 )

@@ -326,6 +327,18 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )

+    known_snowflake_edition: Optional[SnowflakeEdition] = Field(
+        default=None,
+        description="Explicitly specify the Snowflake edition (STANDARD or ENTERPRISE). If unset, the edition will be inferred automatically using 'SHOW TAGS'.",
+    )
+
+    # Allows empty containers to be ingested before datasets are added, avoiding permission errors
+    warn_no_datasets: bool = Field(
+        hidden_from_docs=True,
+        default=False,
+        description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
+    )
+
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"
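Two new `SnowflakeV2Config` knobs: `known_snowflake_edition` skips the `SHOW TAGS` edition probe (see `is_standard_edition` further down), and the hidden `warn_no_datasets` downgrades the empty-source failure to a warning. A hedged sketch of how these might be set; connection fields are omitted, so this is an illustration rather than a full recipe:

```python
# Hedged sketch of the two new SnowflakeV2Config fields shown above.
# Only these overrides are shown; account/credential settings are omitted.
from datahub.ingestion.source.snowflake.constants import SnowflakeEdition

snowflake_config_overrides = {
    # Skip the SHOW TAGS probe and treat the account as Standard edition.
    "known_snowflake_edition": SnowflakeEdition.STANDARD,
    # Emit a warning instead of failing when no tables/views/streams are found.
    "warn_no_datasets": True,
}
```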
@@ -127,6 +127,8 @@ class SnowflakeQueriesExtractorReport(Report):
     sql_aggregator: Optional[SqlAggregatorReport] = None

     num_ddl_queries_dropped: int = 0
+    num_stream_queries_observed: int = 0
+    num_create_temp_view_queries_observed: int = 0
     num_users: int = 0


@@ -373,6 +375,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             if entry:
                 yield entry

+    @classmethod
+    def _has_temp_keyword(cls, query_text: str) -> bool:
+        return (
+            re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
+            or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
+        )
+
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
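`_has_temp_keyword` is used further down to decide when a `CREATE VIEW` is actually a temporary view worth re-parsing. A standalone illustration of the same word-boundary check (the function name here is local to the sketch):

```python
# Standalone illustration of the TEMP/TEMPORARY word-boundary check used by
# _has_temp_keyword above; has_temp_keyword is a local name for this sketch.
import re


def has_temp_keyword(query_text: str) -> bool:
    return (
        re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
        or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
    )


assert has_temp_keyword("CREATE TEMPORARY VIEW v1 AS SELECT 1")
assert has_temp_keyword("create temp view v2 as select 1")
# \b keeps "temperature_readings" from matching TEMP
assert not has_temp_keyword("CREATE VIEW temperature_readings AS SELECT 1")
```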
@@ -389,6 +398,15 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             key = key.lower()
             res[key] = value

+        timestamp: datetime = res["query_start_time"]
+        timestamp = timestamp.astimezone(timezone.utc)
+
+        # TODO need to map snowflake query types to ours
+        query_text: str = res["query_text"]
+        query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
+            res["query_type"], QueryType.UNKNOWN
+        )
+
         direct_objects_accessed = res["direct_objects_accessed"]
         objects_modified = res["objects_modified"]
         object_modified_by_ddl = res["object_modified_by_ddl"]
@@ -399,9 +417,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             "Error fetching ddl lineage from Snowflake"
         ):
             known_ddl_entry = self.parse_ddl_query(
-                res["query_text"],
+                query_text,
                 res["session_id"],
-                res["query_start_time"],
+                timestamp,
                 object_modified_by_ddl,
                 res["query_type"],
             )
@@ -419,24 +437,38 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )

-        #
-        #
+        # There are a couple cases when we'd want to prefer our own SQL parsing
+        # over Snowflake's metadata.
+        # 1. For queries that use a stream, objects_modified returns $SYS_VIEW_X with no mapping.
+        #    We can check direct_objects_accessed to see if there is a stream used, and if so,
+        #    prefer doing SQL parsing over Snowflake's metadata.
+        # 2. For queries that create a view, objects_modified is empty and object_modified_by_ddl
+        #    contains the view name and columns. Because `object_modified_by_ddl` doesn't contain
+        #    source columns e.g. lineage information, we must do our own SQL parsing. We're mainly
+        #    focused on temporary views. It's fine if we parse a couple extra views, but in general
+        #    we want view definitions to come from Snowflake's schema metadata and not from query logs.
+
         has_stream_objects = any(
             obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
         )
+        is_create_view = query_type == QueryType.CREATE_VIEW
+        is_create_temp_view = is_create_view and self._has_temp_keyword(query_text)
+
+        if has_stream_objects or is_create_temp_view:
+            if has_stream_objects:
+                self.report.num_stream_queries_observed += 1
+            elif is_create_temp_view:
+                self.report.num_create_temp_view_queries_observed += 1

-        # If a stream is used, default to query parsing.
-        if has_stream_objects:
-            logger.debug("Found matching stream object")
             return ObservedQuery(
-                query=res["query_text"],
+                query=query_text,
                 session_id=res["session_id"],
-                timestamp=res["query_start_time"].astimezone(timezone.utc),
+                timestamp=timestamp,
                 user=user,
                 default_db=res["default_db"],
                 default_schema=res["default_schema"],
                 query_hash=get_query_fingerprint(
-                    res["query_text"], self.identifiers.platform, fast=True
+                    query_text, self.identifiers.platform, fast=True
                 ),
             )

@@ -502,25 +534,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             )
         )

-        timestamp: datetime = res["query_start_time"]
-        timestamp = timestamp.astimezone(timezone.utc)
-
-        # TODO need to map snowflake query types to ours
-        query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
-        )
-
         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
             # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
             # here
             query_id=get_query_fingerprint(
-                res["query_text"],
+                query_text,
                 self.identifiers.platform,
                 fast=True,
                 secondary_id=res["query_secondary_fingerprint"],
             ),
-            query_text=res["query_text"],
+            query_text=query_text,
             upstreams=upstreams,
             downstream=downstream,
             column_lineage=column_lineage,
@@ -543,7 +567,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         object_modified_by_ddl: dict,
         query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
-        timestamp = timestamp.astimezone(timezone.utc)
         if (
             object_modified_by_ddl["operationType"] == "ALTER"
             and query_type == "RENAME_TABLE"
@@ -43,13 +43,6 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
         ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
     )
-    ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
-        "("
-        f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
-        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
-        ")"
-    )

     @staticmethod
     def current_account() -> str:
@@ -9,6 +9,7 @@ import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Union

+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -551,11 +552,15 @@ class SnowflakeV2Source(
             and len(discovered_views) == 0
             and len(discovered_streams) == 0
         ):
-            [5 removed lines: the previous unconditional structured_reporter failure; full text not captured in this rendering]
+            if self.config.warn_no_datasets:
+                self.structured_reporter.warning(
+                    "No tables/views/streams found. Verify dataset permissions if Snowflake source is not empty.",
+                )
+            else:
+                self.structured_reporter.failure(
+                    GENERIC_PERMISSION_ERROR_KEY,
+                    "No tables/views/streams found. Verify dataset permissions in Snowflake.",
+                )

         self.discovered_datasets = (
             discovered_tables + discovered_views + discovered_streams
@@ -571,7 +576,11 @@ class SnowflakeV2Source(
             queries_extractor = SnowflakeQueriesExtractor(
                 connection=self.connection,
                 config=SnowflakeQueriesExtractorConfig(
-                    window=self.config,
+                    window=BaseTimeWindowConfig(
+                        start_time=self.config.start_time,
+                        end_time=self.config.end_time,
+                        bucket_duration=self.config.bucket_duration,
+                    ),
                     temporary_tables_pattern=self.config.temporary_tables_pattern,
                     include_lineage=self.config.include_table_lineage,
                     include_usage_statistics=self.config.include_usage_stats,
@@ -732,6 +741,8 @@ class SnowflakeV2Source(
         return None

     def is_standard_edition(self) -> bool:
+        if self.config.known_snowflake_edition is not None:
+            return self.config.known_snowflake_edition == SnowflakeEdition.STANDARD
         try:
             self.connection.query(SnowflakeQuery.show_tags())
             return False
@@ -323,6 +323,7 @@ class Partitionitem:
     "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
+@capability(SourceCapability.LINEAGE_FINE, "Supported for S3 tables")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
     """
@@ -139,7 +139,7 @@ class StoragePathParser:
             path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"

         elif platform == StoragePlatform.AZURE:
-            if scheme in ("abfs", "abfss"):
+            if scheme in ("abfs", "abfss", "wasbs"):
                 # Format: abfss://container@account.dfs.core.windows.net/path
                 container = parsed.netloc.split("@")[0]
                 path = f"{container}/{parsed.path.lstrip('/')}"

@@ -153,7 +153,7 @@ class StoragePathParser:

         elif platform == StoragePlatform.DBFS:
             # For DBFS, use path as-is
-            path = parsed.path.lstrip("/")
+            path = "/" + parsed.path.lstrip("/")

         elif platform == StoragePlatform.LOCAL:
             # For local files, use full path

@@ -169,7 +169,6 @@ class StoragePathParser:
         # Clean up the path
         path = path.rstrip("/")  # Remove trailing slashes
         path = re.sub(r"/+", "/", path)  # Normalize multiple slashes
-        path = f"/{path}"

         return platform, path

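The Azure branch now also accepts `wasbs://` URIs, DBFS paths keep their leading slash, and the global `/` prefix is no longer re-added at the end. A standalone illustration of the container/path split used in the Azure branch; the URL below is hypothetical:

```python
# Standalone illustration of the Azure container/path split shown above,
# applied to a hypothetical wasbs:// URI (same logic as the abfs/abfss case).
from urllib.parse import urlparse

parsed = urlparse(
    "wasbs://raw-data@myaccount.blob.core.windows.net/events/2024/part-0.parquet"
)
container = parsed.netloc.split("@")[0]
path = f"{container}/{parsed.path.lstrip('/')}"
assert container == "raw-data"
assert path == "raw-data/events/2024/part-0.parquet"
```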