acryl-datahub 1.1.0rc4__py3-none-any.whl → 1.1.1rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/METADATA +2480 -2480
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/RECORD +19 -16
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/ingestion/source/apply/datahub_apply.py +4 -4
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
- datahub/ingestion/source/data_lake_common/object_store.py +644 -0
- datahub/ingestion/source/gcs/gcs_source.py +22 -7
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/s3/source.py +65 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
- datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
- datahub/ingestion/source/sql/hive.py +2 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/sdk_v2_helpers.py +12 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.1rc2.dist-info}/top_level.txt +0 -0
@@ -4,35 +4,62 @@ GCS_PREFIX = "gs://"
 
 
 def is_gcs_uri(uri: str) -> bool:
+    """
+    Check if a URI is a GCS URI (starts with gs://).
+
+    For more general URI handling, consider using object_store.get_object_store_for_uri.
+    """
     return uri.startswith(GCS_PREFIX)
 
 
 def get_gcs_prefix(gcs_uri: str) -> Optional[str]:
+    """
+    Get the GCS prefix (gs://) if the URI is a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_store_for_uri.
+    """
     if gcs_uri.startswith(GCS_PREFIX):
         return GCS_PREFIX
     return None
 
 
 def strip_gcs_prefix(gcs_uri: str) -> str:
-
+    """
+    Remove the GCS prefix (gs://) from a GCS URI.
+
+    For more general URI handling, consider using the object_store module.
+
+    Args:
+        gcs_uri: A GCS URI starting with gs://
+
+    Returns:
+        The URI without the gs:// prefix
+
+    Raises:
+        ValueError: If the URI doesn't start with gs://
+    """
     prefix = get_gcs_prefix(gcs_uri)
     if not prefix:
-        raise ValueError(f"Not
+        raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
 
     return gcs_uri[len(GCS_PREFIX) :]
 
 
-def get_gcs_bucket_name(path):
-    if not is_gcs_uri(path):
-        raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
-    return strip_gcs_prefix(path).split("/")[0]
-
-
 def get_gcs_bucket_relative_path(gcs_uri: str) -> str:
+    """
+    Get the path relative to the bucket from a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_key.
+    """
     return "/".join(strip_gcs_prefix(gcs_uri).split("/")[1:])
 
 
 def get_gcs_key_prefix(gcs_uri: str) -> str:
+    """
+    Get the key prefix (first path component after bucket) from a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_key.
+    """
     if not is_gcs_uri(gcs_uri):
-        raise ValueError(f"Not a GCS URI. Must start with
+        raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
     return strip_gcs_prefix(gcs_uri).split("/", maxsplit=1)[1]
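Note: the hunk above only adds docstrings, fixes the error-message typo, and drops the unused get_gcs_bucket_name helper; the behavior of the remaining helpers is unchanged. A minimal usage sketch (the bucket and key are made up for illustration):

from datahub.ingestion.source.gcs.gcs_utils import (
    get_gcs_bucket_relative_path,
    is_gcs_uri,
    strip_gcs_prefix,
)

uri = "gs://my-bucket/raw/events/file.parquet"

assert is_gcs_uri(uri)  # simply checks the gs:// prefix
assert strip_gcs_prefix(uri) == "my-bucket/raw/events/file.parquet"
assert get_gcs_bucket_relative_path(uri) == "raw/events/file.parquet"
# Non-GCS URIs still raise ValueError from strip_gcs_prefix and get_gcs_key_prefix.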
@@ -7,7 +7,7 @@ import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
@@ -43,6 +43,9 @@ from datahub.ingestion.source.aws.s3_util import (
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -197,12 +200,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any
 
     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
             config_option: config.dict().get(config_option)
             for config_option in config_options_to_report
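Note: this hunk moves the ContainerWUCreator construction into __init__ (it is removed from get_workunits_internal in a later hunk) and selects an object-store adapter up front. The sketch below restates the selection branching as a hypothetical pick_adapter helper for readability; it does not call the real create_object_store_adapter or register_customization APIs, and the helper name and return shape are illustrative only.

from typing import Optional, Tuple

def pick_adapter(
    is_s3_platform: bool,
    platform: Optional[str],
    aws_region: Optional[str],
    aws_endpoint_url: Optional[str],
) -> Tuple[str, Optional[str]]:
    """Return (adapter_kind, effective_aws_region) following the diff's branching."""
    if not is_s3_platform:
        # Local files and other platforms fall back to a default adapter.
        return (platform or "file", None)

    endpoint = (aws_endpoint_url or "").lower()
    # Test endpoints (localstack, GCS's S3-compatibility layer) get a fixed region.
    if "localstack" in endpoint or "storage.googleapis.com" in endpoint:
        aws_region = "us-east-1"

    # GCS reached through the S3 compatibility layer uses the GCS adapter, but the
    # real code keeps s3://bucket/key paths via the create_s3_path customization.
    if "storage.googleapis.com" in endpoint:
        return ("gcs", None)
    return ("s3", aws_region)

assert pick_adapter(True, "s3", "eu-west-1", None) == ("s3", "eu-west-1")
assert pick_adapter(True, "s3", None, "http://localstack:4566") == ("s3", "us-east-1")
assert pick_adapter(True, "s3", "eu-west-1", "https://storage.googleapis.com") == ("gcs", None)
assert pick_adapter(False, "file", None, None) == ("file", None)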
@@ -605,6 +655,19 @@ class S3Source(StatefulIngestionSourceBase):
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )
 
+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
@@ -674,6 +737,7 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -1082,11 +1146,6 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
@@ -127,6 +127,8 @@ class SnowflakeQueriesExtractorReport(Report):
     sql_aggregator: Optional[SqlAggregatorReport] = None
 
     num_ddl_queries_dropped: int = 0
+    num_stream_queries_observed: int = 0
+    num_create_temp_view_queries_observed: int = 0
     num_users: int = 0
 
 
@@ -373,6 +375,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             if entry:
                 yield entry
 
+    @classmethod
+    def _has_temp_keyword(cls, query_text: str) -> bool:
+        return (
+            re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
+            or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
+        )
+
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
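Note: _has_temp_keyword relies on word boundaries, so TEMP or TEMPORARY must appear as a whole word in the query text. A quick standalone check mirroring the regexes above:

import re

def has_temp_keyword(query_text: str) -> bool:
    # Same logic as SnowflakeQueriesExtractor._has_temp_keyword in the hunk above.
    return (
        re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
        or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
    )

assert has_temp_keyword("CREATE TEMP VIEW v AS SELECT 1")
assert has_temp_keyword("create temporary view v as select 1")
assert not has_temp_keyword("CREATE VIEW tempting_name AS SELECT 1")  # \b avoids substring hits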
@@ -389,6 +398,15 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             key = key.lower()
             res[key] = value
 
+        timestamp: datetime = res["query_start_time"]
+        timestamp = timestamp.astimezone(timezone.utc)
+
+        # TODO need to map snowflake query types to ours
+        query_text: str = res["query_text"]
+        query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
+            res["query_type"], QueryType.UNKNOWN
+        )
+
         direct_objects_accessed = res["direct_objects_accessed"]
         objects_modified = res["objects_modified"]
         object_modified_by_ddl = res["object_modified_by_ddl"]
@@ -399,9 +417,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 "Error fetching ddl lineage from Snowflake"
             ):
                 known_ddl_entry = self.parse_ddl_query(
-
+                    query_text,
                     res["session_id"],
-
+                    timestamp,
                     object_modified_by_ddl,
                     res["query_type"],
                 )
@@ -419,24 +437,38 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             )
         )
 
-        #
-        #
+        # There are a couple cases when we'd want to prefer our own SQL parsing
+        # over Snowflake's metadata.
+        # 1. For queries that use a stream, objects_modified returns $SYS_VIEW_X with no mapping.
+        #    We can check direct_objects_accessed to see if there is a stream used, and if so,
+        #    prefer doing SQL parsing over Snowflake's metadata.
+        # 2. For queries that create a view, objects_modified is empty and object_modified_by_ddl
+        #    contains the view name and columns. Because `object_modified_by_ddl` doesn't contain
+        #    source columns e.g. lineage information, we must do our own SQL parsing. We're mainly
+        #    focused on temporary views. It's fine if we parse a couple extra views, but in general
+        #    we want view definitions to come from Snowflake's schema metadata and not from query logs.
+
         has_stream_objects = any(
             obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
         )
+        is_create_view = query_type == QueryType.CREATE_VIEW
+        is_create_temp_view = is_create_view and self._has_temp_keyword(query_text)
+
+        if has_stream_objects or is_create_temp_view:
+            if has_stream_objects:
+                self.report.num_stream_queries_observed += 1
+            elif is_create_temp_view:
+                self.report.num_create_temp_view_queries_observed += 1
 
-        # If a stream is used, default to query parsing.
-        if has_stream_objects:
-            logger.debug("Found matching stream object")
             return ObservedQuery(
-                query=
+                query=query_text,
                 session_id=res["session_id"],
-                timestamp=
+                timestamp=timestamp,
                 user=user,
                 default_db=res["default_db"],
                 default_schema=res["default_schema"],
                 query_hash=get_query_fingerprint(
-
+                    query_text, self.identifiers.platform, fast=True
                 ),
             )
 
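Note: combined with the two previous hunks, an audit-log row is now routed to our own SQL parsing (ObservedQuery) instead of Snowflake's access-history lineage whenever it touches a stream or creates a TEMP/TEMPORARY view. A compact restatement of that predicate, with a stand-in QueryType enum (the real one lives in datahub's sql_parsing module) and the boolean inputs passed in directly:

from enum import Enum

class QueryType(Enum):  # illustrative stand-in, not the datahub enum
    CREATE_VIEW = "CREATE_VIEW"
    UNKNOWN = "UNKNOWN"

def prefers_sql_parsing(direct_objects_accessed, query_type, is_temp_view):
    has_stream_objects = any(
        obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
    )
    return has_stream_objects or (query_type == QueryType.CREATE_VIEW and is_temp_view)

assert prefers_sql_parsing([{"objectDomain": "Stream"}], QueryType.UNKNOWN, False)
assert prefers_sql_parsing([], QueryType.CREATE_VIEW, True)
assert not prefers_sql_parsing([{"objectDomain": "Table"}], QueryType.UNKNOWN, False)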
@@ -502,25 +534,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             )
         )
 
-        timestamp: datetime = res["query_start_time"]
-        timestamp = timestamp.astimezone(timezone.utc)
-
-        # TODO need to map snowflake query types to ours
-        query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
-        )
-
         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
             # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
             # here
             query_id=get_query_fingerprint(
-
+                query_text,
                 self.identifiers.platform,
                 fast=True,
                 secondary_id=res["query_secondary_fingerprint"],
             ),
-            query_text=
+            query_text=query_text,
             upstreams=upstreams,
             downstream=downstream,
             column_lineage=column_lineage,
@@ -543,7 +567,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         object_modified_by_ddl: dict,
         query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
-        timestamp = timestamp.astimezone(timezone.utc)
         if (
             object_modified_by_ddl["operationType"] == "ALTER"
             and query_type == "RENAME_TABLE"
@@ -43,13 +43,6 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
         ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
     )
-    ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
-        "("
-        f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
-        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
-        ")"
-    )
 
     @staticmethod
     def current_account() -> str:
@@ -139,7 +139,7 @@ class StoragePathParser:
             path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
 
         elif platform == StoragePlatform.AZURE:
-            if scheme in ("abfs", "abfss"):
+            if scheme in ("abfs", "abfss", "wasbs"):
                 # Format: abfss://container@account.dfs.core.windows.net/path
                 container = parsed.netloc.split("@")[0]
                 path = f"{container}/{parsed.path.lstrip('/')}"
@@ -153,7 +153,7 @@ class StoragePathParser:
 
         elif platform == StoragePlatform.DBFS:
             # For DBFS, use path as-is
-            path = parsed.path.lstrip("/")
+            path = "/" + parsed.path.lstrip("/")
 
         elif platform == StoragePlatform.LOCAL:
             # For local files, use full path
@@ -169,7 +169,6 @@ class StoragePathParser:
         # Clean up the path
         path = path.rstrip("/")  # Remove trailing slashes
         path = re.sub(r"/+", "/", path)  # Normalize multiple slashes
-        path = f"/{path}"
 
         return platform, path
 
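Note: the net effect of the three StoragePathParser hunks is that wasbs:// URIs are handled like abfs/abfss, DBFS paths keep an explicit leading slash, and the unconditional leading "/" is no longer prepended for other platforms. The sketch below is an illustrative re-creation of that normalization, not the datahub class itself (the real method's signature is not shown in this extract):

import re
from urllib.parse import urlparse

def normalize(uri: str) -> str:
    parsed = urlparse(uri)
    if parsed.scheme in ("abfs", "abfss", "wasbs"):
        # container@account.dfs.core.windows.net/path -> container/path
        container = parsed.netloc.split("@")[0]
        path = f"{container}/{parsed.path.lstrip('/')}"
    elif parsed.scheme == "dbfs":
        # DBFS keeps its leading slash (second hunk above)
        path = "/" + parsed.path.lstrip("/")
    else:
        path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
    path = path.rstrip("/")             # remove trailing slashes
    path = re.sub(r"/+", "/", path)     # normalize repeated slashes
    # The old unconditional `path = f"/{path}"` is gone, so non-DBFS paths
    # no longer gain a leading slash.
    return path

assert normalize("wasbs://container@account.blob.core.windows.net/dir/file.csv") == "container/dir/file.csv"
assert normalize("dbfs:/mnt/data/table/") == "/mnt/data/table"
assert normalize("s3://bucket/dir//file.csv") == "bucket/dir/file.csv"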
@@ -109,7 +109,7 @@ class ObservedQuery:
     query_hash: Optional[str] = None
     usage_multiplier: int = 1
 
-    # Use this to store
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
 
 