acryl-datahub 1.1.0rc3__py3-none-any.whl → 1.1.1rc1__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -4,35 +4,62 @@ GCS_PREFIX = "gs://"
 
 
 def is_gcs_uri(uri: str) -> bool:
+    """
+    Check if a URI is a GCS URI (starts with gs://).
+
+    For more general URI handling, consider using object_store.get_object_store_for_uri.
+    """
     return uri.startswith(GCS_PREFIX)
 
 
 def get_gcs_prefix(gcs_uri: str) -> Optional[str]:
+    """
+    Get the GCS prefix (gs://) if the URI is a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_store_for_uri.
+    """
     if gcs_uri.startswith(GCS_PREFIX):
         return GCS_PREFIX
     return None
 
 
 def strip_gcs_prefix(gcs_uri: str) -> str:
-    # remove GCS prefix (gs://)
+    """
+    Remove the GCS prefix (gs://) from a GCS URI.
+
+    For more general URI handling, consider using the object_store module.
+
+    Args:
+        gcs_uri: A GCS URI starting with gs://
+
+    Returns:
+        The URI without the gs:// prefix
+
+    Raises:
+        ValueError: If the URI doesn't start with gs://
+    """
     prefix = get_gcs_prefix(gcs_uri)
     if not prefix:
-        raise ValueError(f"Not an GCS URI. Must start with prefix: {GCS_PREFIX}")
+        raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
 
     return gcs_uri[len(GCS_PREFIX) :]
 
 
-def get_gcs_bucket_name(path):
-    if not is_gcs_uri(path):
-        raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
-    return strip_gcs_prefix(path).split("/")[0]
-
-
 def get_gcs_bucket_relative_path(gcs_uri: str) -> str:
+    """
+    Get the path relative to the bucket from a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_key.
+    """
     return "/".join(strip_gcs_prefix(gcs_uri).split("/")[1:])
 
 
 def get_gcs_key_prefix(gcs_uri: str) -> str:
+    """
+    Get the key prefix (first path component after bucket) from a GCS URI.
+
+    For more general URI handling, consider using object_store.get_object_key.
+    """
     if not is_gcs_uri(gcs_uri):
-        raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
+        raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
     return strip_gcs_prefix(gcs_uri).split("/", maxsplit=1)[1]
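Note on the gcs_utils changes: behavior is unchanged apart from the removal of get_gcs_bucket_name and the corrected error messages; the rest is docstrings. A minimal usage sketch of the remaining helpers follows (the import path is assumed, and the URI is a hypothetical example):

from datahub.ingestion.source.gcs.gcs_utils import (  # assumed module path
    get_gcs_bucket_relative_path,
    is_gcs_uri,
    strip_gcs_prefix,
)

uri = "gs://my-bucket/raw/events/part-0001.parquet"  # hypothetical example URI
assert is_gcs_uri(uri)
assert strip_gcs_prefix(uri) == "my-bucket/raw/events/part-0001.parquet"
assert get_gcs_bucket_relative_path(uri) == "raw/events/part-0001.parquet"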
@@ -7,7 +7,7 @@ import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
@@ -43,6 +43,9 @@ from datahub.ingestion.source.aws.s3_util import (
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -197,12 +200,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any
 
     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
            config_option: config.dict().get(config_option)
            for config_option in config_options_to_report
@@ -605,6 +655,19 @@ class S3Source(StatefulIngestionSourceBase):
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )
 
+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
@@ -674,6 +737,7 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -1082,11 +1146,6 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
@@ -127,6 +127,8 @@ class SnowflakeQueriesExtractorReport(Report):
     sql_aggregator: Optional[SqlAggregatorReport] = None
 
     num_ddl_queries_dropped: int = 0
+    num_stream_queries_observed: int = 0
+    num_create_temp_view_queries_observed: int = 0
     num_users: int = 0
 
 
@@ -373,6 +375,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         if entry:
             yield entry
 
+    @classmethod
+    def _has_temp_keyword(cls, query_text: str) -> bool:
+        return (
+            re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
+            or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
+        )
+
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
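Note on _has_temp_keyword: the \b word boundaries mean only whole-word TEMP/TEMPORARY tokens match, so identifiers that merely contain "temp" do not trigger the temporary-view handling. A standalone sketch of the same check, with hypothetical query strings:

import re

def has_temp_keyword(query_text: str) -> bool:
    # Same pattern as the classmethod above: whole-word, case-insensitive.
    return (
        re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
        or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
    )

assert has_temp_keyword("CREATE TEMP VIEW v1 AS SELECT 1")
assert has_temp_keyword("create temporary view v2 as select 1")
assert not has_temp_keyword("CREATE VIEW temp_sales AS SELECT 1")  # "temp_sales" is a single word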
@@ -389,6 +398,15 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             key = key.lower()
             res[key] = value
 
+        timestamp: datetime = res["query_start_time"]
+        timestamp = timestamp.astimezone(timezone.utc)
+
+        # TODO need to map snowflake query types to ours
+        query_text: str = res["query_text"]
+        query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
+            res["query_type"], QueryType.UNKNOWN
+        )
+
         direct_objects_accessed = res["direct_objects_accessed"]
         objects_modified = res["objects_modified"]
         object_modified_by_ddl = res["object_modified_by_ddl"]
@@ -399,9 +417,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             "Error fetching ddl lineage from Snowflake"
         ):
             known_ddl_entry = self.parse_ddl_query(
-                res["query_text"],
+                query_text,
                 res["session_id"],
-                res["query_start_time"],
+                timestamp,
                 object_modified_by_ddl,
                 res["query_type"],
             )
@@ -419,24 +437,38 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )
 
-        # Use direct_objects_accessed instead objects_modified
-        # objects_modified returns $SYS_VIEW_X with no mapping
+        # There are a couple cases when we'd want to prefer our own SQL parsing
+        # over Snowflake's metadata.
+        # 1. For queries that use a stream, objects_modified returns $SYS_VIEW_X with no mapping.
+        #    We can check direct_objects_accessed to see if there is a stream used, and if so,
+        #    prefer doing SQL parsing over Snowflake's metadata.
+        # 2. For queries that create a view, objects_modified is empty and object_modified_by_ddl
+        #    contains the view name and columns. Because `object_modified_by_ddl` doesn't contain
+        #    source columns e.g. lineage information, we must do our own SQL parsing. We're mainly
+        #    focused on temporary views. It's fine if we parse a couple extra views, but in general
+        #    we want view definitions to come from Snowflake's schema metadata and not from query logs.
+
         has_stream_objects = any(
             obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
         )
+        is_create_view = query_type == QueryType.CREATE_VIEW
+        is_create_temp_view = is_create_view and self._has_temp_keyword(query_text)
+
+        if has_stream_objects or is_create_temp_view:
+            if has_stream_objects:
+                self.report.num_stream_queries_observed += 1
+            elif is_create_temp_view:
+                self.report.num_create_temp_view_queries_observed += 1
 
-        # If a stream is used, default to query parsing.
-        if has_stream_objects:
-            logger.debug("Found matching stream object")
             return ObservedQuery(
-                query=res["query_text"],
+                query=query_text,
                 session_id=res["session_id"],
-                timestamp=res["query_start_time"].astimezone(timezone.utc),
+                timestamp=timestamp,
                 user=user,
                 default_db=res["default_db"],
                 default_schema=res["default_schema"],
                 query_hash=get_query_fingerprint(
-                    res["query_text"], self.identifiers.platform, fast=True
+                    query_text, self.identifiers.platform, fast=True
                 ),
             )
 
@@ -502,25 +534,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )
 
-        timestamp: datetime = res["query_start_time"]
-        timestamp = timestamp.astimezone(timezone.utc)
-
-        # TODO need to map snowflake query types to ours
-        query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
-        )
-
         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
             # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
             # here
             query_id=get_query_fingerprint(
-                res["query_text"],
+                query_text,
                 self.identifiers.platform,
                 fast=True,
                 secondary_id=res["query_secondary_fingerprint"],
             ),
-            query_text=res["query_text"],
+            query_text=query_text,
             upstreams=upstreams,
             downstream=downstream,
             column_lineage=column_lineage,
@@ -543,7 +567,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         object_modified_by_ddl: dict,
         query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
-        timestamp = timestamp.astimezone(timezone.utc)
         if (
             object_modified_by_ddl["operationType"] == "ALTER"
             and query_type == "RENAME_TABLE"
@@ -43,13 +43,6 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
         ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
     )
-    ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
-        "("
-        f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
-        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
-        ")"
-    )
 
     @staticmethod
     def current_account() -> str:
@@ -139,7 +139,7 @@ class StoragePathParser:
             path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"
 
         elif platform == StoragePlatform.AZURE:
-            if scheme in ("abfs", "abfss"):
+            if scheme in ("abfs", "abfss", "wasbs"):
                 # Format: abfss://container@account.dfs.core.windows.net/path
                 container = parsed.netloc.split("@")[0]
                 path = f"{container}/{parsed.path.lstrip('/')}"
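Note on the Azure change: wasbs:// URIs now go through the same container@account branch as abfs/abfss. An illustrative parse of a hypothetical wasbs URI, following the branch shown above:

from urllib.parse import urlparse

uri = "wasbs://landing@myaccount.blob.core.windows.net/raw/file.csv"  # hypothetical URI
parsed = urlparse(uri)
container = parsed.netloc.split("@")[0]
path = f"{container}/{parsed.path.lstrip('/')}"
assert path == "landing/raw/file.csv"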
@@ -153,7 +153,7 @@ class StoragePathParser:
 
         elif platform == StoragePlatform.DBFS:
             # For DBFS, use path as-is
-            path = parsed.path.lstrip("/")
+            path = "/" + parsed.path.lstrip("/")
 
         elif platform == StoragePlatform.LOCAL:
             # For local files, use full path
@@ -169,7 +169,6 @@ class StoragePathParser:
         # Clean up the path
         path = path.rstrip("/")  # Remove trailing slashes
         path = re.sub(r"/+", "/", path)  # Normalize multiple slashes
-        path = f"/{path}"
 
         return platform, path
 
@@ -109,7 +109,7 @@ class ObservedQuery:
     query_hash: Optional[str] = None
     usage_multiplier: int = 1
 
-    # Use this to store addtitional key-value information about query for debugging
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
 
 