acryl-datahub 1.1.0rc4-py3-none-any.whl → 1.1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (87)
  1. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2414 -2412
  2. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
  3. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +9 -8
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/restricted_text.py +247 -0
  10. datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
  11. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  12. datahub/cli/delete_cli.py +4 -4
  13. datahub/cli/ingest_cli.py +9 -1
  14. datahub/emitter/mce_builder.py +3 -1
  15. datahub/emitter/response_helper.py +86 -1
  16. datahub/emitter/rest_emitter.py +1 -1
  17. datahub/ingestion/graph/client.py +3 -3
  18. datahub/ingestion/source/apply/datahub_apply.py +4 -4
  19. datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
  20. datahub/ingestion/source/data_lake_common/object_store.py +644 -0
  21. datahub/ingestion/source/datahub/config.py +11 -0
  22. datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
  23. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  24. datahub/ingestion/source/dbt/dbt_common.py +30 -11
  25. datahub/ingestion/source/gcs/gcs_source.py +22 -7
  26. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  27. datahub/ingestion/source/hex/query_fetcher.py +9 -3
  28. datahub/ingestion/source/openapi.py +12 -0
  29. datahub/ingestion/source/openapi_parser.py +56 -37
  30. datahub/ingestion/source/s3/source.py +65 -6
  31. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
  33. datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
  34. datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
  35. datahub/ingestion/source/sql/athena.py +1 -0
  36. datahub/ingestion/source/sql/hive.py +2 -3
  37. datahub/ingestion/source/sql/sql_common.py +98 -34
  38. datahub/ingestion/source/sql/sql_types.py +5 -2
  39. datahub/ingestion/source/unity/config.py +5 -0
  40. datahub/ingestion/source/unity/proxy.py +117 -0
  41. datahub/ingestion/source/unity/source.py +167 -15
  42. datahub/ingestion/source/unity/tag_entities.py +295 -0
  43. datahub/metadata/_internal_schema_classes.py +667 -522
  44. datahub/metadata/_urns/urn_defs.py +1804 -1748
  45. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  46. datahub/metadata/schema.avsc +17358 -17584
  47. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  48. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  49. datahub/metadata/schemas/Applications.avsc +38 -0
  50. datahub/metadata/schemas/ChartKey.avsc +1 -0
  51. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  52. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  53. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  54. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  55. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  56. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  57. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  58. datahub/metadata/schemas/DatasetKey.avsc +1 -0
  59. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  60. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  61. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  62. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  63. datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
  64. datahub/metadata/schemas/MLModelKey.avsc +1 -0
  65. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  66. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  67. datahub/metadata/schemas/__init__.py +3 -3
  68. datahub/sdk/__init__.py +6 -0
  69. datahub/sdk/_all_entities.py +11 -0
  70. datahub/sdk/_shared.py +118 -1
  71. datahub/sdk/chart.py +315 -0
  72. datahub/sdk/container.py +7 -0
  73. datahub/sdk/dashboard.py +432 -0
  74. datahub/sdk/dataflow.py +309 -0
  75. datahub/sdk/datajob.py +342 -0
  76. datahub/sdk/dataset.py +8 -2
  77. datahub/sdk/entity_client.py +90 -2
  78. datahub/sdk/lineage_client.py +681 -82
  79. datahub/sdk/main_client.py +27 -8
  80. datahub/sdk/mlmodel.py +101 -38
  81. datahub/sdk/mlmodelgroup.py +7 -0
  82. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  83. datahub/testing/mce_helpers.py +421 -0
  84. datahub/testing/sdk_v2_helpers.py +18 -0
  85. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
  86. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
  87. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0
@@ -59,17 +59,21 @@ def request_call(
     username: Optional[str] = None,
     password: Optional[str] = None,
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> requests.Response:
     headers = {"accept": "application/json"}
     if username is not None and password is not None:
         return requests.get(
-            url, headers=headers, auth=HTTPBasicAuth(username, password)
+            url,
+            headers=headers,
+            auth=HTTPBasicAuth(username, password),
+            verify=verify_ssl,
         )
     elif token is not None:
         headers["Authorization"] = f"{token}"
-        return requests.get(url, proxies=proxies, headers=headers)
+        return requests.get(url, proxies=proxies, headers=headers, verify=verify_ssl)
     else:
-        return requests.get(url, headers=headers)
+        return requests.get(url, headers=headers, verify=verify_ssl)


 def get_swag_json(
@@ -79,10 +83,16 @@ def get_swag_json(
     password: Optional[str] = None,
     swagger_file: str = "",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> Dict:
     tot_url = url + swagger_file
     response = request_call(
-        url=tot_url, token=token, username=username, password=password, proxies=proxies
+        url=tot_url,
+        token=token,
+        username=username,
+        password=password,
+        proxies=proxies,
+        verify_ssl=verify_ssl,
     )

     if response.status_code != 200:
@@ -127,37 +137,45 @@ def get_endpoints(sw_dict: dict) -> dict:
     check_sw_version(sw_dict)

     for p_k, p_o in sw_dict["paths"].items():
-        method = list(p_o)[0]
-        if "200" in p_o[method]["responses"]:
-            base_res = p_o[method]["responses"]["200"]
-        elif 200 in p_o[method]["responses"]:
-            # if you read a plain yml file the 200 will be an integer
-            base_res = p_o[method]["responses"][200]
-        else:
-            # the endpoint does not have a 200 response
-            continue
-
-        if "description" in p_o[method]:
-            desc = p_o[method]["description"]
-        elif "summary" in p_o[method]:
-            desc = p_o[method]["summary"]
-        else:  # still testing
-            desc = ""
-
-        try:
-            tags = p_o[method]["tags"]
-        except KeyError:
-            tags = []
-
-        url_details[p_k] = {"description": desc, "tags": tags, "method": method}
-
-        example_data = check_for_api_example_data(base_res, p_k)
-        if example_data:
-            url_details[p_k]["data"] = example_data
-
-        # checking whether there are defined parameters to execute the call...
-        if "parameters" in p_o[method]:
-            url_details[p_k]["parameters"] = p_o[method]["parameters"]
+        for method, method_spec in p_o.items():
+            # skip non-method keys like "parameters"
+            if method.lower() not in [
+                "get",
+                "post",
+                "put",
+                "delete",
+                "patch",
+                "options",
+                "head",
+            ]:
+                continue
+
+            responses = method_spec.get("responses", {})
+            base_res = responses.get("200") or responses.get(200)
+            if not base_res:
+                # if there is no 200 response, we skip this method
+                continue
+
+            # if the description is not present, we will use the summary
+            # if both are not present, we will use an empty string
+            desc = method_spec.get("description") or method_spec.get("summary", "")
+
+            # if the tags are not present, we will use an empty list
+            tags = method_spec.get("tags", [])
+
+            url_details[p_k] = {
+                "description": desc,
+                "tags": tags,
+                "method": method.upper(),
+            }
+
+            example_data = check_for_api_example_data(base_res, p_k)
+            if example_data:
+                url_details[p_k]["data"] = example_data
+
+            # checking whether there are defined parameters to execute the call...
+            if "parameters" in p_o[method]:
+                url_details[p_k]["parameters"] = p_o[method]["parameters"]

     return dict(sorted(url_details.items()))

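For reference, here is a standalone sketch of the per-method iteration introduced in the get_endpoints hunk above. The path item below is toy data (not DataHub code); it only illustrates that non-method keys such as "parameters" are skipped and that a 200 response may be keyed by "200" (JSON) or 200 (plain YAML).

# Toy OpenAPI path item; key names follow the OpenAPI spec.
HTTP_METHODS = {"get", "post", "put", "delete", "patch", "options", "head"}

path_item = {
    "parameters": [{"name": "id", "in": "path", "required": True}],
    "get": {"summary": "Fetch a pet", "responses": {200: {"description": "ok"}}},
    "post": {"description": "Create a pet", "responses": {"201": {"description": "created"}}},
}

for method, method_spec in path_item.items():
    if method.lower() not in HTTP_METHODS:
        continue  # skip shared "parameters" entries
    responses = method_spec.get("responses", {})
    base_res = responses.get("200") or responses.get(200)
    if not base_res:
        continue  # only endpoints with a 200 response are kept
    desc = method_spec.get("description") or method_spec.get("summary", "")
    print(method.upper(), desc)  # -> GET Fetch a pet
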
@@ -358,6 +376,7 @@ def get_tok(
     tok_url: str = "",
     method: str = "post",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> str:
     """
     Trying to post username/password to get auth.
@@ -368,7 +387,7 @@ def get_tok(
         # this will make a POST call with username and password
         data = {"username": username, "password": password, "maxDuration": True}
         # url2post = url + "api/authenticate/"
-        response = requests.post(url4req, proxies=proxies, json=data)
+        response = requests.post(url4req, proxies=proxies, json=data, verify=verify_ssl)
         if response.status_code == 200:
             cont = json.loads(response.content)
             if "token" in cont:  # other authentication scheme
@@ -377,7 +396,7 @@ def get_tok(
                 token = f"Bearer {cont['tokens']['access']}"
     elif method == "get":
         # this will make a GET call with username and password
-        response = requests.get(url4req)
+        response = requests.get(url4req, verify=verify_ssl)
         if response.status_code == 200:
             cont = json.loads(response.content)
             token = cont["token"]
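The openapi_parser.py hunks above thread a new verify_ssl flag through request_call, get_swag_json, and get_tok. A minimal usage sketch, assuming only the signatures visible in this diff; the URL and token are placeholders:

from datahub.ingestion.source.openapi_parser import get_swag_json

# Fetch a Swagger/OpenAPI spec from an endpoint with a self-signed certificate.
spec = get_swag_json(
    url="https://api.internal.example.com/",   # hypothetical host
    token="Bearer my-token",                    # passed through as the Authorization header
    swagger_file="openapi.json",
    proxies=None,
    verify_ssl=False,  # mirrors requests' verify=False: skip TLS certificate verification
)
print(spec.get("openapi") or spec.get("swagger"))
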
@@ -7,7 +7,7 @@ import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse

 import smart_open.compression as so_compression
@@ -43,6 +43,9 @@ from datahub.ingestion.source.aws.s3_util import (
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -197,12 +200,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any

     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
             config_option: config.dict().get(config_option)
             for config_option in config_options_to_report
@@ -605,6 +655,19 @@ class S3Source(StatefulIngestionSourceBase):
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )

+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
@@ -674,6 +737,7 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -1082,11 +1146,6 @@ class S3Source(StatefulIngestionSourceBase):
         )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
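The s3/source.py hunks above move container work-unit creation into __init__ and delegate external URL generation to the new object_store module (added in this release, not shown here). A minimal sketch of the adapter wiring, using only the calls visible in these hunks; the region value is a placeholder:

from datahub.ingestion.source.data_lake_common.object_store import (
    create_object_store_adapter,
)

# Plain S3: the region drives the generated external/console URLs.
s3_adapter = create_object_store_adapter("s3", aws_region="eu-west-1")

# GCS reached through an S3-compatible endpoint: keep s3:// dataset paths but
# let the GCS adapter produce the external URLs, as the source does above.
gcs_adapter = create_object_store_adapter("gcs")
gcs_adapter.register_customization(
    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
)
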
@@ -22,6 +22,7 @@ from datahub.ingestion.api.incremental_properties_helper import (
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
+from datahub.ingestion.source.snowflake.constants import SnowflakeEdition
 from datahub.ingestion.source.snowflake.snowflake_connection import (
     SnowflakeConnectionConfig,
 )
@@ -326,6 +327,18 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )

+    known_snowflake_edition: Optional[SnowflakeEdition] = Field(
+        default=None,
+        description="Explicitly specify the Snowflake edition (STANDARD or ENTERPRISE). If unset, the edition will be inferred automatically using 'SHOW TAGS'.",
+    )
+
+    # Allows empty containers to be ingested before datasets are added, avoiding permission errors
+    warn_no_datasets: bool = Field(
+        hidden_from_docs=True,
+        default=False,
+        description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
+    )
+
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"
@@ -127,6 +127,8 @@ class SnowflakeQueriesExtractorReport(Report):
     sql_aggregator: Optional[SqlAggregatorReport] = None

     num_ddl_queries_dropped: int = 0
+    num_stream_queries_observed: int = 0
+    num_create_temp_view_queries_observed: int = 0
     num_users: int = 0


@@ -373,6 +375,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             if entry:
                 yield entry

+    @classmethod
+    def _has_temp_keyword(cls, query_text: str) -> bool:
+        return (
+            re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
+            or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
+        )
+
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
     ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
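A standalone illustration of the word-boundary matching that _has_temp_keyword relies on: TEMP/TEMPORARY must appear as whole words, so identifiers that merely contain "temp" do not trigger the temporary-view path. Toy queries, plain re, no DataHub imports:

import re


def has_temp_keyword(query_text: str) -> bool:
    # Same pattern as the classmethod added above.
    return (
        re.search(r"\bTEMP\b", query_text, re.IGNORECASE) is not None
        or re.search(r"\bTEMPORARY\b", query_text, re.IGNORECASE) is not None
    )


assert has_temp_keyword("CREATE TEMP VIEW v AS SELECT 1")
assert has_temp_keyword("create temporary view v as select 1")
assert not has_temp_keyword("CREATE VIEW temperature_readings AS SELECT 1")
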
@@ -389,6 +398,15 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             key = key.lower()
             res[key] = value

+        timestamp: datetime = res["query_start_time"]
+        timestamp = timestamp.astimezone(timezone.utc)
+
+        # TODO need to map snowflake query types to ours
+        query_text: str = res["query_text"]
+        query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
+            res["query_type"], QueryType.UNKNOWN
+        )
+
         direct_objects_accessed = res["direct_objects_accessed"]
         objects_modified = res["objects_modified"]
         object_modified_by_ddl = res["object_modified_by_ddl"]
@@ -399,9 +417,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 "Error fetching ddl lineage from Snowflake"
             ):
                 known_ddl_entry = self.parse_ddl_query(
-                    res["query_text"],
+                    query_text,
                     res["session_id"],
-                    res["query_start_time"],
+                    timestamp,
                     object_modified_by_ddl,
                     res["query_type"],
                 )
@@ -419,24 +437,38 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )

-        # Use direct_objects_accessed instead objects_modified
-        # objects_modified returns $SYS_VIEW_X with no mapping
+        # There are a couple cases when we'd want to prefer our own SQL parsing
+        # over Snowflake's metadata.
+        # 1. For queries that use a stream, objects_modified returns $SYS_VIEW_X with no mapping.
+        #    We can check direct_objects_accessed to see if there is a stream used, and if so,
+        #    prefer doing SQL parsing over Snowflake's metadata.
+        # 2. For queries that create a view, objects_modified is empty and object_modified_by_ddl
+        #    contains the view name and columns. Because `object_modified_by_ddl` doesn't contain
+        #    source columns e.g. lineage information, we must do our own SQL parsing. We're mainly
+        #    focused on temporary views. It's fine if we parse a couple extra views, but in general
+        #    we want view definitions to come from Snowflake's schema metadata and not from query logs.
+
         has_stream_objects = any(
             obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
         )
+        is_create_view = query_type == QueryType.CREATE_VIEW
+        is_create_temp_view = is_create_view and self._has_temp_keyword(query_text)
+
+        if has_stream_objects or is_create_temp_view:
+            if has_stream_objects:
+                self.report.num_stream_queries_observed += 1
+            elif is_create_temp_view:
+                self.report.num_create_temp_view_queries_observed += 1

-        # If a stream is used, default to query parsing.
-        if has_stream_objects:
-            logger.debug("Found matching stream object")
             return ObservedQuery(
-                query=res["query_text"],
+                query=query_text,
                 session_id=res["session_id"],
-                timestamp=res["query_start_time"].astimezone(timezone.utc),
+                timestamp=timestamp,
                 user=user,
                 default_db=res["default_db"],
                 default_schema=res["default_schema"],
                 query_hash=get_query_fingerprint(
-                    res["query_text"], self.identifiers.platform, fast=True
+                    query_text, self.identifiers.platform, fast=True
                 ),
             )

@@ -502,25 +534,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )

-        timestamp: datetime = res["query_start_time"]
-        timestamp = timestamp.astimezone(timezone.utc)
-
-        # TODO need to map snowflake query types to ours
-        query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
-        )
-
         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
             # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
             # here
             query_id=get_query_fingerprint(
-                res["query_text"],
+                query_text,
                 self.identifiers.platform,
                 fast=True,
                 secondary_id=res["query_secondary_fingerprint"],
             ),
-            query_text=res["query_text"],
+            query_text=query_text,
             upstreams=upstreams,
             downstream=downstream,
             column_lineage=column_lineage,
@@ -543,7 +567,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         object_modified_by_ddl: dict,
         query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
-        timestamp = timestamp.astimezone(timezone.utc)
         if (
             object_modified_by_ddl["operationType"] == "ALTER"
             and query_type == "RENAME_TABLE"
@@ -43,13 +43,6 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
         ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
     )
-    ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
-        "("
-        f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
-        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
-        ")"
-    )

     @staticmethod
     def current_account() -> str:
@@ -9,6 +9,7 @@ import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Union

+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -551,11 +552,15 @@ class SnowflakeV2Source(
             and len(discovered_views) == 0
             and len(discovered_streams) == 0
         ):
-            self.structured_reporter.failure(
-                GENERIC_PERMISSION_ERROR_KEY,
-                "No tables/views/streams found. Please check permissions.",
-            )
-            return
+            if self.config.warn_no_datasets:
+                self.structured_reporter.warning(
+                    "No tables/views/streams found. Verify dataset permissions if Snowflake source is not empty.",
+                )
+            else:
+                self.structured_reporter.failure(
+                    GENERIC_PERMISSION_ERROR_KEY,
+                    "No tables/views/streams found. Verify dataset permissions in Snowflake.",
+                )

         self.discovered_datasets = (
             discovered_tables + discovered_views + discovered_streams
@@ -571,7 +576,11 @@ class SnowflakeV2Source(
         queries_extractor = SnowflakeQueriesExtractor(
             connection=self.connection,
             config=SnowflakeQueriesExtractorConfig(
-                window=self.config,
+                window=BaseTimeWindowConfig(
+                    start_time=self.config.start_time,
+                    end_time=self.config.end_time,
+                    bucket_duration=self.config.bucket_duration,
+                ),
                 temporary_tables_pattern=self.config.temporary_tables_pattern,
                 include_lineage=self.config.include_table_lineage,
                 include_usage_statistics=self.config.include_usage_stats,
@@ -732,6 +741,8 @@ class SnowflakeV2Source(
             return None

     def is_standard_edition(self) -> bool:
+        if self.config.known_snowflake_edition is not None:
+            return self.config.known_snowflake_edition == SnowflakeEdition.STANDARD
         try:
             self.connection.query(SnowflakeQuery.show_tags())
             return False
@@ -323,6 +323,7 @@ class Partitionitem:
     "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
+@capability(SourceCapability.LINEAGE_FINE, "Supported for S3 tables")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
     """
@@ -139,7 +139,7 @@ class StoragePathParser:
             path = f"{parsed.netloc}/{parsed.path.lstrip('/')}"

         elif platform == StoragePlatform.AZURE:
-            if scheme in ("abfs", "abfss"):
+            if scheme in ("abfs", "abfss", "wasbs"):
                 # Format: abfss://container@account.dfs.core.windows.net/path
                 container = parsed.netloc.split("@")[0]
                 path = f"{container}/{parsed.path.lstrip('/')}"
@@ -153,7 +153,7 @@ class StoragePathParser:

         elif platform == StoragePlatform.DBFS:
             # For DBFS, use path as-is
-            path = parsed.path.lstrip("/")
+            path = "/" + parsed.path.lstrip("/")

         elif platform == StoragePlatform.LOCAL:
             # For local files, use full path
@@ -169,7 +169,6 @@ class StoragePathParser:
         # Clean up the path
         path = path.rstrip("/")  # Remove trailing slashes
         path = re.sub(r"/+", "/", path)  # Normalize multiple slashes
-        path = f"/{path}"

         return platform, path

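A standalone illustration of the Azure scheme change above: wasbs:// URIs share the container@account netloc layout of abfss://, so the same container/path extraction applies. Toy parsing with urllib only, not the DataHub class:

from urllib.parse import urlparse

for uri in (
    "abfss://raw@myaccount.dfs.core.windows.net/events/2024/01",
    "wasbs://raw@myaccount.blob.core.windows.net/events/2024/01",
):
    parsed = urlparse(uri)
    container = parsed.netloc.split("@")[0]          # container name before the "@"
    path = f"{container}/{parsed.path.lstrip('/')}"  # container-relative path
    print(path)  # -> raw/events/2024/01 for both schemes
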