acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff shows the content of publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between the two versions.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (78)
  1. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
  2. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/api/report.py +183 -35
  26. datahub/ingestion/autogenerated/capability_summary.json +3431 -0
  27. datahub/ingestion/autogenerated/lineage.json +401 -0
  28. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  29. datahub/ingestion/extractor/schema_util.py +13 -4
  30. datahub/ingestion/graph/client.py +2 -2
  31. datahub/ingestion/run/pipeline.py +47 -1
  32. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  33. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  34. datahub/ingestion/source/common/subtypes.py +1 -1
  35. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  37. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  38. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  39. datahub/ingestion/source/ge_data_profiler.py +28 -20
  40. datahub/ingestion/source/identity/okta.py +0 -13
  41. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  42. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  43. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  44. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  45. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  46. datahub/ingestion/source/redshift/usage.py +4 -3
  47. datahub/ingestion/source/s3/source.py +19 -3
  48. datahub/ingestion/source/sigma/sigma.py +6 -1
  49. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  50. datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
  51. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  52. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  53. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  54. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  55. datahub/ingestion/source/sql/sql_common.py +4 -0
  56. datahub/ingestion/source/sql/vertica.py +0 -4
  57. datahub/ingestion/source/sql_queries.py +2 -2
  58. datahub/ingestion/source/superset.py +56 -1
  59. datahub/ingestion/source/tableau/tableau.py +40 -34
  60. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  61. datahub/ingestion/source/unity/proxy.py +4 -3
  62. datahub/ingestion/source/unity/source.py +19 -9
  63. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  64. datahub/metadata/_internal_schema_classes.py +85 -4
  65. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  66. datahub/metadata/schema.avsc +54 -1
  67. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  68. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  69. datahub/sdk/lineage_client.py +2 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
  71. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  72. datahub/upgrade/upgrade.py +46 -13
  73. datahub/utilities/server_config_util.py +8 -0
  74. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  75. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
  76. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
  77. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
  78. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
@@ -52,7 +52,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     CorpUserInfoClass,
     CorpUserKeyClass,
     DashboardInfoClass,
@@ -243,20 +242,14 @@ class Mapper:
 
     @staticmethod
     def new_mcp(
-        entity_type,
         entity_urn,
-        aspect_name,
         aspect,
-        change_type=ChangeTypeClass.UPSERT,
     ):
         """
         Create MCP
         """
         return MetadataChangeProposalWrapper(
-            entityType=entity_type,
-            changeType=change_type,
             entityUrn=entity_urn,
-            aspectName=aspect_name,
             aspect=aspect,
         )
 
@@ -343,17 +336,13 @@ class Mapper:
         )
 
         info_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.DASHBOARD_INFO,
             aspect=dashboard_info_cls,
         )
 
         # removed status mcp
         removed_status_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.STATUS,
             aspect=StatusClass(removed=False),
         )
 
@@ -365,9 +354,7 @@ class Mapper:
 
         # Dashboard key
         dashboard_key_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.DASHBOARD_KEY,
             aspect=dashboard_key_cls,
         )
 
@@ -378,9 +365,7 @@ class Mapper:
         ownership = OwnershipClass(owners=owners)
         # Dashboard owner MCP
         owner_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.OWNERSHIP,
             aspect=ownership,
         )
 
@@ -396,9 +381,7 @@ class Mapper:
             ]
         )
         browse_path_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.BROWSERPATH,
             aspect=browse_path,
         )
 
@@ -429,27 +412,21 @@ class Mapper:
         )
 
         info_mcp = self.new_mcp(
-            entity_type=Constant.CORP_USER,
             entity_urn=user_urn,
-            aspect_name=Constant.CORP_USER_INFO,
             aspect=user_info_instance,
         )
         user_mcps.append(info_mcp)
 
         # removed status mcp
         status_mcp = self.new_mcp(
-            entity_type=Constant.CORP_USER,
             entity_urn=user_urn,
-            aspect_name=Constant.STATUS,
             aspect=StatusClass(removed=False),
         )
         user_mcps.append(status_mcp)
         user_key = CorpUserKeyClass(username=user.username)
 
         user_key_mcp = self.new_mcp(
-            entity_type=Constant.CORP_USER,
             entity_urn=user_urn,
-            aspect_name=Constant.CORP_USER_KEY,
             aspect=user_key,
         )
         user_mcps.append(user_key_mcp)
@@ -182,9 +182,10 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            with self.report.new_stage(
-                USAGE_EXTRACTION_OPERATIONAL_STATS
-            ), PerfTimer() as timer:
+            with (
+                self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS),
+                PerfTimer() as timer,
+            ):
                 # Generate operation aspect workunits
                 yield from self._gen_operation_aspect_workunits(
                     self.connection, all_tables
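Both this hunk and the matching ones in snowflake_usage_v2.py further down switch to the parenthesized multi-item with statement, which is only officially supported from Python 3.10. As a rough sketch of an equivalent for older interpreters (reusing the names from the hunk above; this is not code from the release):

from contextlib import ExitStack

with ExitStack() as stack:
    # Pair the two context managers without the parenthesized with syntax.
    stack.enter_context(self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS))
    timer = stack.enter_context(PerfTimer())
    # ... same body as above ...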
@@ -682,7 +682,7 @@ class S3Source(StatefulIngestionSourceBase):
 
         logger.info(f"Extracting table schema from file: {table_data.full_path}")
         browse_path: str = (
-            strip_s3_prefix(table_data.table_path)
+            self.strip_s3_prefix(table_data.table_path)
             if self.is_s3_platform()
             else table_data.table_path.strip("/")
         )
@@ -949,7 +949,10 @@ class S3Source(StatefulIngestionSourceBase):
         """
 
         def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
-            allowed = path_spec_.allowed(s3_uri)
+            # Normalize URI for pattern matching
+            normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+            allowed = path_spec_.allowed(normalized_uri)
             if not allowed:
                 logger.debug(f"File {s3_uri} not allowed and skipping")
                 self.report.report_file_dropped(s3_uri)
@@ -1394,8 +1397,13 @@ class S3Source(StatefulIngestionSourceBase):
         )
         table_dict: Dict[str, TableData] = {}
         for browse_path in file_browser:
+            # Normalize URI for pattern matching
+            normalized_file_path = self._normalize_uri_for_pattern_matching(
+                browse_path.file
+            )
+
             if not path_spec.allowed(
-                browse_path.file,
+                normalized_file_path,
                 ignore_ext=self.is_s3_platform()
                 and self.source_config.use_s3_content_type,
             ):
@@ -1471,5 +1479,13 @@ class S3Source(StatefulIngestionSourceBase):
     def is_s3_platform(self):
         return self.source_config.platform == "s3"
 
+    def strip_s3_prefix(self, s3_uri: str) -> str:
+        """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+        return strip_s3_prefix(s3_uri)
+
+    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+        """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+        return uri
+
     def get_report(self):
         return self.report
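The two new methods are override hooks: S3Source keeps its existing behavior, while sources that reuse its traversal (the GCS source also touched in this release, for instance) can translate their own URI scheme before path_spec patterns are applied. A hypothetical sketch of such an adapter; the class name and the gs:// handling are illustrative assumptions, not code from this package:

class GCSAdapter(S3Source):
    def strip_s3_prefix(self, s3_uri: str) -> str:
        # e.g. "gs://bucket/path/table" -> "bucket/path/table"
        return s3_uri.removeprefix("gs://")

    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
        # Rewrite gs:// URIs into the s3:// form that the configured path_specs expect.
        return "s3://" + uri.removeprefix("gs://") if uri.startswith("gs://") else uri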
@@ -30,6 +30,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.sigma.config import (
     PlatformDetail,
@@ -95,7 +96,11 @@ logger = logging.getLogger(__name__)
 @platform_name("Sigma")
 @config_class(SigmaSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[SourceCapabilityModifier.SIGMA_WORKSPACE],
+)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default.")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
@@ -1,6 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, List, Optional, Set
 
 import pydantic
@@ -53,6 +54,11 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
 ]
 
 
+class QueryDedupStrategyType(Enum):
+    STANDARD = "STANDARD"
+    NONE = "NONE"
+
+
 class TagOption(StrEnum):
     with_lineage = "with_lineage"
     without_lineage = "without_lineage"
@@ -248,6 +254,11 @@ class SnowflakeV2Config(
         "This is useful if you have a large number of schemas and want to avoid bulk fetching the schema for each table/view.",
     )
 
+    query_dedup_strategy: QueryDedupStrategyType = Field(
+        default=QueryDedupStrategyType.STANDARD,
+        description=f"Experimental: Choose the strategy for query deduplication (default value is appropriate for most use-cases; make sure you understand performance implications before changing it). Allowed values are: {', '.join([s.name for s in QueryDedupStrategyType])}",
+    )
+
     _check_role_grants_removed = pydantic_removed_field("check_role_grants")
     _provision_role_removed = pydantic_removed_field("provision_role")
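The new query_dedup_strategy field is an experimental knob on the Snowflake config; a recipe sets it to one of the enum member names. A small, hedged illustration of how the value resolves (only names from the hunks above are used):

from datahub.ingestion.source.snowflake.snowflake_config import QueryDedupStrategyType

# A recipe value such as "NONE" maps onto the enum member of the same name.
strategy = QueryDedupStrategyType["NONE"]
assert strategy is QueryDedupStrategyType.NONE
assert [s.name for s in QueryDedupStrategyType] == ["STANDARD", "NONE"]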
 
@@ -28,6 +28,7 @@ from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     DEFAULT_TEMP_TABLES_PATTERNS,
+    QueryDedupStrategyType,
     SnowflakeFilterConfig,
     SnowflakeIdentifierConfig,
 )
@@ -44,6 +45,11 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
 )
+from datahub.ingestion.source.snowflake.stored_proc_lineage import (
+    StoredProcCall,
+    StoredProcLineageReport,
+    StoredProcLineageTracker,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -113,6 +119,8 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
     include_query_usage_statistics: bool = True
     include_operations: bool = True
 
+    query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
+
 
 class SnowflakeQueriesSourceConfig(
     SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig
@@ -130,6 +138,7 @@ class SnowflakeQueriesExtractorReport(Report):
     aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     sql_aggregator: Optional[SqlAggregatorReport] = None
+    stored_proc_lineage: Optional[StoredProcLineageReport] = None
 
     num_ddl_queries_dropped: int = 0
     num_stream_queries_observed: int = 0
@@ -261,6 +270,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 TableRename,
                 TableSwap,
                 ObservedQuery,
+                StoredProcCall,
             ]
         ] = self._exit_stack.enter_context(FileBackedList(shared_connection))
 
@@ -277,12 +287,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             for entry in self.fetch_query_log(users):
                 queries.append(entry)
 
+        stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
+            StoredProcLineageTracker(
+                platform=self.identifiers.platform,
+                shared_connection=shared_connection,
+            )
+        )
+        self.report.stored_proc_lineage = stored_proc_tracker.report
+
         with self.report.audit_log_load_timer:
             for i, query in enumerate(queries):
                 if i % 1000 == 0:
                     logger.info(f"Added {i} query log entries to SQL aggregator")
 
-                self.aggregator.add(query)
+                if isinstance(query, StoredProcCall):
+                    stored_proc_tracker.add_stored_proc_call(query)
+                    continue
+
+                if not (
+                    isinstance(query, PreparsedQuery)
+                    and stored_proc_tracker.add_related_query(query)
+                ):
+                    # Only add to aggregator if it's not part of a stored procedure.
+                    self.aggregator.add(query)
+
+            # Generate and add stored procedure lineage entries.
+            for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
+                # TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
+                self.aggregator.add(lineage_entry)
 
         with self.report.aggregator_generate_timer:
             yield from auto_workunit(self.aggregator.gen_metadata())
@@ -342,13 +374,16 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
-        query_log_query = _build_enriched_query_log_query(
+    ) -> Iterable[
+        Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
+    ]:
+        query_log_query = QueryLogQueryBuilder(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
-        )
+            dedup_strategy=self.config.query_dedup_strategy,
+        ).build_enriched_query_log_query()
 
         with self.structured_reporter.report_exc(
             "Error fetching query log from Snowflake"
@@ -382,7 +417,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
+    ) -> Optional[
+        Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
+    ]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -482,6 +519,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 extra_info=extra_info,
             )
 
+        if snowflake_query_type == "CALL" and res["root_query_id"] is None:
+            return StoredProcCall(
+                # This is the top-level query ID that other entries will reference.
+                snowflake_root_query_id=res["query_id"],
+                query_text=query_text,
+                timestamp=timestamp,
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
+            )
+
         upstreams = []
         column_usage = {}
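The CALL branch above is what ties stored procedure lineage together: only the top-level CALL (the row whose root_query_id is empty) becomes a StoredProcCall, and the statements the procedure runs internally reference that top-level query ID. A hedged illustration with made-up IDs, statements, and table names (none of these values come from this diff):

# Two simplified audit-log rows for one stored procedure run (illustrative values only):
call_row = {"query_id": "Q1", "root_query_id": None, "query_text": "CALL analytics.refresh_orders()"}
child_row = {"query_id": "Q2", "root_query_id": "Q1", "query_text": "INSERT INTO orders_summary SELECT * FROM orders"}
# The first row becomes StoredProcCall(snowflake_root_query_id="Q1", ...); the second is parsed as a
# regular PreparsedQuery and, because it points back at "Q1", the StoredProcLineageTracker can claim it
# via add_related_query() and later emit one merged lineage entry (orders -> orders_summary)
# attributed to the procedure call rather than to the internal INSERT.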
 
@@ -666,63 +714,101 @@ class SnowflakeQueriesSource(Source):
         super().close()
 
 
-# Make sure we don't try to generate too much info for a single query.
-_MAX_TABLES_PER_QUERY = 20
-
-
-def _build_enriched_query_log_query(
-    start_time: datetime,
-    end_time: datetime,
-    bucket_duration: BucketDuration,
-    deny_usernames: Optional[List[str]],
-) -> str:
-    start_time_millis = int(start_time.timestamp() * 1000)
-    end_time_millis = int(end_time.timestamp() * 1000)
-
-    users_filter = "TRUE"
-    if deny_usernames:
-        user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-        users_filter = f"user_name NOT IN ({user_not_in})"
-
-    time_bucket_size = bucket_duration.value
-    assert time_bucket_size in ("HOUR", "DAY", "MONTH")
+class QueryLogQueryBuilder:
+    def __init__(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+        bucket_duration: BucketDuration,
+        deny_usernames: Optional[List[str]],
+        max_tables_per_query: int = 20,
+        dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
+    ):
+        self.start_time = start_time
+        self.end_time = end_time
+        self.start_time_millis = int(start_time.timestamp() * 1000)
+        self.end_time_millis = int(end_time.timestamp() * 1000)
+        self.max_tables_per_query = max_tables_per_query
+        self.dedup_strategy = dedup_strategy
+
+        self.users_filter = "TRUE"
+        if deny_usernames:
+            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
+            self.users_filter = f"user_name NOT IN ({user_not_in})"
+
+        self.time_bucket_size = bucket_duration.value
+        assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
+
+    def _query_fingerprinted_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            secondary_fingerprint_sql = """
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            secondary_fingerprint_sql = "NULL"
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
+        return f"""
+    SELECT *,
+        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        {secondary_fingerprint_sql} as query_secondary_fingerprint
+    FROM
+        snowflake.account_usage.query_history
+    WHERE
+        query_history.start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_history.start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND execution_status = 'SUCCESS'
+        AND {self.users_filter}"""
+
+    def _query_deduplicated_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
+    FROM
+        fingerprinted_queries
+    QUALIFY
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        1 AS query_count,
+    FROM
+        fingerprinted_queries"""
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
 
-    return f"""\
+    def build_enriched_query_log_query(self) -> str:
+        return f"""\
 WITH
 fingerprinted_queries as (
-    SELECT *,
-        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint,
-        -- Optional and additional hash to be used for query deduplication and final query identity
-        CASE
-            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
-            -- Extract project id and hash it
-            THEN CAST(HASH(
-                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
-                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
-            ) AS VARCHAR)
-            ELSE NULL
-        END as query_secondary_fingerprint
-    FROM
-        snowflake.account_usage.query_history
-    WHERE
-        query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
-        AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
-        AND execution_status = 'SUCCESS'
-        AND {users_filter}
+    {self._query_fingerprinted_queries()}
 )
 , deduplicated_queries as (
-    SELECT
-        *,
-        DATE_TRUNC(
-            {time_bucket_size},
-            CONVERT_TIMEZONE('UTC', start_time)
-        ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
-    FROM
-        fingerprinted_queries
-    QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
+    {self._query_deduplicated_queries()}
 )
 , raw_access_history AS (
     SELECT
@@ -736,9 +822,9 @@ fingerprinted_queries as (
     FROM
         snowflake.account_usage.access_history
     WHERE
-        query_start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
-        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
-        AND {users_filter}
+        query_start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND {self.users_filter}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
@@ -751,7 +837,7 @@ fingerprinted_queries as (
         query_start_time,
         ARRAY_SLICE(
             FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
-            0, {_MAX_TABLES_PER_QUERY}
+            0, {self.max_tables_per_query}
         ) as direct_objects_accessed,
         -- TODO: Drop the columns.baseSources subfield.
         FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified,
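The refactor replaces the module-level _build_enriched_query_log_query helper with a QueryLogQueryBuilder class that takes the same window arguments plus the new dedup_strategy; fetch_query_log above shows the in-tree call. A minimal standalone sketch of the same usage (the import paths and window values here are assumptions, not taken from this diff):

from datetime import datetime, timezone

from datahub.configuration.time_window_config import BucketDuration
from datahub.ingestion.source.snowflake.snowflake_config import QueryDedupStrategyType
from datahub.ingestion.source.snowflake.snowflake_queries import QueryLogQueryBuilder

# Build the enriched query-log SQL for a one-day window with deduplication disabled.
sql = QueryLogQueryBuilder(
    start_time=datetime(2025, 1, 1, tzinfo=timezone.utc),
    end_time=datetime(2025, 1, 2, tzinfo=timezone.utc),
    bucket_duration=BucketDuration.DAY,
    deny_usernames=None,
    dedup_strategy=QueryDedupStrategyType.NONE,
).build_enriched_query_log_query()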
@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
 
         with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
             for row in results:
-                with fetch_timer.pause(), self.report.usage_aggregation.result_skip_timer as skip_timer:
+                with (
+                    fetch_timer.pause(),
+                    self.report.usage_aggregation.result_skip_timer as skip_timer,
+                ):
                     if results.rownumber is not None and results.rownumber % 1000 == 0:
                         logger.debug(f"Processing usage row number {results.rownumber}")
                         logger.debug(self.report.usage_aggregation.as_string())
@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
                            f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
                        )
                        continue
-                    with skip_timer.pause(), self.report.usage_aggregation.result_map_timer as map_timer:
+                    with (
+                        skip_timer.pause(),
+                        self.report.usage_aggregation.result_map_timer as map_timer,
+                    ):
                         wu = self.build_usage_statistics_for_dataset(
                             dataset_identifier, row
                         )
@@ -32,6 +32,7 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
     SnowflakeEdition,
@@ -97,7 +98,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -577,6 +585,7 @@ class SnowflakeV2Source(
 
         queries_extractor = SnowflakeQueriesExtractor(
             connection=self.connection,
+            # TODO: this should be its own section in main recipe
            config=SnowflakeQueriesExtractorConfig(
                 window=BaseTimeWindowConfig(
                     start_time=self.config.start_time,
@@ -591,6 +600,7 @@ class SnowflakeV2Source(
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
                 pushdown_deny_usernames=self.config.pushdown_deny_usernames,
+                query_dedup_strategy=self.config.query_dedup_strategy,
             ),
             structured_report=self.report,
             filters=self.filters,