acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl
This diff represents the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release.
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +0 -7
- datahub/cli/cli_utils.py +73 -0
- datahub/cli/delete_cli.py +0 -6
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +148 -228
- datahub/cli/exists_cli.py +0 -4
- datahub/cli/get_cli.py +0 -4
- datahub/cli/ingest_cli.py +1 -20
- datahub/cli/put_cli.py +0 -6
- datahub/cli/quickstart_versioning.py +50 -5
- datahub/cli/specific/assertions_cli.py +0 -6
- datahub/cli/specific/datacontract_cli.py +0 -6
- datahub/cli/specific/dataproduct_cli.py +0 -22
- datahub/cli/specific/dataset_cli.py +0 -11
- datahub/cli/specific/forms_cli.py +0 -6
- datahub/cli/specific/group_cli.py +0 -4
- datahub/cli/specific/structuredproperties_cli.py +0 -7
- datahub/cli/specific/user_cli.py +0 -4
- datahub/cli/state_cli.py +0 -4
- datahub/cli/timeline_cli.py +0 -4
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/report.py +183 -35
- datahub/ingestion/autogenerated/capability_summary.json +3431 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +30 -128
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +2 -2
- datahub/ingestion/run/pipeline.py +47 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/data_lake_common/object_store.py +40 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dremio/dremio_source.py +7 -7
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/identity/okta.py +0 -13
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/source.py +19 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/sql_common.py +4 -0
- datahub/ingestion/source/sql/vertica.py +0 -4
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/superset.py +56 -1
- datahub/ingestion/source/tableau/tableau.py +40 -34
- datahub/ingestion/source/tableau/tableau_constant.py +0 -2
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +19 -9
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +85 -4
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/schema.avsc +54 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
- datahub/sdk/lineage_client.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi_report_server/report_server.py

@@ -52,7 +52,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     CorpUserInfoClass,
     CorpUserKeyClass,
     DashboardInfoClass,
@@ -243,20 +242,14 @@ class Mapper:

     @staticmethod
     def new_mcp(
-        entity_type,
         entity_urn,
-        aspect_name,
         aspect,
-        change_type=ChangeTypeClass.UPSERT,
     ):
         """
         Create MCP
         """
         return MetadataChangeProposalWrapper(
-            entityType=entity_type,
-            changeType=change_type,
             entityUrn=entity_urn,
-            aspectName=aspect_name,
             aspect=aspect,
         )
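This simplification works because `MetadataChangeProposalWrapper` can infer the entity type from the URN and the aspect name from the aspect object, with `changeType` defaulting to UPSERT. A minimal sketch of the equivalent call (the URN value is illustrative):

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass

# entityType, aspectName, and changeType (UPSERT) are inferred automatically.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dashboard:(powerbi,example.dashboard)",  # illustrative URN
    aspect=StatusClass(removed=False),
)
```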
@@ -343,17 +336,13 @@ class Mapper:
         )

         info_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.DASHBOARD_INFO,
             aspect=dashboard_info_cls,
         )

         # removed status mcp
         removed_status_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.STATUS,
             aspect=StatusClass(removed=False),
         )
@@ -365,9 +354,7 @@ class Mapper:

         # Dashboard key
         dashboard_key_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.DASHBOARD_KEY,
             aspect=dashboard_key_cls,
         )
@@ -378,9 +365,7 @@ class Mapper:
         ownership = OwnershipClass(owners=owners)
         # Dashboard owner MCP
         owner_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.OWNERSHIP,
             aspect=ownership,
         )
@@ -396,9 +381,7 @@ class Mapper:
             ]
         )
         browse_path_mcp = self.new_mcp(
-            entity_type=Constant.DASHBOARD,
             entity_urn=dashboard_urn,
-            aspect_name=Constant.BROWSERPATH,
             aspect=browse_path,
         )
@@ -429,27 +412,21 @@ class Mapper:
         )

         info_mcp = self.new_mcp(
-            entity_type=Constant.CORP_USER,
             entity_urn=user_urn,
-            aspect_name=Constant.CORP_USER_INFO,
             aspect=user_info_instance,
         )
         user_mcps.append(info_mcp)

         # removed status mcp
         status_mcp = self.new_mcp(
-            entity_type=Constant.CORP_USER,
             entity_urn=user_urn,
-            aspect_name=Constant.STATUS,
             aspect=StatusClass(removed=False),
         )
         user_mcps.append(status_mcp)
         user_key = CorpUserKeyClass(username=user.username)

         user_key_mcp = self.new_mcp(
-            entity_type=Constant.CORP_USER,
             entity_urn=user_urn,
-            aspect_name=Constant.CORP_USER_KEY,
             aspect=user_key,
         )
         user_mcps.append(user_key_mcp)
datahub/ingestion/source/redshift/usage.py

@@ -182,9 +182,10 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0

         if self.config.include_operational_stats:
-            with self.report.new_stage(
-                USAGE_EXTRACTION_OPERATIONAL_STATS
-            ):
+            with (
+                self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS),
+                PerfTimer() as timer,
+            ):
                 # Generate operation aspect workunits
                 yield from self._gen_operation_aspect_workunits(
                     self.connection, all_tables
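The new form relies on parenthesized context managers (Python 3.10+), which let multiple `with` targets span lines without backslashes. A self-contained sketch of the construct:

```python
from contextlib import nullcontext

# Parenthesized multi-context `with` (Python 3.10+): managers are entered
# left-to-right and exited in reverse order.
with (
    nullcontext("stage") as stage,
    nullcontext("timer") as timer,
):
    print(stage, timer)
```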
datahub/ingestion/source/s3/source.py

@@ -682,7 +682,7 @@ class S3Source(StatefulIngestionSourceBase):

         logger.info(f"Extracting table schema from file: {table_data.full_path}")
         browse_path: str = (
-            strip_s3_prefix(table_data.table_path)
+            self.strip_s3_prefix(table_data.table_path)
             if self.is_s3_platform()
             else table_data.table_path.strip("/")
         )
@@ -949,7 +949,10 @@ class S3Source(StatefulIngestionSourceBase):
         """

         def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
-            allowed = path_spec_.allowed(s3_uri)
+            # Normalize URI for pattern matching
+            normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+            allowed = path_spec_.allowed(normalized_uri)
             if not allowed:
                 logger.debug(f"File {s3_uri} not allowed and skipping")
                 self.report.report_file_dropped(s3_uri)
@@ -1394,8 +1397,13 @@ class S3Source(StatefulIngestionSourceBase):
         )
         table_dict: Dict[str, TableData] = {}
         for browse_path in file_browser:
+            # Normalize URI for pattern matching
+            normalized_file_path = self._normalize_uri_for_pattern_matching(
+                browse_path.file
+            )
+
             if not path_spec.allowed(
-                browse_path.file,
+                normalized_file_path,
                 ignore_ext=self.is_s3_platform()
                 and self.source_config.use_s3_content_type,
             ):
@@ -1471,5 +1479,13 @@ class S3Source(StatefulIngestionSourceBase):
     def is_s3_platform(self):
         return self.source_config.platform == "s3"

+    def strip_s3_prefix(self, s3_uri: str) -> str:
+        """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+        return strip_s3_prefix(s3_uri)
+
+    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+        """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+        return uri
+
     def get_report(self):
         return self.report
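These hooks exist so adapters built on `S3Source` (for example the GCS source, also touched in this release) can remap their native URIs into the form that `PathSpec` patterns expect. A hypothetical override, purely illustrative of the pattern:

```python
# Hypothetical adapter sketch: remap gs:// URIs to s3:// before PathSpec
# matching. The real GCS source's implementation may differ.
class GCSAdapterSketch:
    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
        if uri.startswith("gs://"):
            return "s3://" + uri[len("gs://"):]
        return uri

print(GCSAdapterSketch()._normalize_uri_for_pattern_matching("gs://bucket/a.csv"))
# s3://bucket/a.csv
```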
datahub/ingestion/source/sigma/sigma.py

@@ -30,6 +30,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.sigma.config import (
     PlatformDetail,
@@ -95,7 +96,11 @@ logger = logging.getLogger(__name__)
 @platform_name("Sigma")
 @config_class(SigmaSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[SourceCapabilityModifier.SIGMA_WORKSPACE],
+)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default.")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -1,6 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, List, Optional, Set

 import pydantic
@@ -53,6 +54,11 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
 ]


+class QueryDedupStrategyType(Enum):
+    STANDARD = "STANDARD"
+    NONE = "NONE"
+
+
 class TagOption(StrEnum):
     with_lineage = "with_lineage"
     without_lineage = "without_lineage"
@@ -248,6 +254,11 @@ class SnowflakeV2Config(
         "This is useful if you have a large number of schemas and want to avoid bulk fetching the schema for each table/view.",
     )

+    query_dedup_strategy: QueryDedupStrategyType = Field(
+        default=QueryDedupStrategyType.STANDARD,
+        description=f"Experimental: Choose the strategy for query deduplication (default value is appropriate for most use-cases; make sure you understand performance implications before changing it). Allowed values are: {', '.join([s.name for s in QueryDedupStrategyType])}",
+    )
+
     _check_role_grants_removed = pydantic_removed_field("check_role_grants")
     _provision_role_removed = pydantic_removed_field("provision_role")
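Because the field is typed as an `Enum`, pydantic accepts either the enum member or its string value in a recipe and rejects anything else. A self-contained sketch of that validation behavior, using a stand-in model rather than the real `SnowflakeV2Config`:

```python
from enum import Enum
from pydantic import BaseModel

class QueryDedupStrategyType(Enum):
    STANDARD = "STANDARD"
    NONE = "NONE"

class DemoConfig(BaseModel):  # stand-in for the real config class
    query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD

print(DemoConfig(query_dedup_strategy="NONE").query_dedup_strategy)
# QueryDedupStrategyType.NONE
print(DemoConfig().query_dedup_strategy)
# QueryDedupStrategyType.STANDARD
```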
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -28,6 +28,7 @@ from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     DEFAULT_TEMP_TABLES_PATTERNS,
+    QueryDedupStrategyType,
     SnowflakeFilterConfig,
     SnowflakeIdentifierConfig,
 )
@@ -44,6 +45,11 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
 )
+from datahub.ingestion.source.snowflake.stored_proc_lineage import (
+    StoredProcCall,
+    StoredProcLineageReport,
+    StoredProcLineageTracker,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -113,6 +119,8 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
     include_query_usage_statistics: bool = True
     include_operations: bool = True

+    query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
+

 class SnowflakeQueriesSourceConfig(
     SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig
@@ -130,6 +138,7 @@ class SnowflakeQueriesExtractorReport(Report):
     aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

     sql_aggregator: Optional[SqlAggregatorReport] = None
+    stored_proc_lineage: Optional[StoredProcLineageReport] = None

     num_ddl_queries_dropped: int = 0
     num_stream_queries_observed: int = 0
@@ -261,6 +270,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 TableRename,
                 TableSwap,
                 ObservedQuery,
+                StoredProcCall,
             ]
         ] = self._exit_stack.enter_context(FileBackedList(shared_connection))
@@ -277,12 +287,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             for entry in self.fetch_query_log(users):
                 queries.append(entry)

+        stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
+            StoredProcLineageTracker(
+                platform=self.identifiers.platform,
+                shared_connection=shared_connection,
+            )
+        )
+        self.report.stored_proc_lineage = stored_proc_tracker.report
+
         with self.report.audit_log_load_timer:
             for i, query in enumerate(queries):
                 if i % 1000 == 0:
                     logger.info(f"Added {i} query log entries to SQL aggregator")

-                self.aggregator.add(query)
+                if isinstance(query, StoredProcCall):
+                    stored_proc_tracker.add_stored_proc_call(query)
+                    continue
+
+                if not (
+                    isinstance(query, PreparsedQuery)
+                    and stored_proc_tracker.add_related_query(query)
+                ):
+                    # Only add to aggregator if it's not part of a stored procedure.
+                    self.aggregator.add(query)
+
+            # Generate and add stored procedure lineage entries.
+            for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
+                # TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
+                self.aggregator.add(lineage_entry)

         with self.report.aggregator_generate_timer:
             yield from auto_workunit(self.aggregator.gen_metadata())
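The tracker's job is to group child queries under the stored-procedure `CALL` that spawned them (linked via Snowflake's root query id) and emit one merged lineage entry per call, instead of attributing each child query separately. A toy sketch of that grouping idea, not the actual `StoredProcLineageTracker` internals:

```python
from collections import defaultdict

# Toy model: each child query carries the root_query_id of the CALL it ran under.
calls = {"q1": "CALL my_proc()"}  # root query id -> CALL text (invented sample)
children = [("q1", "INSERT INTO t2 SELECT * FROM t1"), ("q1", "DELETE FROM tmp")]

grouped: dict = defaultdict(list)
for root_id, sql in children:
    if root_id in calls:
        grouped[root_id].append(sql)  # fold child queries into their CALL

for root_id, sqls in grouped.items():
    print(f"{calls[root_id]} -> merged lineage from {len(sqls)} child queries")
```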
@@ -342,13 +374,16 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
-        query_log_query = _build_enriched_query_log_query(
+    ) -> Iterable[
+        Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
+    ]:
+        query_log_query = QueryLogQueryBuilder(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
-        )
+            dedup_strategy=self.config.query_dedup_strategy,
+        ).build_enriched_query_log_query()

         with self.structured_reporter.report_exc(
             "Error fetching query log from Snowflake"
@@ -382,7 +417,9 @@

     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
+    ) -> Optional[
+        Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
+    ]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -482,6 +519,17 @@
             extra_info=extra_info,
         )

+        if snowflake_query_type == "CALL" and res["root_query_id"] is None:
+            return StoredProcCall(
+                # This is the top-level query ID that other entries will reference.
+                snowflake_root_query_id=res["query_id"],
+                query_text=query_text,
+                timestamp=timestamp,
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
+            )
+
         upstreams = []
         column_usage = {}
@@ -666,63 +714,101 @@ class SnowflakeQueriesSource(Source):
         super().close()


-def _build_enriched_query_log_query(
-    start_time: datetime,
-    end_time: datetime,
-    bucket_duration: BucketDuration,
-    deny_usernames: Optional[List[str]],
-    max_tables_per_query: int = 20,
-) -> str:
-    start_time_millis = int(start_time.timestamp() * 1000)
-    end_time_millis = int(end_time.timestamp() * 1000)
-
-    users_filter = "TRUE"
-    if deny_usernames:
-        user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-        users_filter = f"user_name NOT IN ({user_not_in})"
-
-    time_bucket_size = bucket_duration.value
-    assert time_bucket_size in ("HOUR", "DAY", "MONTH")
+class QueryLogQueryBuilder:
+    def __init__(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+        bucket_duration: BucketDuration,
+        deny_usernames: Optional[List[str]],
+        max_tables_per_query: int = 20,
+        dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
+    ):
+        self.start_time = start_time
+        self.end_time = end_time
+        self.start_time_millis = int(start_time.timestamp() * 1000)
+        self.end_time_millis = int(end_time.timestamp() * 1000)
+        self.max_tables_per_query = max_tables_per_query
+        self.dedup_strategy = dedup_strategy
+
+        self.users_filter = "TRUE"
+        if deny_usernames:
+            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
+            self.users_filter = f"user_name NOT IN ({user_not_in})"
+
+        self.time_bucket_size = bucket_duration.value
+        assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
+
+    def _query_fingerprinted_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            secondary_fingerprint_sql = """
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            secondary_fingerprint_sql = "NULL"
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
+        return f"""
+    SELECT *,
+        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        {secondary_fingerprint_sql} as query_secondary_fingerprint
+    FROM
+        snowflake.account_usage.query_history
+    WHERE
+        query_history.start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_history.start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND execution_status = 'SUCCESS'
+        AND {self.users_filter}"""
+
+    def _query_deduplicated_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
+    FROM
+        fingerprinted_queries
+    QUALIFY
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        1 AS query_count,
+    FROM
+        fingerprinted_queries"""
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )

-    return f"""\
+    def build_enriched_query_log_query(self) -> str:
+        return f"""\
 WITH
 fingerprinted_queries as (
-    SELECT *,
-        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint,
-        -- Optional and additional hash to be used for query deduplication and final query identity
-        CASE
-            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
-            -- Extract project id and hash it
-            THEN CAST(HASH(
-                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
-                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
-            ) AS VARCHAR)
-            ELSE NULL
-        END as query_secondary_fingerprint
-    FROM
-        snowflake.account_usage.query_history
-    WHERE
-        query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
-        AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
-        AND execution_status = 'SUCCESS'
-        AND {users_filter}
+{self._query_fingerprinted_queries()}
 )
 , deduplicated_queries as (
-    SELECT
-        *,
-        DATE_TRUNC(
-            {time_bucket_size},
-            CONVERT_TIMEZONE('UTC', start_time)
-        ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
-    FROM
-        fingerprinted_queries
-    QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
+{self._query_deduplicated_queries()}
 )
 , raw_access_history AS (
     SELECT
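The STANDARD strategy's secondary fingerprint targets queries issued by Hex, which embed a `-- Hex query metadata:` JSON comment; the `CASE` expression extracts the `project_id` and `context` fields and hashes them so repeated runs of the same Hex cell deduplicate together. A Python sketch of the same extraction the SQL performs with `REGEXP_SUBSTR` (the sample query text is invented):

```python
import re

# Invented sample: a query carrying Hex metadata in a trailing comment.
query_text = 'SELECT * FROM t -- Hex query metadata: {"project_id": "abc-123", "context": "SCHEDULED_RUN"}'

# Mirrors REGEXP_SUBSTR(..., '"project_id"\\s*:\\s*"([^"]+)"', 1, 1, 'e', 1)
project_id = re.search(r'"project_id"\s*:\s*"([^"]+)"', query_text)
context = re.search(r'"context"\s*:\s*"([^"]+)"', query_text)
print(project_id.group(1), context.group(1))
# abc-123 SCHEDULED_RUN
```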
@@ -736,9 +822,9 @@ fingerprinted_queries as (
     FROM
         snowflake.account_usage.access_history
     WHERE
-        query_start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
-        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
-        AND {users_filter}
+        query_start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND {self.users_filter}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
@@ -751,7 +837,7 @@ fingerprinted_queries as (
         query_start_time,
         ARRAY_SLICE(
             FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
-            0, {max_tables_per_query}
+            0, {self.max_tables_per_query}
         ) as direct_objects_accessed,
         -- TODO: Drop the columns.baseSources subfield.
         FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified,
datahub/ingestion/source/snowflake/snowflake_usage_v2.py

@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):

         with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
             for row in results:
-                with fetch_timer.pause(), self.report.usage_aggregation.result_skip_timer as skip_timer:
+                with (
+                    fetch_timer.pause(),
+                    self.report.usage_aggregation.result_skip_timer as skip_timer,
+                ):
                     if results.rownumber is not None and results.rownumber % 1000 == 0:
                         logger.debug(f"Processing usage row number {results.rownumber}")
                     logger.debug(self.report.usage_aggregation.as_string())
@@ -255,7 +258,10 @@
                         f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
                     )
                     continue
-                with skip_timer.pause(), self.report.usage_aggregation.result_map_timer as map_timer:
+                with (
+                    skip_timer.pause(),
+                    self.report.usage_aggregation.result_map_timer as map_timer,
+                ):
                     wu = self.build_usage_statistics_for_dataset(
                         dataset_identifier, row
                     )
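The pause/resume pattern keeps nested timers from double-counting: while the inner timer runs, the outer one is paused. A self-contained sketch of the idea using plain `time` (a toy, not DataHub's actual `PerfTimer` implementation):

```python
import time
from contextlib import contextmanager

class ToyTimer:
    """Toy accumulating timer with a pause() context manager."""
    def __init__(self):
        self.elapsed = 0.0
        self._start = None

    def __enter__(self):
        self._start = time.monotonic()
        return self

    def __exit__(self, *exc):
        self.elapsed += time.monotonic() - self._start

    @contextmanager
    def pause(self):
        # Bank the time accrued so far, then resume the clock afterwards.
        self.elapsed += time.monotonic() - self._start
        try:
            yield
        finally:
            self._start = time.monotonic()

outer = ToyTimer()
with outer:
    with outer.pause():  # time spent here is excluded from `outer`
        time.sleep(0.01)
print(f"outer elapsed (~0s): {outer.elapsed:.3f}")
```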
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -32,6 +32,7 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
     SnowflakeEdition,
@@ -97,7 +98,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -577,6 +585,7 @@ class SnowflakeV2Source(

         queries_extractor = SnowflakeQueriesExtractor(
             connection=self.connection,
+            # TODO: this should be its own section in main recipe
             config=SnowflakeQueriesExtractorConfig(
                 window=BaseTimeWindowConfig(
                     start_time=self.config.start_time,
@@ -591,6 +600,7 @@ class SnowflakeV2Source(
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
                 pushdown_deny_usernames=self.config.pushdown_deny_usernames,
+                query_dedup_strategy=self.config.query_dedup_strategy,
             ),
             structured_report=self.report,
             filters=self.filters,