acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +12 -16
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +71 -13
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1446 -559
- datahub/metadata/_urns/urn_defs.py +1721 -1553
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +18055 -17802
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -28,6 +28,7 @@ from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     DEFAULT_TEMP_TABLES_PATTERNS,
+    QueryDedupStrategyType,
     SnowflakeFilterConfig,
     SnowflakeIdentifierConfig,
 )
@@ -44,6 +45,11 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
 )
+from datahub.ingestion.source.snowflake.stored_proc_lineage import (
+    StoredProcCall,
+    StoredProcLineageReport,
+    StoredProcLineageTracker,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -63,7 +69,10 @@ from datahub.sql_parsing.sqlglot_lineage import (
     DownstreamColumnRef,
 )
 from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
-from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
+from datahub.utilities.file_backed_collections import (
+    ConnectionWrapper,
+    FileBackedList,
+)
 from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
@@ -110,6 +119,22 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
     include_query_usage_statistics: bool = True
     include_operations: bool = True
 
+    push_down_database_pattern_access_history: bool = pydantic.Field(
+        default=False,
+        description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
+        "This filters on the accessed objects in access_history.",
+    )
+
+    additional_database_names_allowlist: List[str] = pydantic.Field(
+        default=[],
+        description="Additional database names (no pattern matching) to be included in the access_history filter. "
+        "Only applies if push_down_database_pattern_access_history=True. "
+        "These databases will be included in the filter being pushed down regardless of database_pattern settings."
+        "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
+    )
+
+    query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
+
 
 class SnowflakeQueriesSourceConfig(
     SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig
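
The three new SnowflakeQueriesExtractorConfig fields above are ordinary pydantic fields, so the push-down behavior can be exercised directly. A minimal sketch, with field names taken from this diff; TEMP_DB is a made-up database name and every other field is assumed to keep its default:

from datahub.ingestion.source.snowflake.snowflake_config import QueryDedupStrategyType
from datahub.ingestion.source.snowflake.snowflake_queries import (
    SnowflakeQueriesExtractorConfig,
)

config = SnowflakeQueriesExtractorConfig(
    # Push the database filter down into the access_history scan.
    push_down_database_pattern_access_history=True,
    # Always keep the database used for temp tables in the pushed-down filter,
    # even if it is not covered by database_pattern.
    additional_database_names_allowlist=["TEMP_DB"],
    # Keep every query instead of deduplicating per time bucket.
    query_dedup_strategy=QueryDedupStrategyType.NONE,
)
assert config.push_down_database_pattern_access_history
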
@@ -124,7 +149,10 @@ class SnowflakeQueriesExtractorReport(Report):
     users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+
     sql_aggregator: Optional[SqlAggregatorReport] = None
+    stored_proc_lineage: Optional[StoredProcLineageReport] = None
 
     num_ddl_queries_dropped: int = 0
     num_stream_queries_observed: int = 0
@@ -243,6 +271,12 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         audit_log_file = self.local_temp_path / "audit_log.sqlite"
         use_cached_audit_log = audit_log_file.exists()
 
+        if self.config.local_temp_path is None:
+            self._exit_stack.callback(lambda: audit_log_file.unlink(missing_ok=True))
+
+        shared_connection = self._exit_stack.enter_context(
+            ConnectionWrapper(audit_log_file)
+        )
         queries: FileBackedList[
             Union[
                 KnownLineageMapping,
@@ -250,44 +284,54 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 TableRename,
                 TableSwap,
                 ObservedQuery,
+                StoredProcCall,
             ]
-        ]
+        ] = self._exit_stack.enter_context(FileBackedList(shared_connection))
+
         if use_cached_audit_log:
-            logger.info("Using cached audit log")
-            shared_connection = ConnectionWrapper(audit_log_file)
-            queries = FileBackedList(shared_connection)
+            logger.info(f"Using cached audit log at {audit_log_file}")
         else:
-            audit_log_file.unlink(missing_ok=True)
-
-            shared_connection = ConnectionWrapper(audit_log_file)
-            queries = FileBackedList(shared_connection)
-            entry: Union[
-                KnownLineageMapping,
-                PreparsedQuery,
-                TableRename,
-                TableSwap,
-                ObservedQuery,
-            ]
+            logger.info(f"Fetching audit log into {audit_log_file}")
 
             with self.report.copy_history_fetch_timer:
-                for entry in self.fetch_copy_history():
-                    queries.append(entry)
+                for copy_entry in self.fetch_copy_history():
+                    queries.append(copy_entry)
 
             with self.report.query_log_fetch_timer:
                 for entry in self.fetch_query_log(users):
                     queries.append(entry)
 
+        stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
+            StoredProcLineageTracker(
+                platform=self.identifiers.platform,
+                shared_connection=shared_connection,
+            )
+        )
+        self.report.stored_proc_lineage = stored_proc_tracker.report
+
         with self.report.audit_log_load_timer:
             for i, query in enumerate(queries):
                 if i % 1000 == 0:
                     logger.info(f"Added {i} query log entries to SQL aggregator")
-                self.aggregator.add(query)
 
-
-
-
-
-
+                if isinstance(query, StoredProcCall):
+                    stored_proc_tracker.add_stored_proc_call(query)
+                    continue
+
+                if not (
+                    isinstance(query, PreparsedQuery)
+                    and stored_proc_tracker.add_related_query(query)
+                ):
+                    # Only add to aggregator if it's not part of a stored procedure.
+                    self.aggregator.add(query)
+
+        # Generate and add stored procedure lineage entries.
+        for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
+            # TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
+            self.aggregator.add(lineage_entry)
+
+        with self.report.aggregator_generate_timer:
+            yield from auto_workunit(self.aggregator.gen_metadata())
 
     def fetch_users(self) -> UsersMapping:
         users: UsersMapping = dict()
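
The rework above routes everything through the extractor's contextlib.ExitStack: the shared SQLite connection, the FileBackedList, the StoredProcLineageTracker, and (when no local_temp_path is configured) deletion of the scratch file are all registered on the stack and released together when the extractor closes. A self-contained sketch of that pattern using only the standard library, with file and table names invented here:

import contextlib
import pathlib
import sqlite3

stack = contextlib.ExitStack()
db_path = pathlib.Path("audit_log.sqlite")

# Mirrors _exit_stack.callback(lambda: audit_log_file.unlink(missing_ok=True)):
# the scratch file is removed when the stack unwinds.
stack.callback(lambda: db_path.unlink(missing_ok=True))

# Resources entered on the stack are released in reverse order of registration.
conn = stack.enter_context(contextlib.closing(sqlite3.connect(db_path)))
conn.execute("CREATE TABLE IF NOT EXISTS entries (value TEXT)")

stack.close()  # closes the connection first, then unlinks the file
assert not db_path.exists()
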
@@ -344,13 +388,22 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
-        query_log_query = _build_enriched_query_log_query(
+    ) -> Iterable[
+        Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
+    ]:
+        query_log_query = QueryLogQueryBuilder(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
-        )
+            dedup_strategy=self.config.query_dedup_strategy,
+            database_pattern=self.filters.filter_config.database_pattern
+            if self.config.push_down_database_pattern_access_history
+            else None,
+            additional_database_names=self.config.additional_database_names_allowlist
+            if self.config.push_down_database_pattern_access_history
+            else None,
+        ).build_enriched_query_log_query()
 
         with self.structured_reporter.report_exc(
             "Error fetching query log from Snowflake"
@@ -384,7 +437,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
+    ) -> Optional[
+        Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
+    ]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -403,8 +458,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
         # TODO need to map snowflake query types to ours
         query_text: str = res["query_text"]
+        snowflake_query_type: str = res["query_type"]
         query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
+            snowflake_query_type, QueryType.UNKNOWN
         )
 
         direct_objects_accessed = res["direct_objects_accessed"]
@@ -421,7 +477,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             res["session_id"],
             timestamp,
             object_modified_by_ddl,
-            res["query_type"],
+            snowflake_query_type,
         )
         if known_ddl_entry:
             return known_ddl_entry
@@ -436,6 +492,16 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 res["user_name"], users.get(res["user_name"])
             )
         )
+        extra_info = {
+            "snowflake_query_id": res["query_id"],
+            "snowflake_root_query_id": res["root_query_id"],
+            "snowflake_query_type": res["query_type"],
+            "snowflake_role_name": res["role_name"],
+            "query_duration": res["query_duration"],
+            "rows_inserted": res["rows_inserted"],
+            "rows_updated": res["rows_updated"],
+            "rows_deleted": res["rows_deleted"],
+        }
 
         # There are a couple cases when we'd want to prefer our own SQL parsing
         # over Snowflake's metadata.
@@ -470,6 +536,18 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 query_hash=get_query_fingerprint(
                     query_text, self.identifiers.platform, fast=True
                 ),
+                extra_info=extra_info,
+            )
+
+        if snowflake_query_type == "CALL" and res["root_query_id"] is None:
+            return StoredProcCall(
+                # This is the top-level query ID that other entries will reference.
+                snowflake_root_query_id=res["query_id"],
+                query_text=query_text,
+                timestamp=timestamp,
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
             )
 
         upstreams = []
@@ -556,6 +634,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             timestamp=timestamp,
             session_id=res["session_id"],
             query_type=query_type,
+            extra_info=extra_info,
         )
         return entry
 
@@ -652,69 +731,253 @@ class SnowflakeQueriesSource(Source):
     def close(self) -> None:
         self.connection.close()
         self.queries_extractor.close()
+        super().close()
 
 
-
-
+class QueryLogQueryBuilder:
+    def __init__(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+        bucket_duration: BucketDuration,
+        deny_usernames: Optional[List[str]],
+        max_tables_per_query: int = 20,
+        dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
+        database_pattern: Optional[AllowDenyPattern] = None,
+        additional_database_names: Optional[List[str]] = None,
+    ):
+        self.start_time = start_time
+        self.end_time = end_time
+        self.start_time_millis = int(start_time.timestamp() * 1000)
+        self.end_time_millis = int(end_time.timestamp() * 1000)
+        self.max_tables_per_query = max_tables_per_query
+        self.dedup_strategy = dedup_strategy
+
+        self.users_filter = "TRUE"
+        if deny_usernames:
+            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
+            self.users_filter = f"user_name NOT IN ({user_not_in})"
+
+        self.access_history_database_filter = (
+            self._build_access_history_database_filter_condition(
+                database_pattern, additional_database_names
+            )
+        )
 
+        self.time_bucket_size = bucket_duration.value
+        assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
 
-def _build_enriched_query_log_query(
-    start_time: datetime,
-    end_time: datetime,
-    bucket_duration: BucketDuration,
-    deny_usernames: Optional[List[str]],
-) -> str:
-    start_time_millis = int(start_time.timestamp() * 1000)
-    end_time_millis = int(end_time.timestamp() * 1000)
+    def _build_access_history_database_filter_condition(
+        self,
+        database_pattern: Optional[AllowDenyPattern],
+        additional_database_names: Optional[List[str]] = None,
+    ) -> str:
+        """
+        Build a SQL WHERE condition for database filtering in access_history based on AllowDenyPattern.
+
+        IMPORTANT: This function handles the fundamental difference between DML and DDL operations in Snowflake's
+        access_history table:
+
+        - DML Operations (SELECT, INSERT, UPDATE, DELETE, etc.): Store accessed/modified objects in the
+          `direct_objects_accessed` and `objects_modified` arrays
+        - DDL Operations (CREATE, ALTER, DROP, RENAME, etc.): Store modified objects in the
+          `object_modified_by_ddl` field (single object, not an array)
+
+        Without checking `object_modified_by_ddl`, DDL operations like "ALTER TABLE person_info RENAME TO person_info_final"
+        would be incorrectly filtered out because they don't populate the DML arrays, causing missing lineage
+        and operational metadata.
+
+        Filtering Logic:
+        A query is included if it matches:
+        - Any database name in additional_database_names (exact match), OR
+        - Any database pattern in database_pattern.allow AND NOT any pattern in database_pattern.deny
+
+        Args:
+            database_pattern: The AllowDenyPattern configuration for database filtering
+            additional_database_names: Additional database names to always include (no pattern matching)
+
+        Returns:
+            A SQL WHERE condition string, or "TRUE" if no filtering should be applied
+        """
+        if not database_pattern and not additional_database_names:
+            return "TRUE"
+
+        # Build the database filter conditions
+        # Logic: Allow if (matches additional_database_names_allowlist) OR (matches database_pattern.allow AND NOT matches database_pattern.deny)
+        # Note: Using UPPER() + RLIKE for case-insensitive matching is more performant than REGEXP_LIKE with 'i' flag
+
+        # Build additional database names condition (exact matches) - these always get included
+        additional_db_condition = None
+        if additional_database_names:
+            additional_db_conditions = []
+            for db_name in additional_database_names:
+                # Escape single quotes
+                escaped_db_name = db_name.replace("'", "''")
+                additional_db_conditions.append(
+                    f"SPLIT_PART(UPPER(o:objectName), '.', 1) = '{escaped_db_name.upper()}'"
+                )
+            if additional_db_conditions:
+                additional_db_condition = " OR ".join(additional_db_conditions)
+
+        # Build database pattern condition (allow AND NOT deny)
+        database_pattern_condition = None
+        if database_pattern:
+            allow_patterns = database_pattern.allow
+            deny_patterns = database_pattern.deny
+
+            pattern_parts = []
+
+            # Add allow patterns (if not the default "allow all")
+            if allow_patterns and allow_patterns != [".*"]:
+                allow_conditions = []
+                for pattern in allow_patterns:
+                    # Escape single quotes that might be present in the regex pattern
+                    escaped_pattern = pattern.replace("'", "''")
+                    allow_conditions.append(
+                        f"SPLIT_PART(UPPER(o:objectName), '.', 1) RLIKE '{escaped_pattern}'"
+                    )
+                if allow_conditions:
+                    pattern_parts.append(
+                        allow_conditions[0]
+                        if len(allow_conditions) == 1
+                        else f"({' OR '.join(allow_conditions)})"
+                    )
 
-    users_filter = ""
-    if deny_usernames:
-        user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-        users_filter = f"user_name NOT IN ({user_not_in})"
+            # Add deny patterns
+            if deny_patterns:
+                deny_conditions = []
+                for pattern in deny_patterns:
+                    # Escape single quotes that might be present in the regex pattern
+                    escaped_pattern = pattern.replace("'", "''")
+                    deny_conditions.append(
+                        f"SPLIT_PART(UPPER(o:objectName), '.', 1) NOT RLIKE '{escaped_pattern}'"
+                    )
+                if deny_conditions:
+                    pattern_parts.append(
+                        deny_conditions[0]
+                        if len(deny_conditions) == 1
+                        else f"({' AND '.join(deny_conditions)})"
+                    )
 
-    time_bucket_size = bucket_duration.value
-    assert time_bucket_size in ("HOUR", "DAY", "MONTH")
+            if pattern_parts:
+                database_pattern_condition = " AND ".join(pattern_parts)
 
-    return f"""\
+        # Combine conditions: additional_database_names OR database_pattern
+        filter_conditions = []
+        if additional_db_condition:
+            filter_conditions.append(
+                f"({additional_db_condition})"
+                if len(additional_db_condition.split(" OR ")) > 1
+                else additional_db_condition
+            )
+        if database_pattern_condition:
+            filter_conditions.append(
+                f"({database_pattern_condition})"
+                if len(database_pattern_condition.split(" AND ")) > 1
+                else database_pattern_condition
+            )
+
+        if filter_conditions:
+            database_filter_condition = (
+                filter_conditions[0]
+                if len(filter_conditions) == 1
+                else " OR ".join(filter_conditions)
+            )
+
+            # Build a condition that checks if any objects in the arrays match the database pattern
+            # This implements "at least one" matching behavior: queries are allowed if they touch
+            # at least one database that matches the pattern, even if they also touch other databases
+            # Use ARRAY_SIZE with FILTER which is more compatible with Snowflake
+            direct_objects_condition = f"ARRAY_SIZE(FILTER(direct_objects_accessed, o -> {database_filter_condition})) > 0"
+            objects_modified_condition = f"ARRAY_SIZE(FILTER(objects_modified, o -> {database_filter_condition})) > 0"
+
+            # CRITICAL: Handle DDL operations by checking object_modified_by_ddl field
+            # DDL operations like ALTER TABLE RENAME store their data here instead of in the arrays
+            # We need to adapt the filter condition for a single object rather than an array
+            ddl_filter_condition = database_filter_condition.replace(
+                "o:objectName", "object_modified_by_ddl:objectName"
+            )
+            object_modified_by_ddl_condition = f"({ddl_filter_condition})"
+
+            return f"({direct_objects_condition} OR {objects_modified_condition} OR {object_modified_by_ddl_condition})"
+        else:
+            return "TRUE"
+
+    def _query_fingerprinted_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            secondary_fingerprint_sql = """
+            CASE
+                WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+                -- Extract project id and hash it
+                THEN CAST(HASH(
+                    REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                    REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+                ) AS VARCHAR)
+                ELSE NULL
+            END"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            secondary_fingerprint_sql = "NULL"
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
+        return f"""
+    SELECT *,
+        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        {secondary_fingerprint_sql} as query_secondary_fingerprint
+    FROM
+        snowflake.account_usage.query_history
+    WHERE
+        query_history.start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_history.start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND execution_status = 'SUCCESS'
+        AND {self.users_filter}"""
+
+    def _query_deduplicated_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
+    FROM
+        fingerprinted_queries
+    QUALIFY
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        1 AS query_count,
+    FROM
+        fingerprinted_queries"""
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
+
+    def build_enriched_query_log_query(self) -> str:
+        return f"""\
 WITH
 fingerprinted_queries as (
-    SELECT *,
-        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint,
-        -- Optional and additional hash to be used for query deduplication and final query identity
-        CASE
-            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
-            -- Extract project id and hash it
-            THEN CAST(HASH(
-                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
-                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
-            ) AS VARCHAR)
-            ELSE NULL
-        END as query_secondary_fingerprint
-    FROM
-        snowflake.account_usage.query_history
-    WHERE
-        query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
-        AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND execution_status = 'SUCCESS'
-        AND {users_filter or "TRUE"}
+{self._query_fingerprinted_queries()}
 )
 , deduplicated_queries as (
-    SELECT
-        *,
-        DATE_TRUNC(
-            {time_bucket_size},
-            CONVERT_TIMEZONE('UTC', start_time)
-        ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
-    FROM
-        fingerprinted_queries
-    QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
+{self._query_deduplicated_queries()}
 )
 , raw_access_history AS (
     SELECT
         query_id,
+        root_query_id,
         query_start_time,
         user_name,
         direct_objects_accessed,
@@ -723,21 +986,23 @@ fingerprinted_queries as (
     FROM
         snowflake.account_usage.access_history
     WHERE
-        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
-        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND {users_filter or "TRUE"}
+        query_start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND {self.users_filter}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
+        AND {self.access_history_database_filter}
 )
 , filtered_access_history AS (
     -- TODO: Add table filter clause.
     SELECT
         query_id,
+        root_query_id,
         query_start_time,
         ARRAY_SLICE(
             FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
-            0, {
+            0, {self.max_tables_per_query}
         ) as direct_objects_accessed,
         -- TODO: Drop the columns.baseSources subfield.
         FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified,
@@ -764,6 +1029,7 @@ fingerprinted_queries as (
         q.rows_deleted AS "ROWS_DELETED",
         q.user_name AS "USER_NAME",
         q.role_name AS "ROLE_NAME",
+        a.root_query_id,
         a.direct_objects_accessed,
         a.objects_modified,
         a.object_modified_by_ddl
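
To inspect the pushed-down filter that ends up in the raw_access_history CTE above, the new QueryLogQueryBuilder can be driven on its own. A rough sketch; the database names are invented, and AllowDenyPattern / BucketDuration are assumed to come from datahub's usual configuration modules, which this diff does not show:

from datetime import datetime, timezone

from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.time_window_config import BucketDuration
from datahub.ingestion.source.snowflake.snowflake_queries import QueryLogQueryBuilder

builder = QueryLogQueryBuilder(
    start_time=datetime(2025, 1, 1, tzinfo=timezone.utc),
    end_time=datetime(2025, 1, 2, tzinfo=timezone.utc),
    bucket_duration=BucketDuration.DAY,
    deny_usernames=None,
    database_pattern=AllowDenyPattern(allow=["ANALYTICS"], deny=["ANALYTICS_STAGING"]),
    additional_database_names=["SCRATCH"],
)

# The ARRAY_SIZE(FILTER(...)) / object_modified_by_ddl condition described in the
# docstring, appended to raw_access_history as an extra AND clause.
print(builder.access_history_database_filter)

# The complete enriched query log SQL, including the pushed-down filter.
print(builder.build_enriched_query_log_query())
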
datahub/ingestion/source/snowflake/snowflake_summary.py

@@ -20,6 +20,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema_gen import (
     SnowflakeSchemaGenerator,
 )
 from datahub.ingestion.source.snowflake.snowflake_utils import (
+    SnowflakeFilter,
     SnowflakeIdentifierBuilder,
 )
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
@@ -81,6 +82,10 @@ class SnowflakeSummarySource(Source):
             profiler=None,
             aggregator=None,
             snowsight_url_builder=None,
+            filters=SnowflakeFilter(
+                filter_config=self.config,
+                structured_reporter=self.report,
+            ),
         )
 
         # Databases.
datahub/ingestion/source/snowflake/snowflake_usage_v2.py

@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
 
         with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
             for row in results:
-                with fetch_timer.pause(), self.report.usage_aggregation.result_skip_timer as skip_timer:
+                with (
+                    fetch_timer.pause(),
+                    self.report.usage_aggregation.result_skip_timer as skip_timer,
+                ):
                     if results.rownumber is not None and results.rownumber % 1000 == 0:
                         logger.debug(f"Processing usage row number {results.rownumber}")
                         logger.debug(self.report.usage_aggregation.as_string())
@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
                             f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
                         )
                         continue
-                    with skip_timer.pause(), self.report.usage_aggregation.result_map_timer as map_timer:
+                    with (
+                        skip_timer.pause(),
+                        self.report.usage_aggregation.result_map_timer as map_timer,
+                    ):
                         wu = self.build_usage_statistics_for_dataset(
                             dataset_identifier, row
                         )
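
The two usage hunks above only re-wrap an existing multi-item with statement in parentheses; the underlying pattern is an outer timer that pauses while a nested timer runs. A toy sketch of that pattern with PerfTimer; the timer names are invented, pause() being usable as a context manager is taken from the code above, and elapsed_seconds() is assumed from the rest of the library:

import time

from datahub.utilities.perf_timer import PerfTimer

fetch_timer = PerfTimer()
process_timer = PerfTimer()

with fetch_timer:
    time.sleep(0.01)  # counted against fetch_timer
    with (
        fetch_timer.pause(),  # stop the outer clock while the inner work runs
        process_timer,
    ):
        time.sleep(0.01)  # counted against process_timer only

print(fetch_timer.elapsed_seconds(), process_timer.elapsed_seconds())
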
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -325,15 +325,10 @@ class SnowflakeIdentifierBuilder:
         user_email: Optional[str],
     ) -> str:
         if user_email:
-            return self.snowflake_identifier(
-                user_email
-                if self.identifier_config.email_as_user_identifier is True
-                else user_email.split("@")[0]
-            )
+            return self.snowflake_identifier(user_email)
         return self.snowflake_identifier(
             f"{user_name}@{self.identifier_config.email_domain}"
-            if self.identifier_config.email_as_user_identifier is True
-            and self.identifier_config.email_domain is not None
+            if self.identifier_config.email_domain is not None
             else user_name
         )
 