acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -28,6 +28,7 @@ from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     DEFAULT_TEMP_TABLES_PATTERNS,
+    QueryDedupStrategyType,
     SnowflakeFilterConfig,
     SnowflakeIdentifierConfig,
 )
@@ -44,6 +45,11 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
 )
+from datahub.ingestion.source.snowflake.stored_proc_lineage import (
+    StoredProcCall,
+    StoredProcLineageReport,
+    StoredProcLineageTracker,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -63,7 +69,10 @@ from datahub.sql_parsing.sqlglot_lineage import (
     DownstreamColumnRef,
 )
 from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
-from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
+from datahub.utilities.file_backed_collections import (
+    ConnectionWrapper,
+    FileBackedList,
+)
 from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
@@ -110,6 +119,22 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
     include_query_usage_statistics: bool = True
     include_operations: bool = True
 
+    push_down_database_pattern_access_history: bool = pydantic.Field(
+        default=False,
+        description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
+        "This filters on the accessed objects in access_history.",
+    )
+
+    additional_database_names_allowlist: List[str] = pydantic.Field(
+        default=[],
+        description="Additional database names (no pattern matching) to be included in the access_history filter. "
+        "Only applies if push_down_database_pattern_access_history=True. "
+        "These databases will be included in the filter being pushed down regardless of database_pattern settings. "
+        "This may be required in the case of e.g. temporary tables being created in a different database than the ones in the database_name patterns.",
+    )
+
+    query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
+
 
 class SnowflakeQueriesSourceConfig(
     SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig
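
Note: the three fields above are plain pydantic fields with defaults, so they can be exercised directly. A minimal sketch (constructing the config in isolation is an assumption; only the field names, types, and defaults come from the hunk above, and "SCRATCH_DB" is a hypothetical database name):

    from datahub.ingestion.source.snowflake.snowflake_config import QueryDedupStrategyType
    from datahub.ingestion.source.snowflake.snowflake_queries import (
        SnowflakeQueriesExtractorConfig,
    )

    config = SnowflakeQueriesExtractorConfig(
        # Push the database allow/deny patterns into the access_history scan.
        push_down_database_pattern_access_history=True,
        # Always include this database in the pushed-down filter, e.g. when
        # temp tables are created outside the databases in database_pattern.
        additional_database_names_allowlist=["SCRATCH_DB"],
        query_dedup_strategy=QueryDedupStrategyType.STANDARD,
    )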
@@ -124,7 +149,10 @@ class SnowflakeQueriesExtractorReport(Report):
     users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+
     sql_aggregator: Optional[SqlAggregatorReport] = None
+    stored_proc_lineage: Optional[StoredProcLineageReport] = None
 
     num_ddl_queries_dropped: int = 0
     num_stream_queries_observed: int = 0
@@ -243,6 +271,12 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         audit_log_file = self.local_temp_path / "audit_log.sqlite"
         use_cached_audit_log = audit_log_file.exists()
 
+        if self.config.local_temp_path is None:
+            self._exit_stack.callback(lambda: audit_log_file.unlink(missing_ok=True))
+
+        shared_connection = self._exit_stack.enter_context(
+            ConnectionWrapper(audit_log_file)
+        )
         queries: FileBackedList[
             Union[
                 KnownLineageMapping,
@@ -250,44 +284,54 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 TableRename,
                 TableSwap,
                 ObservedQuery,
+                StoredProcCall,
             ]
-        ]
+        ] = self._exit_stack.enter_context(FileBackedList(shared_connection))
+
         if use_cached_audit_log:
-            logger.info("Using cached audit log")
-            shared_connection = ConnectionWrapper(audit_log_file)
-            queries = FileBackedList(shared_connection)
+            logger.info(f"Using cached audit log at {audit_log_file}")
         else:
-            audit_log_file.unlink(missing_ok=True)
-
-            shared_connection = ConnectionWrapper(audit_log_file)
-            queries = FileBackedList(shared_connection)
-            entry: Union[
-                KnownLineageMapping,
-                PreparsedQuery,
-                TableRename,
-                TableSwap,
-                ObservedQuery,
-            ]
+            logger.info(f"Fetching audit log into {audit_log_file}")
 
             with self.report.copy_history_fetch_timer:
-                for entry in self.fetch_copy_history():
-                    queries.append(entry)
+                for copy_entry in self.fetch_copy_history():
+                    queries.append(copy_entry)
 
             with self.report.query_log_fetch_timer:
                 for entry in self.fetch_query_log(users):
                     queries.append(entry)
 
+        stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
+            StoredProcLineageTracker(
+                platform=self.identifiers.platform,
+                shared_connection=shared_connection,
+            )
+        )
+        self.report.stored_proc_lineage = stored_proc_tracker.report
+
         with self.report.audit_log_load_timer:
             for i, query in enumerate(queries):
                 if i % 1000 == 0:
                     logger.info(f"Added {i} query log entries to SQL aggregator")
-                self.aggregator.add(query)
 
-        yield from auto_workunit(self.aggregator.gen_metadata())
-        if not use_cached_audit_log:
-            queries.close()
-            shared_connection.close()
-            audit_log_file.unlink(missing_ok=True)
+                if isinstance(query, StoredProcCall):
+                    stored_proc_tracker.add_stored_proc_call(query)
+                    continue
+
+                if not (
+                    isinstance(query, PreparsedQuery)
+                    and stored_proc_tracker.add_related_query(query)
+                ):
+                    # Only add to aggregator if it's not part of a stored procedure.
+                    self.aggregator.add(query)
+
+        # Generate and add stored procedure lineage entries.
+        for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
+            # TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
+            self.aggregator.add(lineage_entry)
+
+        with self.report.aggregator_generate_timer:
+            yield from auto_workunit(self.aggregator.gen_metadata())
 
     def fetch_users(self) -> UsersMapping:
        users: UsersMapping = dict()
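
Note: the rewrite above swaps ad-hoc close()/unlink() calls for an ExitStack, so the SQLite-backed audit log is cleaned up even when iteration raises. A toy model of the pattern using only Python's stdlib (not DataHub code; a plain file handle stands in for ConnectionWrapper):

    import contextlib
    from pathlib import Path

    class AuditLogDemo:
        def __init__(self) -> None:
            # Cleanups registered here run in LIFO order when close() is called.
            self._exit_stack = contextlib.ExitStack()

        def run(self, audit_log_file: Path) -> None:
            # Mirrors the `local_temp_path is None` branch: delete the scratch file on close.
            self._exit_stack.callback(lambda: audit_log_file.unlink(missing_ok=True))
            # enter_context() keeps the resource open until close(), even on error paths.
            handle = self._exit_stack.enter_context(audit_log_file.open("w"))
            handle.write("cached audit log entries...")

        def close(self) -> None:
            self._exit_stack.close()

    demo = AuditLogDemo()
    demo.run(Path("audit_log.sqlite"))
    demo.close()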
@@ -344,13 +388,22 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
-        query_log_query = _build_enriched_query_log_query(
+    ) -> Iterable[
+        Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
+    ]:
+        query_log_query = QueryLogQueryBuilder(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
-        )
+            dedup_strategy=self.config.query_dedup_strategy,
+            database_pattern=self.filters.filter_config.database_pattern
+            if self.config.push_down_database_pattern_access_history
+            else None,
+            additional_database_names=self.config.additional_database_names_allowlist
+            if self.config.push_down_database_pattern_access_history
+            else None,
+        ).build_enriched_query_log_query()
 
         with self.structured_reporter.report_exc(
             "Error fetching query log from Snowflake"
@@ -384,7 +437,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
+    ) -> Optional[
+        Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
+    ]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -403,8 +458,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
         # TODO need to map snowflake query types to ours
         query_text: str = res["query_text"]
+        snowflake_query_type: str = res["query_type"]
         query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
-            res["query_type"], QueryType.UNKNOWN
+            snowflake_query_type, QueryType.UNKNOWN
         )
 
         direct_objects_accessed = res["direct_objects_accessed"]
@@ -421,7 +477,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             res["session_id"],
             timestamp,
             object_modified_by_ddl,
-            res["query_type"],
+            snowflake_query_type,
         )
         if known_ddl_entry:
             return known_ddl_entry
@@ -436,6 +492,16 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 res["user_name"], users.get(res["user_name"])
             )
         )
+        extra_info = {
+            "snowflake_query_id": res["query_id"],
+            "snowflake_root_query_id": res["root_query_id"],
+            "snowflake_query_type": res["query_type"],
+            "snowflake_role_name": res["role_name"],
+            "query_duration": res["query_duration"],
+            "rows_inserted": res["rows_inserted"],
+            "rows_updated": res["rows_updated"],
+            "rows_deleted": res["rows_deleted"],
+        }
 
         # There are a couple cases when we'd want to prefer our own SQL parsing
         # over Snowflake's metadata.
@@ -470,6 +536,18 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 query_hash=get_query_fingerprint(
                     query_text, self.identifiers.platform, fast=True
                 ),
+                extra_info=extra_info,
+            )
+
+        if snowflake_query_type == "CALL" and res["root_query_id"] is None:
+            return StoredProcCall(
+                # This is the top-level query ID that other entries will reference.
+                snowflake_root_query_id=res["query_id"],
+                query_text=query_text,
+                timestamp=timestamp,
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
             )
 
         upstreams = []
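
Note: per the comment in the hunk, a CALL row with no root_query_id is the top-level stored-procedure invocation, and the statements executed inside it reference that query ID via their own root_query_id. Illustrative rows (IDs hypothetical):

    # query_id="01aa", root_query_id=None,   query_type="CALL"    -> returned as StoredProcCall
    # query_id="01ab", root_query_id="01aa", query_type="INSERT"  -> add_related_query() folds it
    #                                                                into the CALL's merged lineage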
@@ -556,6 +634,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             timestamp=timestamp,
             session_id=res["session_id"],
             query_type=query_type,
+            extra_info=extra_info,
         )
         return entry
 
@@ -652,69 +731,253 @@ class SnowflakeQueriesSource(Source):
     def close(self) -> None:
         self.connection.close()
         self.queries_extractor.close()
+        super().close()
 
 
-# Make sure we don't try to generate too much info for a single query.
-_MAX_TABLES_PER_QUERY = 20
+class QueryLogQueryBuilder:
+    def __init__(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+        bucket_duration: BucketDuration,
+        deny_usernames: Optional[List[str]],
+        max_tables_per_query: int = 20,
+        dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
+        database_pattern: Optional[AllowDenyPattern] = None,
+        additional_database_names: Optional[List[str]] = None,
+    ):
+        self.start_time = start_time
+        self.end_time = end_time
+        self.start_time_millis = int(start_time.timestamp() * 1000)
+        self.end_time_millis = int(end_time.timestamp() * 1000)
+        self.max_tables_per_query = max_tables_per_query
+        self.dedup_strategy = dedup_strategy
+
+        self.users_filter = "TRUE"
+        if deny_usernames:
+            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
+            self.users_filter = f"user_name NOT IN ({user_not_in})"
+
+        self.access_history_database_filter = (
+            self._build_access_history_database_filter_condition(
+                database_pattern, additional_database_names
+            )
+        )
 
+        self.time_bucket_size = bucket_duration.value
+        assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
 
-def _build_enriched_query_log_query(
-    start_time: datetime,
-    end_time: datetime,
-    bucket_duration: BucketDuration,
-    deny_usernames: Optional[List[str]],
-) -> str:
-    start_time_millis = int(start_time.timestamp() * 1000)
-    end_time_millis = int(end_time.timestamp() * 1000)
+    def _build_access_history_database_filter_condition(
+        self,
+        database_pattern: Optional[AllowDenyPattern],
+        additional_database_names: Optional[List[str]] = None,
+    ) -> str:
+        """
+        Build a SQL WHERE condition for database filtering in access_history based on AllowDenyPattern.
+
+        IMPORTANT: This function handles the fundamental difference between DML and DDL operations in Snowflake's
+        access_history table:
+
+        - DML Operations (SELECT, INSERT, UPDATE, DELETE, etc.): Store accessed/modified objects in the
+          `direct_objects_accessed` and `objects_modified` arrays
+        - DDL Operations (CREATE, ALTER, DROP, RENAME, etc.): Store modified objects in the
+          `object_modified_by_ddl` field (single object, not an array)
+
+        Without checking `object_modified_by_ddl`, DDL operations like "ALTER TABLE person_info RENAME TO person_info_final"
+        would be incorrectly filtered out because they don't populate the DML arrays, causing missing lineage
+        and operational metadata.
+
+        Filtering Logic:
+        A query is included if it matches:
+        - Any database name in additional_database_names (exact match), OR
+        - Any database pattern in database_pattern.allow AND NOT any pattern in database_pattern.deny
+
+        Args:
+            database_pattern: The AllowDenyPattern configuration for database filtering
+            additional_database_names: Additional database names to always include (no pattern matching)
+
+        Returns:
+            A SQL WHERE condition string, or "TRUE" if no filtering should be applied
+        """
+        if not database_pattern and not additional_database_names:
+            return "TRUE"
+
+        # Build the database filter conditions
+        # Logic: Allow if (matches additional_database_names_allowlist) OR (matches database_pattern.allow AND NOT matches database_pattern.deny)
+        # Note: Using UPPER() + RLIKE for case-insensitive matching is more performant than REGEXP_LIKE with 'i' flag
+
+        # Build additional database names condition (exact matches) - these always get included
+        additional_db_condition = None
+        if additional_database_names:
+            additional_db_conditions = []
+            for db_name in additional_database_names:
+                # Escape single quotes
+                escaped_db_name = db_name.replace("'", "''")
+                additional_db_conditions.append(
+                    f"SPLIT_PART(UPPER(o:objectName), '.', 1) = '{escaped_db_name.upper()}'"
+                )
+            if additional_db_conditions:
+                additional_db_condition = " OR ".join(additional_db_conditions)
+
+        # Build database pattern condition (allow AND NOT deny)
+        database_pattern_condition = None
+        if database_pattern:
+            allow_patterns = database_pattern.allow
+            deny_patterns = database_pattern.deny
+
+            pattern_parts = []
+
+            # Add allow patterns (if not the default "allow all")
+            if allow_patterns and allow_patterns != [".*"]:
+                allow_conditions = []
+                for pattern in allow_patterns:
+                    # Escape single quotes that might be present in the regex pattern
+                    escaped_pattern = pattern.replace("'", "''")
+                    allow_conditions.append(
+                        f"SPLIT_PART(UPPER(o:objectName), '.', 1) RLIKE '{escaped_pattern}'"
+                    )
+                if allow_conditions:
+                    pattern_parts.append(
+                        allow_conditions[0]
+                        if len(allow_conditions) == 1
+                        else f"({' OR '.join(allow_conditions)})"
+                    )
 
-    users_filter = ""
-    if deny_usernames:
-        user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-        users_filter = f"user_name NOT IN ({user_not_in})"
+            # Add deny patterns
+            if deny_patterns:
+                deny_conditions = []
+                for pattern in deny_patterns:
+                    # Escape single quotes that might be present in the regex pattern
+                    escaped_pattern = pattern.replace("'", "''")
+                    deny_conditions.append(
+                        f"SPLIT_PART(UPPER(o:objectName), '.', 1) NOT RLIKE '{escaped_pattern}'"
+                    )
+                if deny_conditions:
+                    pattern_parts.append(
+                        deny_conditions[0]
+                        if len(deny_conditions) == 1
+                        else f"({' AND '.join(deny_conditions)})"
+                    )
 
-    time_bucket_size = bucket_duration.value
-    assert time_bucket_size in ("HOUR", "DAY", "MONTH")
+            if pattern_parts:
+                database_pattern_condition = " AND ".join(pattern_parts)
 
-    return f"""\
+        # Combine conditions: additional_database_names OR database_pattern
+        filter_conditions = []
+        if additional_db_condition:
+            filter_conditions.append(
+                f"({additional_db_condition})"
+                if len(additional_db_condition.split(" OR ")) > 1
+                else additional_db_condition
+            )
+        if database_pattern_condition:
+            filter_conditions.append(
+                f"({database_pattern_condition})"
+                if len(database_pattern_condition.split(" AND ")) > 1
+                else database_pattern_condition
+            )
+
+        if filter_conditions:
+            database_filter_condition = (
+                filter_conditions[0]
+                if len(filter_conditions) == 1
+                else " OR ".join(filter_conditions)
+            )
+
+            # Build a condition that checks if any objects in the arrays match the database pattern
+            # This implements "at least one" matching behavior: queries are allowed if they touch
+            # at least one database that matches the pattern, even if they also touch other databases
+            # Use ARRAY_SIZE with FILTER which is more compatible with Snowflake
+            direct_objects_condition = f"ARRAY_SIZE(FILTER(direct_objects_accessed, o -> {database_filter_condition})) > 0"
+            objects_modified_condition = f"ARRAY_SIZE(FILTER(objects_modified, o -> {database_filter_condition})) > 0"
+
+            # CRITICAL: Handle DDL operations by checking object_modified_by_ddl field
+            # DDL operations like ALTER TABLE RENAME store their data here instead of in the arrays
+            # We need to adapt the filter condition for a single object rather than an array
+            ddl_filter_condition = database_filter_condition.replace(
+                "o:objectName", "object_modified_by_ddl:objectName"
+            )
+            object_modified_by_ddl_condition = f"({ddl_filter_condition})"
+
+            return f"({direct_objects_condition} OR {objects_modified_condition} OR {object_modified_by_ddl_condition})"
+        else:
+            return "TRUE"
+
+    def _query_fingerprinted_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            secondary_fingerprint_sql = """
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            secondary_fingerprint_sql = "NULL"
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
+        return f"""
+    SELECT *,
+        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        {secondary_fingerprint_sql} as query_secondary_fingerprint
+    FROM
+        snowflake.account_usage.query_history
+    WHERE
+        query_history.start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_history.start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND execution_status = 'SUCCESS'
+        AND {self.users_filter}"""
+
+    def _query_deduplicated_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
+    FROM
+        fingerprinted_queries
+    QUALIFY
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        1 AS query_count,
+    FROM
+        fingerprinted_queries"""
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
+
+    def build_enriched_query_log_query(self) -> str:
+        return f"""\
 WITH
 fingerprinted_queries as (
-    SELECT *,
-        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint,
-        -- Optional and additional hash to be used for query deduplication and final query identity
-        CASE
-            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
-            -- Extract project id and hash it
-            THEN CAST(HASH(
-                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
-                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
-            ) AS VARCHAR)
-            ELSE NULL
-        END as query_secondary_fingerprint
-    FROM
-        snowflake.account_usage.query_history
-    WHERE
-        query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
-        AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND execution_status = 'SUCCESS'
-        AND {users_filter or "TRUE"}
+    {self._query_fingerprinted_queries()}
 )
 , deduplicated_queries as (
-    SELECT
-        *,
-        DATE_TRUNC(
-            {time_bucket_size},
-            CONVERT_TIMEZONE('UTC', start_time)
-        ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
-    FROM
-        fingerprinted_queries
-    QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
+    {self._query_deduplicated_queries()}
 )
 , raw_access_history AS (
     SELECT
         query_id,
+        root_query_id,
         query_start_time,
         user_name,
         direct_objects_accessed,
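
Note: a hedged sketch of driving the new builder directly; the import paths for AllowDenyPattern and BucketDuration are inferred from the rest of the codebase rather than this diff, and the patterns are illustrative:

    from datetime import datetime, timezone

    from datahub.configuration.common import AllowDenyPattern
    from datahub.configuration.time_window_config import BucketDuration
    from datahub.ingestion.source.snowflake.snowflake_config import QueryDedupStrategyType

    sql = QueryLogQueryBuilder(
        start_time=datetime(2025, 1, 1, tzinfo=timezone.utc),
        end_time=datetime(2025, 1, 2, tzinfo=timezone.utc),
        bucket_duration=BucketDuration.DAY,
        deny_usernames=["LOOKER_SVC"],
        dedup_strategy=QueryDedupStrategyType.STANDARD,
        # Keep PROD_* databases except PROD_TMP; always keep SCRATCH_DB.
        database_pattern=AllowDenyPattern(allow=["PROD_.*"], deny=["PROD_TMP"]),
        additional_database_names=["SCRATCH_DB"],
    ).build_enriched_query_log_query()

With those arguments, the pushed-down predicate keeps an access_history row only if at least one entry in direct_objects_accessed or objects_modified (or, for DDL, object_modified_by_ddl:objectName) resolves to a matching database.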
@@ -723,21 +986,23 @@ fingerprinted_queries as (
     FROM
         snowflake.account_usage.access_history
     WHERE
-        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
-        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND {users_filter or "TRUE"}
+        query_start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND {self.users_filter}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
+        AND {self.access_history_database_filter}
 )
 , filtered_access_history AS (
     -- TODO: Add table filter clause.
     SELECT
         query_id,
+        root_query_id,
         query_start_time,
         ARRAY_SLICE(
             FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
-            0, {_MAX_TABLES_PER_QUERY}
+            0, {self.max_tables_per_query}
         ) as direct_objects_accessed,
         -- TODO: Drop the columns.baseSources subfield.
         FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified,
@@ -764,6 +1029,7 @@ fingerprinted_queries as (
     q.rows_deleted AS "ROWS_DELETED",
     q.user_name AS "USER_NAME",
     q.role_name AS "ROLE_NAME",
+    a.root_query_id,
     a.direct_objects_accessed,
     a.objects_modified,
     a.object_modified_by_ddl
datahub/ingestion/source/snowflake/snowflake_summary.py
@@ -20,6 +20,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema_gen import (
     SnowflakeSchemaGenerator,
 )
 from datahub.ingestion.source.snowflake.snowflake_utils import (
+    SnowflakeFilter,
     SnowflakeIdentifierBuilder,
 )
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
@@ -81,6 +82,10 @@ class SnowflakeSummarySource(Source):
             profiler=None,
             aggregator=None,
             snowsight_url_builder=None,
+            filters=SnowflakeFilter(
+                filter_config=self.config,
+                structured_reporter=self.report,
+            ),
         )
 
         # Databases.
datahub/ingestion/source/snowflake/snowflake_usage_v2.py
@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
 
         with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
             for row in results:
-                with fetch_timer.pause(), self.report.usage_aggregation.result_skip_timer as skip_timer:
+                with (
+                    fetch_timer.pause(),
+                    self.report.usage_aggregation.result_skip_timer as skip_timer,
+                ):
                     if results.rownumber is not None and results.rownumber % 1000 == 0:
                         logger.debug(f"Processing usage row number {results.rownumber}")
                         logger.debug(self.report.usage_aggregation.as_string())
@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
                         f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
                     )
                     continue
-                with skip_timer.pause(), self.report.usage_aggregation.result_map_timer as map_timer:
+                with (
+                    skip_timer.pause(),
+                    self.report.usage_aggregation.result_map_timer as map_timer,
+                ):
                     wu = self.build_usage_statistics_for_dataset(
                         dataset_identifier, row
                     )
datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -325,15 +325,10 @@ class SnowflakeIdentifierBuilder:
         user_email: Optional[str],
     ) -> str:
         if user_email:
-            return self.snowflake_identifier(
-                user_email
-                if self.identifier_config.email_as_user_identifier is True
-                else user_email.split("@")[0]
-            )
+            return self.snowflake_identifier(user_email)
         return self.snowflake_identifier(
             f"{user_name}@{self.identifier_config.email_domain}"
-            if self.identifier_config.email_as_user_identifier is True
-            and self.identifier_config.email_domain is not None
+            if self.identifier_config.email_domain is not None
             else user_name
         )
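
Note: the net effect of this last hunk is that email_as_user_identifier no longer gates the identifier: an available email is now always used as-is, and the email_domain fallback for bare usernames is unchanged. Roughly (illustrative values):

    # before, email_as_user_identifier=False: "jane@corp.com" -> "jane"
    # after:                                   "jane@corp.com" -> "jane@corp.com" (always)
    # no email, email_domain="corp.com":       "jane"          -> "jane@corp.com" (unchanged)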