acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -13,13 +13,15 @@ from typing import Any, Dict, Iterable, List, Optional, Union
13
13
  import pydantic
14
14
  from typing_extensions import Self
15
15
 
16
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
16
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
17
17
  from datahub.configuration.time_window_config import (
18
18
  BaseTimeWindowConfig,
19
19
  BucketDuration,
20
+ get_time_bucket,
20
21
  )
21
22
  from datahub.ingestion.api.closeable import Closeable
22
23
  from datahub.ingestion.api.common import PipelineContext
24
+ from datahub.ingestion.api.decorators import SupportStatus, config_class, support_status
23
25
  from datahub.ingestion.api.report import Report
24
26
  from datahub.ingestion.api.source import Source, SourceReport
25
27
  from datahub.ingestion.api.source_helpers import auto_workunit
@@ -28,6 +30,7 @@ from datahub.ingestion.graph.client import DataHubGraph
28
30
  from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
29
31
  from datahub.ingestion.source.snowflake.snowflake_config import (
30
32
  DEFAULT_TEMP_TABLES_PATTERNS,
33
+ QueryDedupStrategyType,
31
34
  SnowflakeFilterConfig,
32
35
  SnowflakeIdentifierConfig,
33
36
  )
@@ -44,6 +47,14 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
44
47
  SnowflakeIdentifierBuilder,
45
48
  SnowflakeStructuredReportMixin,
46
49
  )
50
+ from datahub.ingestion.source.snowflake.stored_proc_lineage import (
51
+ StoredProcCall,
52
+ StoredProcLineageReport,
53
+ StoredProcLineageTracker,
54
+ )
55
+ from datahub.ingestion.source.state.redundant_run_skip_handler import (
56
+ RedundantQueriesRunSkipHandler,
57
+ )
47
58
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
48
59
  from datahub.metadata.urns import CorpUserUrn
49
60
  from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -63,7 +74,10 @@ from datahub.sql_parsing.sqlglot_lineage import (
63
74
  DownstreamColumnRef,
64
75
  )
65
76
  from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
66
- from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
77
+ from datahub.utilities.file_backed_collections import (
78
+ ConnectionWrapper,
79
+ FileBackedList,
80
+ )
67
81
  from datahub.utilities.perf_timer import PerfTimer
68
82
 
69
83
  logger = logging.getLogger(__name__)
@@ -80,10 +94,17 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
80
94
 
81
95
  pushdown_deny_usernames: List[str] = pydantic.Field(
82
96
  default=[],
83
- description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
97
+ description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
84
98
  "This is primarily useful for improving performance by filtering out users with extremely high query volumes.",
85
99
  )
86
100
 
101
+ pushdown_allow_usernames: List[str] = pydantic.Field(
102
+ default=[],
103
+ description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
104
+ "This is primarily useful for improving performance by filtering in only specific users. "
105
+ "If not specified, all users not in deny list are included.",
106
+ )
107
+
87
108
  user_email_pattern: AllowDenyPattern = pydantic.Field(
88
109
  default=AllowDenyPattern.allow_all(),
89
110
  description="Regex patterns for user emails to filter in usage.",
@@ -96,12 +117,11 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
96
117
  "to ignore the temporary staging tables created by known ETL tools.",
97
118
  )
98
119
 
99
- local_temp_path: Optional[pathlib.Path] = pydantic.Field(
100
- default=None,
101
- description="Local path to store the audit log.",
120
+ local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = pydantic.Field(
102
121
  # TODO: For now, this is simply an advanced config to make local testing easier.
103
122
  # Eventually, we will want to store date-specific files in the directory and use it as a cache.
104
- hidden_from_docs=True,
123
+ default=None,
124
+ description="Local path to store the audit log.",
105
125
  )
106
126
 
107
127
  include_lineage: bool = True
@@ -110,6 +130,22 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
110
130
  include_query_usage_statistics: bool = True
111
131
  include_operations: bool = True
112
132
 
133
+ push_down_database_pattern_access_history: bool = pydantic.Field(
134
+ default=False,
135
+ description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
136
+ "This filters on the accessed objects in access_history.",
137
+ )
138
+
139
# Exact database names that are always part of the pushed-down access_history filter.
# NOTE: the description is assembled from adjacent string literals, so every fragment
# except the last must end with a space to keep the sentences apart.
additional_database_names_allowlist: List[str] = pydantic.Field(
    default=[],
    description="Additional database names (no pattern matching) to be included in the access_history filter. "
    "Only applies if push_down_database_pattern_access_history=True. "
    "These databases will be included in the filter being pushed down regardless of database_pattern settings. "
    "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
)
146
+
147
+ query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
148
+
113
149
 
114
150
  class SnowflakeQueriesSourceConfig(
115
151
  SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig
@@ -124,9 +160,14 @@ class SnowflakeQueriesExtractorReport(Report):
124
160
  users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
125
161
 
126
162
  audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
163
+ aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
164
+
127
165
  sql_aggregator: Optional[SqlAggregatorReport] = None
166
+ stored_proc_lineage: Optional[StoredProcLineageReport] = None
128
167
 
129
168
  num_ddl_queries_dropped: int = 0
169
+ num_stream_queries_observed: int = 0
170
+ num_create_temp_view_queries_observed: int = 0
130
171
  num_users: int = 0
131
172
 
132
173
 
@@ -144,6 +185,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
144
185
  structured_report: SourceReport,
145
186
  filters: SnowflakeFilter,
146
187
  identifiers: SnowflakeIdentifierBuilder,
188
+ redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
147
189
  graph: Optional[DataHubGraph] = None,
148
190
  schema_resolver: Optional[SchemaResolver] = None,
149
191
  discovered_tables: Optional[List[str]] = None,
@@ -155,9 +197,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
155
197
  self.filters = filters
156
198
  self.identifiers = identifiers
157
199
  self.discovered_tables = set(discovered_tables) if discovered_tables else None
200
+ self.redundant_run_skip_handler = redundant_run_skip_handler
158
201
 
159
202
  self._structured_report = structured_report
160
203
 
204
+ # Adjust time window based on stateful ingestion state
205
+ self.start_time, self.end_time = self._get_time_window()
206
+
161
207
  # The exit stack helps ensure that we close all the resources we open.
162
208
  self._exit_stack = contextlib.ExitStack()
163
209
 
@@ -175,8 +221,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
175
221
  generate_query_usage_statistics=self.config.include_query_usage_statistics,
176
222
  usage_config=BaseUsageConfig(
177
223
  bucket_duration=self.config.window.bucket_duration,
178
- start_time=self.config.window.start_time,
179
- end_time=self.config.window.end_time,
224
+ start_time=self.start_time,
225
+ end_time=self.end_time,
180
226
  user_email_pattern=self.config.user_email_pattern,
181
227
  # TODO make the rest of the fields configurable
182
228
  ),
@@ -192,6 +238,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
192
238
  def structured_reporter(self) -> SourceReport:
193
239
  return self._structured_report
194
240
 
241
def _get_time_window(self) -> tuple[datetime, datetime]:
    """Work out the (start, end) window this extraction run should query.

    When a redundant-run skip handler is present it is asked to suggest a
    window based on the configured one; otherwise the configured window is
    used unchanged.

    Returns:
        The start and end datetimes, in that order.
    """
    window = self.config.window
    skip_handler = self.redundant_run_skip_handler

    if skip_handler:
        begin, finish = skip_handler.suggest_run_time_window(
            window.start_time,
            window.end_time,
        )
    else:
        begin, finish = window.start_time, window.end_time

    # Usage statistics are aggregated per bucket (typically per day), so the
    # start is snapped back to the beginning of its bucket to make sure only
    # complete bucket periods are included in the aggregated metrics.
    if self.config.include_usage_statistics:
        begin = get_time_bucket(begin, window.bucket_duration)

    return begin, finish
260
+
261
def _update_state(self) -> None:
    """Report the configured window to the redundant-run skip handler, if any.

    Does nothing when no handler was supplied. The values passed are the
    configured ``window.start_time`` / ``window.end_time`` / ``window.bucket_duration``,
    not the ``self.start_time`` / ``self.end_time`` pair computed for this run.
    """
    skip_handler = self.redundant_run_skip_handler
    if not skip_handler:
        return

    window = self.config.window
    skip_handler.update_state(
        window.start_time,
        window.end_time,
        window.bucket_duration,
    )
268
+
195
269
  @functools.cached_property
196
270
  def local_temp_path(self) -> pathlib.Path:
197
271
  if self.config.local_temp_path:
@@ -241,6 +315,12 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
241
315
  audit_log_file = self.local_temp_path / "audit_log.sqlite"
242
316
  use_cached_audit_log = audit_log_file.exists()
243
317
 
318
+ if self.config.local_temp_path is None:
319
+ self._exit_stack.callback(lambda: audit_log_file.unlink(missing_ok=True))
320
+
321
+ shared_connection = self._exit_stack.enter_context(
322
+ ConnectionWrapper(audit_log_file)
323
+ )
244
324
  queries: FileBackedList[
245
325
  Union[
246
326
  KnownLineageMapping,
@@ -248,44 +328,73 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
248
328
  TableRename,
249
329
  TableSwap,
250
330
  ObservedQuery,
331
+ StoredProcCall,
251
332
  ]
252
- ]
333
+ ] = self._exit_stack.enter_context(FileBackedList(shared_connection))
334
+
253
335
  if use_cached_audit_log:
254
- logger.info("Using cached audit log")
255
- shared_connection = ConnectionWrapper(audit_log_file)
256
- queries = FileBackedList(shared_connection)
336
+ logger.info(f"Using cached audit log at {audit_log_file}")
257
337
  else:
258
- audit_log_file.unlink(missing_ok=True)
338
+ # Check if any query-based features are enabled before fetching
339
+ needs_query_data = any(
340
+ [
341
+ self.config.include_lineage,
342
+ self.config.include_queries,
343
+ self.config.include_usage_statistics,
344
+ self.config.include_query_usage_statistics,
345
+ self.config.include_operations,
346
+ ]
347
+ )
259
348
 
260
- shared_connection = ConnectionWrapper(audit_log_file)
261
- queries = FileBackedList(shared_connection)
262
- entry: Union[
263
- KnownLineageMapping,
264
- PreparsedQuery,
265
- TableRename,
266
- TableSwap,
267
- ObservedQuery,
268
- ]
349
+ if not needs_query_data:
350
+ logger.info(
351
+ "All query-based features are disabled. Skipping expensive query log fetch."
352
+ )
353
+ else:
354
+ logger.info(f"Fetching audit log into {audit_log_file}")
355
+
356
+ with self.report.copy_history_fetch_timer:
357
+ for copy_entry in self.fetch_copy_history():
358
+ queries.append(copy_entry)
269
359
 
270
- with self.report.copy_history_fetch_timer:
271
- for entry in self.fetch_copy_history():
272
- queries.append(entry)
360
+ with self.report.query_log_fetch_timer:
361
+ for entry in self.fetch_query_log(users):
362
+ queries.append(entry)
273
363
 
274
- with self.report.query_log_fetch_timer:
275
- for entry in self.fetch_query_log(users):
276
- queries.append(entry)
364
+ stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
365
+ StoredProcLineageTracker(
366
+ platform=self.identifiers.platform,
367
+ shared_connection=shared_connection,
368
+ )
369
+ )
370
+ self.report.stored_proc_lineage = stored_proc_tracker.report
277
371
 
278
372
  with self.report.audit_log_load_timer:
279
373
  for i, query in enumerate(queries):
280
374
  if i % 1000 == 0:
281
375
  logger.info(f"Added {i} query log entries to SQL aggregator")
282
- self.aggregator.add(query)
283
376
 
284
- yield from auto_workunit(self.aggregator.gen_metadata())
285
- if not use_cached_audit_log:
286
- queries.close()
287
- shared_connection.close()
288
- audit_log_file.unlink(missing_ok=True)
377
+ if isinstance(query, StoredProcCall):
378
+ stored_proc_tracker.add_stored_proc_call(query)
379
+ continue
380
+
381
+ if not (
382
+ isinstance(query, PreparsedQuery)
383
+ and stored_proc_tracker.add_related_query(query)
384
+ ):
385
+ # Only add to aggregator if it's not part of a stored procedure.
386
+ self.aggregator.add(query)
387
+
388
+ # Generate and add stored procedure lineage entries.
389
+ for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
390
+ # TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
391
+ self.aggregator.add(lineage_entry)
392
+
393
+ with self.report.aggregator_generate_timer:
394
+ yield from auto_workunit(self.aggregator.gen_metadata())
395
+
396
+ # Update the stateful ingestion state after successful extraction
397
+ self._update_state()
289
398
 
290
399
  def fetch_users(self) -> UsersMapping:
291
400
  users: UsersMapping = dict()
@@ -310,8 +419,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
310
419
  # Derived from _populate_external_lineage_from_copy_history.
311
420
 
312
421
  query: str = SnowflakeQuery.copy_lineage_history(
313
- start_time_millis=int(self.config.window.start_time.timestamp() * 1000),
314
- end_time_millis=int(self.config.window.end_time.timestamp() * 1000),
422
+ start_time_millis=int(self.start_time.timestamp() * 1000),
423
+ end_time_millis=int(self.end_time.timestamp() * 1000),
315
424
  downstreams_deny_pattern=self.config.temporary_tables_pattern,
316
425
  )
317
426
 
@@ -342,13 +451,23 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
342
451
 
343
452
  def fetch_query_log(
344
453
  self, users: UsersMapping
345
- ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
346
- query_log_query = _build_enriched_query_log_query(
347
- start_time=self.config.window.start_time,
348
- end_time=self.config.window.end_time,
454
+ ) -> Iterable[
455
+ Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
456
+ ]:
457
+ query_log_query = QueryLogQueryBuilder(
458
+ start_time=self.start_time,
459
+ end_time=self.end_time,
349
460
  bucket_duration=self.config.window.bucket_duration,
350
461
  deny_usernames=self.config.pushdown_deny_usernames,
351
- )
462
+ allow_usernames=self.config.pushdown_allow_usernames,
463
+ dedup_strategy=self.config.query_dedup_strategy,
464
+ database_pattern=self.filters.filter_config.database_pattern
465
+ if self.config.push_down_database_pattern_access_history
466
+ else None,
467
+ additional_database_names=self.config.additional_database_names_allowlist
468
+ if self.config.push_down_database_pattern_access_history
469
+ else None,
470
+ ).build_enriched_query_log_query()
352
471
 
353
472
  with self.structured_reporter.report_exc(
354
473
  "Error fetching query log from Snowflake"
@@ -373,9 +492,18 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
373
492
  if entry:
374
493
  yield entry
375
494
 
495
@classmethod
def _has_temp_keyword(cls, query_text: str) -> bool:
    """Tell whether TEMP or TEMPORARY appears as a whole word in ``query_text``.

    The match is case-insensitive and purely textual: any standalone
    occurrence of either word counts, wherever it sits in the statement.
    """
    keyword_patterns = (r"\bTEMP\b", r"\bTEMPORARY\b")
    return any(
        re.search(keyword, query_text, re.IGNORECASE) is not None
        for keyword in keyword_patterns
    )
501
+
376
502
  def _parse_audit_log_row(
377
503
  self, row: Dict[str, Any], users: UsersMapping
378
- ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
504
+ ) -> Optional[
505
+ Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
506
+ ]:
379
507
  json_fields = {
380
508
  "DIRECT_OBJECTS_ACCESSED",
381
509
  "OBJECTS_MODIFIED",
@@ -389,6 +517,16 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
389
517
  key = key.lower()
390
518
  res[key] = value
391
519
 
520
+ timestamp: datetime = res["query_start_time"]
521
+ timestamp = timestamp.astimezone(timezone.utc)
522
+
523
+ # TODO need to map snowflake query types to ours
524
+ query_text: str = res["query_text"]
525
+ snowflake_query_type: str = res["query_type"]
526
+ query_type: QueryType = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
527
+ snowflake_query_type, QueryType.UNKNOWN
528
+ )
529
+
392
530
  direct_objects_accessed = res["direct_objects_accessed"]
393
531
  objects_modified = res["objects_modified"]
394
532
  object_modified_by_ddl = res["object_modified_by_ddl"]
@@ -399,11 +537,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
399
537
  "Error fetching ddl lineage from Snowflake"
400
538
  ):
401
539
  known_ddl_entry = self.parse_ddl_query(
402
- res["query_text"],
540
+ query_text,
403
541
  res["session_id"],
404
- res["query_start_time"],
542
+ timestamp,
405
543
  object_modified_by_ddl,
406
- res["query_type"],
544
+ snowflake_query_type,
407
545
  )
408
546
  if known_ddl_entry:
409
547
  return known_ddl_entry
@@ -418,26 +556,62 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
418
556
  res["user_name"], users.get(res["user_name"])
419
557
  )
420
558
  )
559
+ extra_info = {
560
+ "snowflake_query_id": res["query_id"],
561
+ "snowflake_root_query_id": res["root_query_id"],
562
+ "snowflake_query_type": res["query_type"],
563
+ "snowflake_role_name": res["role_name"],
564
+ "query_duration": res["query_duration"],
565
+ "rows_inserted": res["rows_inserted"],
566
+ "rows_updated": res["rows_updated"],
567
+ "rows_deleted": res["rows_deleted"],
568
+ }
569
+
570
+ # There are a couple cases when we'd want to prefer our own SQL parsing
571
+ # over Snowflake's metadata.
572
+ # 1. For queries that use a stream, objects_modified returns $SYS_VIEW_X with no mapping.
573
+ # We can check direct_objects_accessed to see if there is a stream used, and if so,
574
+ # prefer doing SQL parsing over Snowflake's metadata.
575
+ # 2. For queries that create a view, objects_modified is empty and object_modified_by_ddl
576
+ # contains the view name and columns. Because `object_modified_by_ddl` doesn't contain
577
+ # source columns e.g. lineage information, we must do our own SQL parsing. We're mainly
578
+ # focused on temporary views. It's fine if we parse a couple extra views, but in general
579
+ # we want view definitions to come from Snowflake's schema metadata and not from query logs.
421
580
 
422
- # Use direct_objects_accessed instead objects_modified
423
- # objects_modified returns $SYS_VIEW_X with no mapping
424
581
  has_stream_objects = any(
425
582
  obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
426
583
  )
584
+ is_create_view = query_type == QueryType.CREATE_VIEW
585
+ is_create_temp_view = is_create_view and self._has_temp_keyword(query_text)
586
+
587
+ if has_stream_objects or is_create_temp_view:
588
+ if has_stream_objects:
589
+ self.report.num_stream_queries_observed += 1
590
+ elif is_create_temp_view:
591
+ self.report.num_create_temp_view_queries_observed += 1
427
592
 
428
- # If a stream is used, default to query parsing.
429
- if has_stream_objects:
430
- logger.debug("Found matching stream object")
431
593
  return ObservedQuery(
432
- query=res["query_text"],
594
+ query=query_text,
433
595
  session_id=res["session_id"],
434
- timestamp=res["query_start_time"].astimezone(timezone.utc),
596
+ timestamp=timestamp,
435
597
  user=user,
436
598
  default_db=res["default_db"],
437
599
  default_schema=res["default_schema"],
438
600
  query_hash=get_query_fingerprint(
439
- res["query_text"], self.identifiers.platform, fast=True
601
+ query_text, self.identifiers.platform, fast=True
440
602
  ),
603
+ extra_info=extra_info,
604
+ )
605
+
606
+ if snowflake_query_type == "CALL" and res["root_query_id"] is None:
607
+ return StoredProcCall(
608
+ # This is the top-level query ID that other entries will reference.
609
+ snowflake_root_query_id=res["query_id"],
610
+ query_text=query_text,
611
+ timestamp=timestamp,
612
+ user=user,
613
+ default_db=res["default_db"],
614
+ default_schema=res["default_schema"],
441
615
  )
442
616
 
443
617
  upstreams = []
@@ -502,22 +676,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
502
676
  )
503
677
  )
504
678
 
505
- timestamp: datetime = res["query_start_time"]
506
- timestamp = timestamp.astimezone(timezone.utc)
507
-
508
- # TODO need to map snowflake query types to ours
509
- query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get(
510
- res["query_type"], QueryType.UNKNOWN
511
- )
512
-
513
679
  entry = PreparsedQuery(
514
680
  # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
515
681
  # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
516
682
  # here
517
683
  query_id=get_query_fingerprint(
518
- res["query_text"], self.identifiers.platform, fast=True
684
+ query_text,
685
+ self.identifiers.platform,
686
+ fast=True,
687
+ secondary_id=res["query_secondary_fingerprint"],
519
688
  ),
520
- query_text=res["query_text"],
689
+ query_text=query_text,
521
690
  upstreams=upstreams,
522
691
  downstream=downstream,
523
692
  column_lineage=column_lineage,
@@ -529,6 +698,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
529
698
  timestamp=timestamp,
530
699
  session_id=res["session_id"],
531
700
  query_type=query_type,
701
+ extra_info=extra_info,
532
702
  )
533
703
  return entry
534
704
 
@@ -540,7 +710,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
540
710
  object_modified_by_ddl: dict,
541
711
  query_type: str,
542
712
  ) -> Optional[Union[TableRename, TableSwap]]:
543
- timestamp = timestamp.astimezone(timezone.utc)
544
713
  if (
545
714
  object_modified_by_ddl["operationType"] == "ALTER"
546
715
  and query_type == "RENAME_TABLE"
@@ -582,6 +751,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
582
751
  self._exit_stack.close()
583
752
 
584
753
 
754
+ @support_status(SupportStatus.CERTIFIED)
755
+ @config_class(SnowflakeQueriesSourceConfig)
585
756
  class SnowflakeQueriesSource(Source):
586
757
  def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig):
587
758
  self.ctx = ctx
@@ -626,59 +797,288 @@ class SnowflakeQueriesSource(Source):
626
797
  def close(self) -> None:
627
798
  self.connection.close()
628
799
  self.queries_extractor.close()
800
+ super().close()
801
+
802
+
803
+ class QueryLogQueryBuilder:
804
def __init__(
    self,
    start_time: datetime,
    end_time: datetime,
    bucket_duration: BucketDuration,
    deny_usernames: Optional[List[str]] = None,
    allow_usernames: Optional[List[str]] = None,
    max_tables_per_query: int = 20,
    dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
    database_pattern: Optional[AllowDenyPattern] = None,
    additional_database_names: Optional[List[str]] = None,
):
    """Capture the inputs of the query-log SQL and pre-render its filter clauses.

    Args:
        start_time: Inclusive lower bound of the query window.
        end_time: Exclusive upper bound of the query window.
        bucket_duration: Bucket size; its ``value`` must be HOUR, DAY or MONTH.
        deny_usernames: Username patterns handed to ``_build_user_filter``.
        allow_usernames: Username patterns handed to ``_build_user_filter``.
        max_tables_per_query: Cap stored for use when slicing accessed objects.
        dedup_strategy: Selects which SQL variants the query helpers emit.
        database_pattern: Database allow/deny pattern handed to
            ``_build_access_history_database_filter_condition``.
        additional_database_names: Extra database names handed to
            ``_build_access_history_database_filter_condition``.
    """
    # The window is kept both as datetimes and as epoch milliseconds.
    self.start_time, self.end_time = start_time, end_time
    self.start_time_millis = int(start_time.timestamp() * 1000)
    self.end_time_millis = int(end_time.timestamp() * 1000)

    self.max_tables_per_query = max_tables_per_query
    self.dedup_strategy = dedup_strategy

    # Both filters are rendered once here and reused as ready-made SQL text.
    self.users_filter = self._build_user_filter(deny_usernames, allow_usernames)
    db_filter = self._build_access_history_database_filter_condition(
        database_pattern, additional_database_names
    )
    self.access_history_database_filter = db_filter

    bucket_name = bucket_duration.value
    self.time_bucket_size = bucket_name
    assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
833
+
834
def _build_user_filter(
    self,
    deny_usernames: Optional[List[str]] = None,
    allow_usernames: Optional[List[str]] = None,
) -> str:
    """
    Build user filter SQL condition based on deny and allow username patterns.

    Every pattern is embedded as a single-quoted SQL string literal and
    compared against ``user_name`` with ILIKE. Deny patterns are AND-ed (the
    user must match none of them), allow patterns are OR-ed (the user must
    match at least one), and the two groups are AND-ed together.

    Args:
        deny_usernames: List of username patterns to exclude (SQL LIKE patterns)
        allow_usernames: List of username patterns to include (SQL LIKE patterns)

    Returns:
        SQL WHERE condition string for filtering users, or "TRUE" when neither
        list contains a pattern
    """

    def _as_sql_literal(pattern: str) -> str:
        # Snowflake treats backslash as an escape character inside
        # single-quoted string constants, so it has to be doubled as well as
        # the quote itself; otherwise a pattern ending in a backslash (or
        # containing \') would terminate or corrupt the literal.
        # Backslashes are handled first so the doubled quotes stay intact.
        escaped = pattern.replace("\\", "\\\\").replace("'", "''")
        return f"'{escaped}'"

    user_filters = []

    if deny_usernames:
        deny_clause = " AND ".join(
            f"user_name NOT ILIKE {_as_sql_literal(pattern)}"
            for pattern in deny_usernames
        )
        user_filters.append(f"({deny_clause})")

    if allow_usernames:
        allow_clause = " OR ".join(
            f"user_name ILIKE {_as_sql_literal(pattern)}"
            for pattern in allow_usernames
        )
        user_filters.append(f"({allow_clause})")

    return " AND ".join(user_filters) if user_filters else "TRUE"
870
+
871
+ def _build_access_history_database_filter_condition(
872
+ self,
873
+ database_pattern: Optional[AllowDenyPattern],
874
+ additional_database_names: Optional[List[str]] = None,
875
+ ) -> str:
876
+ """
877
+ Build a SQL WHERE condition for database filtering in access_history based on AllowDenyPattern.
878
+
879
+ IMPORTANT: This function handles the fundamental difference between DML and DDL operations in Snowflake's
880
+ access_history table:
881
+
882
+ - DML Operations (SELECT, INSERT, UPDATE, DELETE, etc.): Store accessed/modified objects in the
883
+ `direct_objects_accessed` and `objects_modified` arrays
884
+ - DDL Operations (CREATE, ALTER, DROP, RENAME, etc.): Store modified objects in the
885
+ `object_modified_by_ddl` field (single object, not an array)
886
+
887
+ Without checking `object_modified_by_ddl`, DDL operations like "ALTER TABLE person_info RENAME TO person_info_final"
888
+ would be incorrectly filtered out because they don't populate the DML arrays, causing missing lineage
889
+ and operational metadata.
890
+
891
+ Filtering Logic:
892
+ A query is included if it matches:
893
+ - Any database name in additional_database_names (exact match), OR
894
+ - Any database pattern in database_pattern.allow AND NOT any pattern in database_pattern.deny
895
+
896
+ Args:
897
+ database_pattern: The AllowDenyPattern configuration for database filtering
898
+ additional_database_names: Additional database names to always include (no pattern matching)
899
+
900
+ Returns:
901
+ A SQL WHERE condition string, or "TRUE" if no filtering should be applied
902
+ """
903
+ if not database_pattern and not additional_database_names:
904
+ return "TRUE"
905
+
906
+ # Build the database filter conditions
907
+ # Logic: Allow if (matches additional_database_names_allowlist) OR (matches database_pattern.allow AND NOT matches database_pattern.deny)
908
+ # Note: Using UPPER() + RLIKE for case-insensitive matching is more performant than REGEXP_LIKE with 'i' flag
909
+
910
+ # Build additional database names condition (exact matches) - these always get included
911
+ additional_db_condition = None
912
+ if additional_database_names:
913
+ additional_db_conditions = []
914
+ for db_name in additional_database_names:
915
+ # Escape single quotes
916
+ escaped_db_name = db_name.replace("'", "''")
917
+ additional_db_conditions.append(
918
+ f"SPLIT_PART(UPPER(o:objectName), '.', 1) = '{escaped_db_name.upper()}'"
919
+ )
920
+ if additional_db_conditions:
921
+ additional_db_condition = " OR ".join(additional_db_conditions)
922
+
923
+ # Build database pattern condition (allow AND NOT deny)
924
+ database_pattern_condition = None
925
+ if database_pattern:
926
+ allow_patterns = database_pattern.allow
927
+ deny_patterns = database_pattern.deny
928
+
929
+ pattern_parts = []
930
+
931
+ # Add allow patterns (if not the default "allow all")
932
+ if allow_patterns and allow_patterns != [".*"]:
933
+ allow_conditions = []
934
+ for pattern in allow_patterns:
935
+ # Escape single quotes that might be present in the regex pattern
936
+ escaped_pattern = pattern.replace("'", "''")
937
+ allow_conditions.append(
938
+ f"SPLIT_PART(UPPER(o:objectName), '.', 1) RLIKE '{escaped_pattern}'"
939
+ )
940
+ if allow_conditions:
941
+ pattern_parts.append(
942
+ allow_conditions[0]
943
+ if len(allow_conditions) == 1
944
+ else f"({' OR '.join(allow_conditions)})"
945
+ )
629
946
 
947
+ # Add deny patterns
948
+ if deny_patterns:
949
+ deny_conditions = []
950
+ for pattern in deny_patterns:
951
+ # Escape single quotes that might be present in the regex pattern
952
+ escaped_pattern = pattern.replace("'", "''")
953
+ deny_conditions.append(
954
+ f"SPLIT_PART(UPPER(o:objectName), '.', 1) NOT RLIKE '{escaped_pattern}'"
955
+ )
956
+ if deny_conditions:
957
+ pattern_parts.append(
958
+ deny_conditions[0]
959
+ if len(deny_conditions) == 1
960
+ else f"({' AND '.join(deny_conditions)})"
961
+ )
630
962
 
631
- # Make sure we don't try to generate too much info for a single query.
632
- _MAX_TABLES_PER_QUERY = 20
963
+ if pattern_parts:
964
+ database_pattern_condition = " AND ".join(pattern_parts)
633
965
 
966
+ # Combine conditions: additional_database_names OR database_pattern
967
+ filter_conditions = []
968
+ if additional_db_condition:
969
+ filter_conditions.append(
970
+ f"({additional_db_condition})"
971
+ if len(additional_db_condition.split(" OR ")) > 1
972
+ else additional_db_condition
973
+ )
974
+ if database_pattern_condition:
975
+ filter_conditions.append(
976
+ f"({database_pattern_condition})"
977
+ if len(database_pattern_condition.split(" AND ")) > 1
978
+ else database_pattern_condition
979
+ )
634
980
 
635
- def _build_enriched_query_log_query(
636
- start_time: datetime,
637
- end_time: datetime,
638
- bucket_duration: BucketDuration,
639
- deny_usernames: Optional[List[str]],
640
- ) -> str:
641
- start_time_millis = int(start_time.timestamp() * 1000)
642
- end_time_millis = int(end_time.timestamp() * 1000)
981
+ if filter_conditions:
982
+ database_filter_condition = (
983
+ filter_conditions[0]
984
+ if len(filter_conditions) == 1
985
+ else " OR ".join(filter_conditions)
986
+ )
643
987
 
644
- users_filter = ""
645
- if deny_usernames:
646
- user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
647
- users_filter = f"user_name NOT IN ({user_not_in})"
988
+ # Build a condition that checks if any objects in the arrays match the database pattern
989
+ # This implements "at least one" matching behavior: queries are allowed if they touch
990
+ # at least one database that matches the pattern, even if they also touch other databases
991
+ # Use ARRAY_SIZE with FILTER which is more compatible with Snowflake
992
+ direct_objects_condition = f"ARRAY_SIZE(FILTER(direct_objects_accessed, o -> {database_filter_condition})) > 0"
993
+ objects_modified_condition = f"ARRAY_SIZE(FILTER(objects_modified, o -> {database_filter_condition})) > 0"
994
+
995
+ # CRITICAL: Handle DDL operations by checking object_modified_by_ddl field
996
+ # DDL operations like ALTER TABLE RENAME store their data here instead of in the arrays
997
+ # We need to adapt the filter condition for a single object rather than an array
998
+ ddl_filter_condition = database_filter_condition.replace(
999
+ "o:objectName", "object_modified_by_ddl:objectName"
1000
+ )
1001
+ object_modified_by_ddl_condition = f"({ddl_filter_condition})"
648
1002
 
649
- time_bucket_size = bucket_duration.value
650
- assert time_bucket_size in ("HOUR", "DAY", "MONTH")
1003
+ return f"({direct_objects_condition} OR {objects_modified_condition} OR {object_modified_by_ddl_condition})"
1004
+ else:
1005
+ return "TRUE"
1006
+
1007
def _query_fingerprinted_queries(self):
    """Render the SELECT used as the body of the ``fingerprinted_queries`` CTE.

    The statement reads ``snowflake.account_usage.query_history`` for
    successful queries inside the configured window that pass the user
    filter, and adds two columns: ``query_fingerprint`` (Snowflake's
    parameterized hash) and ``query_secondary_fingerprint``. With the
    STANDARD strategy the secondary fingerprint hashes the project id and
    context taken from a "Hex query metadata" comment when one is present
    and is NULL otherwise; with the NONE strategy it is always NULL.

    Raises:
        NotImplementedError: for any other dedup strategy.
    """
    strategy = self.dedup_strategy
    if strategy == QueryDedupStrategyType.NONE:
        secondary_fingerprint_sql = "NULL"
    elif strategy == QueryDedupStrategyType.STANDARD:
        secondary_fingerprint_sql = """
    CASE
        WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
        -- Extract project id and hash it
        THEN CAST(HASH(
            REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
            REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
        ) AS VARCHAR)
        ELSE NULL
    END"""
    else:
        raise NotImplementedError(
            f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
        )

    # The ISO timestamps only appear in trailing SQL comments, for readability
    # of the generated statement.
    window_start_iso = self.start_time.isoformat()
    window_end_iso = self.end_time.isoformat()

    return f"""
SELECT *,
    -- TODO: Generate better fingerprints for each query by pushing down regex logic.
    query_history.query_parameterized_hash as query_fingerprint,
    -- Optional and additional hash to be used for query deduplication and final query identity
    {secondary_fingerprint_sql} as query_secondary_fingerprint
FROM
    snowflake.account_usage.query_history
WHERE
    query_history.start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {window_start_iso}
    AND query_history.start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {window_end_iso}
    AND execution_status = 'SUCCESS'
    AND {self.users_filter}"""
1038
+
1039
def _query_deduplicated_queries(self):
    """Render the SELECT used as the body of the ``deduplicated_queries`` CTE.

    Both variants read from ``fingerprinted_queries`` and add
    ``bucket_start_time`` (the UTC start time truncated to the bucket size)
    plus ``query_count``. With the STANDARD strategy, rows sharing a bucket,
    fingerprint and secondary fingerprint are counted together and only the
    one with the latest ``start_time`` is kept. With the NONE strategy every
    row is kept and ``query_count`` is 1.

    Raises:
        NotImplementedError: for any other dedup strategy.
    """
    strategy = self.dedup_strategy
    is_standard = strategy == QueryDedupStrategyType.STANDARD
    if not is_standard and not (strategy == QueryDedupStrategyType.NONE):
        raise NotImplementedError(
            f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
        )

    if is_standard:
        return f"""
SELECT
    *,
    DATE_TRUNC(
        {self.time_bucket_size},
        CONVERT_TIMEZONE('UTC', start_time)
    ) AS bucket_start_time,
    COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
FROM
    fingerprinted_queries
QUALIFY
    ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1"""

    # NONE strategy: no grouping, each query stands for itself.
    return f"""
SELECT
    *,
    DATE_TRUNC(
        {self.time_bucket_size},
        CONVERT_TIMEZONE('UTC', start_time)
    ) AS bucket_start_time,
    1 AS query_count,
FROM
    fingerprinted_queries"""
651
1068
 
652
- return f"""\
1069
+ def build_enriched_query_log_query(self) -> str:
1070
+ return f"""\
653
1071
  WITH
654
1072
  fingerprinted_queries as (
655
- SELECT *,
656
- -- TODO: Generate better fingerprints for each query by pushing down regex logic.
657
- query_history.query_parameterized_hash as query_fingerprint
658
- FROM
659
- snowflake.account_usage.query_history
660
- WHERE
661
- query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
662
- AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
663
- AND execution_status = 'SUCCESS'
664
- AND {users_filter or "TRUE"}
1073
+ {self._query_fingerprinted_queries()}
665
1074
  )
666
1075
  , deduplicated_queries as (
667
- SELECT
668
- *,
669
- DATE_TRUNC(
670
- {time_bucket_size},
671
- CONVERT_TIMEZONE('UTC', start_time)
672
- ) AS bucket_start_time,
673
- COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
674
- FROM
675
- fingerprinted_queries
676
- QUALIFY
677
- ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
1076
+ {self._query_deduplicated_queries()}
678
1077
  )
679
1078
  , raw_access_history AS (
680
1079
  SELECT
681
1080
  query_id,
1081
+ root_query_id,
682
1082
  query_start_time,
683
1083
  user_name,
684
1084
  direct_objects_accessed,
@@ -687,21 +1087,23 @@ fingerprinted_queries as (
687
1087
  FROM
688
1088
  snowflake.account_usage.access_history
689
1089
  WHERE
690
- query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
691
- AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
692
- AND {users_filter or "TRUE"}
1090
+ query_start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
1091
+ AND query_start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
1092
+ AND {self.users_filter}
693
1093
  AND query_id IN (
694
1094
  SELECT query_id FROM deduplicated_queries
695
1095
  )
1096
+ AND {self.access_history_database_filter}
696
1097
  )
697
1098
  , filtered_access_history AS (
698
1099
  -- TODO: Add table filter clause.
699
1100
  SELECT
700
1101
  query_id,
1102
+ root_query_id,
701
1103
  query_start_time,
702
1104
  ARRAY_SLICE(
703
1105
  FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
704
- 0, {_MAX_TABLES_PER_QUERY}
1106
+ 0, {self.max_tables_per_query}
705
1107
  ) as direct_objects_accessed,
706
1108
  -- TODO: Drop the columns.baseSources subfield.
707
1109
  FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified,
@@ -714,6 +1116,7 @@ fingerprinted_queries as (
714
1116
  q.bucket_start_time,
715
1117
  q.query_id,
716
1118
  q.query_fingerprint,
1119
+ q.query_secondary_fingerprint,
717
1120
  q.query_count,
718
1121
  q.session_id AS "SESSION_ID",
719
1122
  q.start_time AS "QUERY_START_TIME",
@@ -727,6 +1130,7 @@ fingerprinted_queries as (
727
1130
  q.rows_deleted AS "ROWS_DELETED",
728
1131
  q.user_name AS "USER_NAME",
729
1132
  q.role_name AS "ROLE_NAME",
1133
+ a.root_query_id,
730
1134
  a.direct_objects_accessed,
731
1135
  a.objects_modified,
732
1136
  a.object_modified_by_ddl