acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -45,6 +45,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
45
45
  SnowflakeColumn,
46
46
  SnowflakeDatabase,
47
47
  SnowflakeDataDictionary,
48
+ SnowflakeDynamicTable,
48
49
  SnowflakeFK,
49
50
  SnowflakePK,
50
51
  SnowflakeSchema,
@@ -76,7 +77,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
76
77
  EXTERNAL_TABLE_DDL_LINEAGE,
77
78
  LINEAGE_EXTRACTION,
78
79
  METADATA_EXTRACTION,
79
- PROFILING,
80
+ IngestionHighStage,
80
81
  )
81
82
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
82
83
  GlobalTags,
@@ -165,8 +166,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
165
166
 
166
167
  def __init__(
167
168
  self,
168
- config: SnowflakeV2Config,
169
- report: SnowflakeV2Report,
169
+ config: SnowflakeV2Config, # FIXME: SnowflakeSummary is passing here SnowflakeSummaryConfig
170
+ report: SnowflakeV2Report, # FIXME: SnowflakeSummary is passing here SnowflakeSummaryReport
170
171
  connection: SnowflakeConnection,
171
172
  filters: SnowflakeFilter,
172
173
  identifiers: SnowflakeIdentifierBuilder,
@@ -174,6 +175,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
174
175
  profiler: Optional[SnowflakeProfiler],
175
176
  aggregator: Optional[SqlParsingAggregator],
176
177
  snowsight_url_builder: Optional[SnowsightUrlBuilder],
178
+ fetch_views_from_information_schema: bool = False,
177
179
  ) -> None:
178
180
  self.config: SnowflakeV2Config = config
179
181
  self.report: SnowflakeV2Report = report
@@ -182,7 +184,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
182
184
  self.identifiers: SnowflakeIdentifierBuilder = identifiers
183
185
 
184
186
  self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
185
- connection=self.connection
187
+ connection=self.connection,
188
+ report=self.report,
189
+ fetch_views_from_information_schema=fetch_views_from_information_schema,
186
190
  )
187
191
  self.report.data_dictionary_cache = self.data_dictionary
188
192
 
@@ -356,7 +360,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
356
360
  yield from self._process_db_schemas(snowflake_db, db_tables)
357
361
 
358
362
  if self.profiler and db_tables:
359
- with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
363
+ with self.report.new_high_stage(IngestionHighStage.PROFILING):
360
364
  yield from self.profiler.get_workunits(snowflake_db, db_tables)
361
365
 
362
366
  def _process_db_schemas(
@@ -437,13 +441,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
437
441
  tables = self.fetch_tables_for_schema(
438
442
  snowflake_schema, db_name, schema_name
439
443
  )
444
+ if self.config.include_views:
445
+ views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
446
+
447
+ if self.config.include_tables:
440
448
  db_tables[schema_name] = tables
441
449
  yield from self._process_tables(
442
450
  tables, snowflake_schema, db_name, schema_name
443
451
  )
444
452
 
445
453
  if self.config.include_views:
446
- views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
447
454
  yield from self._process_views(
448
455
  views, snowflake_schema, db_name, schema_name
449
456
  )
@@ -495,6 +502,22 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
495
502
  if self.config.include_technical_schema:
496
503
  data_reader = self.make_data_reader()
497
504
  for table in tables:
505
+ # Handle dynamic table definitions for lineage
506
+ if (
507
+ isinstance(table, SnowflakeDynamicTable)
508
+ and table.definition
509
+ and self.aggregator
510
+ ):
511
+ table_identifier = self.identifiers.get_dataset_identifier(
512
+ table.name, schema_name, db_name
513
+ )
514
+ self.aggregator.add_view_definition(
515
+ view_urn=self.identifiers.gen_dataset_urn(table_identifier),
516
+ view_definition=table.definition,
517
+ default_db=db_name,
518
+ default_schema=schema_name,
519
+ )
520
+
498
521
  table_wu_generator = self._process_table(
499
522
  table, snowflake_schema, db_name
500
523
  )
@@ -935,6 +958,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
935
958
  }
936
959
  )
937
960
 
961
+ if isinstance(table, SnowflakeDynamicTable):
962
+ if table.target_lag:
963
+ custom_properties["TARGET_LAG"] = table.target_lag
964
+
938
965
  if isinstance(table, SnowflakeView) and table.is_secure:
939
966
  custom_properties["IS_SECURE"] = "true"
940
967
 
@@ -980,7 +1007,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
980
1007
  schema_name,
981
1008
  db_name,
982
1009
  (
983
- SnowflakeObjectDomain.TABLE
1010
+ SnowflakeObjectDomain.DYNAMIC_TABLE
1011
+ if isinstance(table, SnowflakeTable) and table.is_dynamic
1012
+ else SnowflakeObjectDomain.TABLE
984
1013
  if isinstance(table, SnowflakeTable)
985
1014
  else SnowflakeObjectDomain.VIEW
986
1015
  ),
@@ -1218,7 +1247,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1218
1247
  # falling back to get tables for schema
1219
1248
  if tables is None:
1220
1249
  self.report.num_get_tables_for_schema_queries += 1
1221
- return self.data_dictionary.get_tables_for_schema(schema_name, db_name)
1250
+ return self.data_dictionary.get_tables_for_schema(
1251
+ db_name=db_name,
1252
+ schema_name=schema_name,
1253
+ )
1222
1254
 
1223
1255
  # Some schema may not have any table
1224
1256
  return tables.get(schema_name, [])
@@ -1228,8 +1260,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1228
1260
  ) -> List[SnowflakeView]:
1229
1261
  views = self.data_dictionary.get_views_for_database(db_name)
1230
1262
 
1231
- # Some schema may not have any table
1232
- return views.get(schema_name, [])
1263
+ if views is not None:
1264
+ # Some schemas may not have any views
1265
+ return views.get(schema_name, [])
1266
+
1267
+ # Usually this fails when there are too many views in the schema.
1268
+ # Fall back to per-schema queries.
1269
+ self.report.num_get_views_for_schema_queries += 1
1270
+ return self.data_dictionary.get_views_for_schema_using_information_schema(
1271
+ db_name=db_name,
1272
+ schema_name=schema_name,
1273
+ )
1233
1274
 
1234
1275
  def get_columns_for_table(
1235
1276
  self, table_name: str, snowflake_schema: SnowflakeSchema, db_name: str
@@ -20,6 +20,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema_gen import (
20
20
  SnowflakeSchemaGenerator,
21
21
  )
22
22
  from datahub.ingestion.source.snowflake.snowflake_utils import (
23
+ SnowflakeFilter,
23
24
  SnowflakeIdentifierBuilder,
24
25
  )
25
26
  from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
@@ -58,7 +59,7 @@ class SnowflakeSummaryReport(SourceReport, BaseTimeWindowReport):
58
59
 
59
60
 
60
61
  @config_class(SnowflakeSummaryConfig)
61
- @support_status(SupportStatus.INCUBATING)
62
+ @support_status(SupportStatus.CERTIFIED)
62
63
  class SnowflakeSummarySource(Source):
63
64
  def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig):
64
65
  super().__init__(ctx)
@@ -81,6 +82,11 @@ class SnowflakeSummarySource(Source):
81
82
  profiler=None,
82
83
  aggregator=None,
83
84
  snowsight_url_builder=None,
85
+ filters=SnowflakeFilter(
86
+ filter_config=self.config,
87
+ structured_reporter=self.report,
88
+ ),
89
+ fetch_views_from_information_schema=False, # we haven't enabled this config for SnowflakeSummarySource
84
90
  )
85
91
 
86
92
  # Databases.
@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
231
231
 
232
232
  with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
233
233
  for row in results:
234
- with fetch_timer.pause(), self.report.usage_aggregation.result_skip_timer as skip_timer:
234
+ with (
235
+ fetch_timer.pause(),
236
+ self.report.usage_aggregation.result_skip_timer as skip_timer,
237
+ ):
235
238
  if results.rownumber is not None and results.rownumber % 1000 == 0:
236
239
  logger.debug(f"Processing usage row number {results.rownumber}")
237
240
  logger.debug(self.report.usage_aggregation.as_string())
@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
255
258
  f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
256
259
  )
257
260
  continue
258
- with skip_timer.pause(), self.report.usage_aggregation.result_map_timer as map_timer:
261
+ with (
262
+ skip_timer.pause(),
263
+ self.report.usage_aggregation.result_map_timer as map_timer,
264
+ ):
259
265
  wu = self.build_usage_statistics_for_dataset(
260
266
  dataset_identifier, row
261
267
  )
@@ -9,6 +9,7 @@ from datahub.emitter.mce_builder import (
9
9
  from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
10
10
  from datahub.ingestion.api.source import SourceReport
11
11
  from datahub.ingestion.source.snowflake.constants import (
12
+ DEFAULT_SNOWFLAKE_DOMAIN,
12
13
  SNOWFLAKE_REGION_CLOUD_REGION_MAPPING,
13
14
  SnowflakeCloudProvider,
14
15
  SnowflakeObjectDomain,
@@ -34,16 +35,21 @@ class SnowsightUrlBuilder:
34
35
  "us-east-1",
35
36
  "eu-west-1",
36
37
  "eu-central-1",
37
- "ap-southeast-1",
38
38
  "ap-southeast-2",
39
39
  ]
40
40
 
41
41
  snowsight_base_url: str
42
42
 
43
- def __init__(self, account_locator: str, region: str, privatelink: bool = False):
43
+ def __init__(
44
+ self,
45
+ account_locator: str,
46
+ region: str,
47
+ privatelink: bool = False,
48
+ snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN,
49
+ ):
44
50
  cloud, cloud_region_id = self.get_cloud_region_from_snowflake_region_id(region)
45
51
  self.snowsight_base_url = self.create_snowsight_base_url(
46
- account_locator, cloud_region_id, cloud, privatelink
52
+ account_locator, cloud_region_id, cloud, privatelink, snowflake_domain
47
53
  )
48
54
 
49
55
  @staticmethod
@@ -52,6 +58,7 @@ class SnowsightUrlBuilder:
52
58
  cloud_region_id: str,
53
59
  cloud: str,
54
60
  privatelink: bool = False,
61
+ snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN,
55
62
  ) -> str:
56
63
  if cloud:
57
64
  url_cloud_provider_suffix = f".{cloud}"
@@ -66,8 +73,14 @@ class SnowsightUrlBuilder:
66
73
  url_cloud_provider_suffix = ""
67
74
  else:
68
75
  url_cloud_provider_suffix = f".{cloud}"
69
- if privatelink:
70
- url = f"https://app.{account_locator}.{cloud_region_id}.privatelink.snowflakecomputing.com/"
76
+ # Note: Snowsight is always accessed via the public internet (app.snowflake.com)
77
+ # even for accounts using privatelink. Privatelink only applies to database connections,
78
+ # not the Snowsight web UI.
79
+ # Standard Snowsight URL format - works for most regions
80
+ # China region may use app.snowflake.cn instead of app.snowflake.com. This is not documented, just
81
+ # guessing Based on existence of snowflake.cn domain (https://domainindex.com/domains/snowflake.cn)
82
+ if snowflake_domain == "snowflakecomputing.cn":
83
+ url = f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
71
84
  else:
72
85
  url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
73
86
  return url
@@ -93,9 +106,20 @@ class SnowsightUrlBuilder:
93
106
  table_name: str,
94
107
  schema_name: str,
95
108
  db_name: str,
96
- domain: Literal[SnowflakeObjectDomain.TABLE, SnowflakeObjectDomain.VIEW],
109
+ domain: Literal[
110
+ SnowflakeObjectDomain.TABLE,
111
+ SnowflakeObjectDomain.VIEW,
112
+ SnowflakeObjectDomain.DYNAMIC_TABLE,
113
+ ],
97
114
  ) -> Optional[str]:
98
- return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{domain}/{table_name}/"
115
+ # For dynamic tables, use the dynamic-table domain in the URL path
116
+ # Ensure only explicitly dynamic tables use dynamic-table URL path
117
+ url_domain = (
118
+ "dynamic-table"
119
+ if domain == SnowflakeObjectDomain.DYNAMIC_TABLE
120
+ else str(domain)
121
+ )
122
+ return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{url_domain}/{table_name}/"
99
123
 
100
124
  def get_external_url_for_schema(
101
125
  self, schema_name: str, db_name: str
@@ -129,6 +153,7 @@ class SnowflakeFilter:
129
153
  SnowflakeObjectDomain.MATERIALIZED_VIEW,
130
154
  SnowflakeObjectDomain.ICEBERG_TABLE,
131
155
  SnowflakeObjectDomain.STREAM,
156
+ SnowflakeObjectDomain.DYNAMIC_TABLE,
132
157
  ):
133
158
  return False
134
159
  if _is_sys_table(dataset_name):
@@ -160,7 +185,8 @@ class SnowflakeFilter:
160
185
  return False
161
186
 
162
187
  if dataset_type.lower() in {
163
- SnowflakeObjectDomain.TABLE
188
+ SnowflakeObjectDomain.TABLE,
189
+ SnowflakeObjectDomain.DYNAMIC_TABLE,
164
190
  } and not self.filter_config.table_pattern.allowed(
165
191
  _cleanup_qualified_name(dataset_name, self.structured_reporter)
166
192
  ):
@@ -325,15 +351,10 @@ class SnowflakeIdentifierBuilder:
325
351
  user_email: Optional[str],
326
352
  ) -> str:
327
353
  if user_email:
328
- return self.snowflake_identifier(
329
- user_email
330
- if self.identifier_config.email_as_user_identifier is True
331
- else user_email.split("@")[0]
332
- )
354
+ return self.snowflake_identifier(user_email)
333
355
  return self.snowflake_identifier(
334
356
  f"{user_name}@{self.identifier_config.email_domain}"
335
- if self.identifier_config.email_as_user_identifier is True
336
- and self.identifier_config.email_domain is not None
357
+ if self.identifier_config.email_domain is not None
337
358
  else user_name
338
359
  )
339
360
 
@@ -32,6 +32,7 @@ from datahub.ingestion.api.source import (
32
32
  )
33
33
  from datahub.ingestion.api.source_helpers import auto_workunit
34
34
  from datahub.ingestion.api.workunit import MetadataWorkUnit
35
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
35
36
  from datahub.ingestion.source.snowflake.constants import (
36
37
  GENERIC_PERMISSION_ERROR_KEY,
37
38
  SnowflakeEdition,
@@ -72,6 +73,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
72
73
  from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
73
74
  from datahub.ingestion.source.state.redundant_run_skip_handler import (
74
75
  RedundantLineageRunSkipHandler,
76
+ RedundantQueriesRunSkipHandler,
75
77
  RedundantUsageRunSkipHandler,
76
78
  )
77
79
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -97,7 +99,14 @@ logger: logging.Logger = logging.getLogger(__name__)
97
99
  @support_status(SupportStatus.CERTIFIED)
98
100
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
99
101
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
100
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
102
+ @capability(
103
+ SourceCapability.CONTAINERS,
104
+ "Enabled by default",
105
+ subtype_modifier=[
106
+ SourceCapabilityModifier.DATABASE,
107
+ SourceCapabilityModifier.SCHEMA,
108
+ ],
109
+ )
101
110
  @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
102
111
  @capability(
103
112
  SourceCapability.DATA_PROFILING,
@@ -118,7 +127,7 @@ logger: logging.Logger = logging.getLogger(__name__)
118
127
  )
119
128
  @capability(
120
129
  SourceCapability.DELETION_DETECTION,
121
- "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
130
+ "Enabled by default via stateful ingestion",
122
131
  supported=True,
123
132
  )
124
133
  @capability(
@@ -131,6 +140,7 @@ logger: logging.Logger = logging.getLogger(__name__)
131
140
  "Optionally enabled via `classification.enabled`",
132
141
  supported=True,
133
142
  )
143
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
134
144
  class SnowflakeV2Source(
135
145
  SnowflakeCommonMixin,
136
146
  StatefulIngestionSourceBase,
@@ -162,7 +172,11 @@ class SnowflakeV2Source(
162
172
  )
163
173
 
164
174
  # For database, schema, tables, views, etc
165
- self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
175
+ self.data_dictionary = SnowflakeDataDictionary(
176
+ connection=self.connection,
177
+ report=self.report,
178
+ fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
179
+ )
166
180
  self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
167
181
 
168
182
  self.discovered_datasets: Optional[List[str]] = None
@@ -186,6 +200,7 @@ class SnowflakeV2Source(
186
200
  ),
187
201
  generate_usage_statistics=False,
188
202
  generate_operations=False,
203
+ generate_queries=self.config.include_queries,
189
204
  format_queries=self.config.format_sql_queries,
190
205
  is_temp_table=self._is_temp_table,
191
206
  is_allowed_table=self._is_allowed_table,
@@ -193,7 +208,7 @@ class SnowflakeV2Source(
193
208
  )
194
209
  self.report.sql_aggregator = self.aggregator.report
195
210
 
196
- if self.config.include_table_lineage:
211
+ if self.config.include_table_lineage and not self.config.use_queries_v2:
197
212
  redundant_lineage_run_skip_handler: Optional[
198
213
  RedundantLineageRunSkipHandler
199
214
  ] = None
@@ -311,6 +326,7 @@ class SnowflakeV2Source(
311
326
  SourceCapability.PLATFORM_INSTANCE,
312
327
  SourceCapability.DOMAINS,
313
328
  SourceCapability.DELETION_DETECTION,
329
+ SourceCapability.TEST_CONNECTION,
314
330
  )
315
331
  ]
316
332
 
@@ -516,6 +532,7 @@ class SnowflakeV2Source(
516
532
  snowsight_url_builder=snowsight_url_builder,
517
533
  filters=self.filters,
518
534
  identifiers=self.identifiers,
535
+ fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
519
536
  )
520
537
 
521
538
  with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
@@ -573,8 +590,20 @@ class SnowflakeV2Source(
573
590
  with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
574
591
  schema_resolver = self.aggregator._schema_resolver
575
592
 
593
+ redundant_queries_run_skip_handler: Optional[
594
+ RedundantQueriesRunSkipHandler
595
+ ] = None
596
+ if self.config.enable_stateful_time_window:
597
+ redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
598
+ source=self,
599
+ config=self.config,
600
+ pipeline_name=self.ctx.pipeline_name,
601
+ run_id=self.ctx.run_id,
602
+ )
603
+
576
604
  queries_extractor = SnowflakeQueriesExtractor(
577
605
  connection=self.connection,
606
+ # TODO: this should be its own section in main recipe
578
607
  config=SnowflakeQueriesExtractorConfig(
579
608
  window=BaseTimeWindowConfig(
580
609
  start_time=self.config.start_time,
@@ -589,10 +618,15 @@ class SnowflakeV2Source(
589
618
  include_query_usage_statistics=self.config.include_query_usage_statistics,
590
619
  user_email_pattern=self.config.user_email_pattern,
591
620
  pushdown_deny_usernames=self.config.pushdown_deny_usernames,
621
+ pushdown_allow_usernames=self.config.pushdown_allow_usernames,
622
+ query_dedup_strategy=self.config.query_dedup_strategy,
623
+ push_down_database_pattern_access_history=self.config.push_down_database_pattern_access_history,
624
+ additional_database_names_allowlist=self.config.additional_database_names_allowlist,
592
625
  ),
593
626
  structured_report=self.report,
594
627
  filters=self.filters,
595
628
  identifiers=self.identifiers,
629
+ redundant_run_skip_handler=redundant_queries_run_skip_handler,
596
630
  schema_resolver=schema_resolver,
597
631
  discovered_tables=self.discovered_datasets,
598
632
  graph=self.ctx.graph,
@@ -730,6 +764,7 @@ class SnowflakeV2Source(
730
764
  # For privatelink, account identifier ends with .privatelink
731
765
  # See https://docs.snowflake.com/en/user-guide/organizations-connect.html#private-connectivity-urls
732
766
  privatelink=self.config.account_id.endswith(".privatelink"),
767
+ snowflake_domain=self.config.snowflake_domain,
733
768
  )
734
769
 
735
770
  except Exception as e:
@@ -0,0 +1,143 @@
1
+ import dataclasses
2
+ from dataclasses import dataclass
3
+ from datetime import datetime
4
+ from typing import Any, Iterable, List, Optional
5
+
6
+ from datahub.ingestion.api.closeable import Closeable
7
+ from datahub.metadata.urns import CorpUserUrn
8
+ from datahub.sql_parsing.sql_parsing_aggregator import (
9
+ PreparsedQuery,
10
+ UrnStr,
11
+ )
12
+ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
13
+ from datahub.utilities.file_backed_collections import FileBackedDict
14
+
15
+
16
+ @dataclasses.dataclass
17
+ class StoredProcCall:
18
+ snowflake_root_query_id: str
19
+
20
+ # Query text will typically be something like:
21
+ # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
22
+ query_text: str
23
+
24
+ timestamp: datetime
25
+ user: CorpUserUrn
26
+ default_db: str
27
+ default_schema: str
28
+
29
+
30
+ @dataclass
31
+ class StoredProcExecutionLineage:
32
+ call: StoredProcCall
33
+
34
+ inputs: List[UrnStr]
35
+ outputs: List[UrnStr]
36
+
37
+
38
+ @dataclass
39
+ class StoredProcLineageReport:
40
+ num_stored_proc_calls: int = 0
41
+ num_related_queries: int = 0
42
+ num_related_queries_without_proc_call: int = 0
43
+
44
+ # Incremented at generation/build time.
45
+ num_stored_proc_lineage_entries: int = 0
46
+ num_stored_proc_calls_with_no_inputs: int = 0
47
+ num_stored_proc_calls_with_no_outputs: int = 0
48
+
49
+
50
+ class StoredProcLineageTracker(Closeable):
51
+ """
52
+ Tracks table-level lineage for Snowflake stored procedures.
53
+
54
+ Stored procedures in Snowflake trigger multiple SQL queries during execution.
55
+ Snowflake assigns each stored procedure call a unique query_id and uses this as the
56
+ root_query_id for all subsequent queries executed within that procedure. This allows
57
+ us to trace which queries belong to a specific stored procedure execution and build
58
+ table-level lineage by aggregating inputs/outputs from all related queries.
59
+ """
60
+
61
+ def __init__(self, platform: str, shared_connection: Optional[Any] = None):
62
+ self.platform = platform
63
+ self.report = StoredProcLineageReport()
64
+
65
+ # { root_query_id -> StoredProcExecutionLineage }
66
+ self._stored_proc_execution_lineage: FileBackedDict[
67
+ StoredProcExecutionLineage
68
+ ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")
69
+
70
+ def add_stored_proc_call(self, call: StoredProcCall) -> None:
71
+ """Add a stored procedure call to track."""
72
+ self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
73
+ StoredProcExecutionLineage(
74
+ call=call,
75
+ # Will be populated by subsequent queries.
76
+ inputs=[],
77
+ outputs=[],
78
+ )
79
+ )
80
+ self.report.num_stored_proc_calls += 1
81
+
82
+ def add_related_query(self, query: PreparsedQuery) -> bool:
83
+ """Add a query that might be related to a stored procedure execution.
84
+
85
+ Returns True if the query was added to a stored procedure execution, False otherwise.
86
+ """
87
+ snowflake_root_query_id = (query.extra_info or {}).get(
88
+ "snowflake_root_query_id"
89
+ )
90
+
91
+ if snowflake_root_query_id:
92
+ if snowflake_root_query_id not in self._stored_proc_execution_lineage:
93
+ self.report.num_related_queries_without_proc_call += 1
94
+ return False
95
+
96
+ stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
97
+ snowflake_root_query_id
98
+ )
99
+ stored_proc_execution.inputs.extend(query.upstreams)
100
+ if query.downstream is not None:
101
+ stored_proc_execution.outputs.append(query.downstream)
102
+ self.report.num_related_queries += 1
103
+ return True
104
+
105
+ return False
106
+
107
+ def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
108
+ # For stored procedures, we can only get table-level lineage from the audit log.
109
+ # We represent these as PreparsedQuery objects for now. Eventually we'll want to
110
+ # create dataJobInputOutput lineage instead.
111
+
112
+ for stored_proc_execution in self._stored_proc_execution_lineage.values():
113
+ if not stored_proc_execution.inputs:
114
+ self.report.num_stored_proc_calls_with_no_inputs += 1
115
+ continue
116
+
117
+ if not stored_proc_execution.outputs:
118
+ self.report.num_stored_proc_calls_with_no_outputs += 1
119
+ # Still continue to generate lineage for cases where we have inputs but no outputs
120
+
121
+ for downstream in stored_proc_execution.outputs:
122
+ stored_proc_query_id = get_query_fingerprint(
123
+ stored_proc_execution.call.query_text,
124
+ self.platform,
125
+ fast=True,
126
+ secondary_id=downstream,
127
+ )
128
+
129
+ lineage_entry = PreparsedQuery(
130
+ query_id=stored_proc_query_id,
131
+ query_text=stored_proc_execution.call.query_text,
132
+ upstreams=stored_proc_execution.inputs,
133
+ downstream=downstream,
134
+ query_count=0,
135
+ user=stored_proc_execution.call.user,
136
+ timestamp=stored_proc_execution.call.timestamp,
137
+ )
138
+
139
+ self.report.num_stored_proc_lineage_entries += 1
140
+ yield lineage_entry
141
+
142
+ def close(self) -> None:
143
+ self._stored_proc_execution_lineage.close()