acryl-datahub: acryl_datahub-1.1.1rc4-py3-none-any.whl → acryl_datahub-1.3.0.1rc9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
8
8
  )
9
9
  from datahub.utilities.prefix_batch_builder import PrefixGroup
10
10
 
11
- SHOW_VIEWS_MAX_PAGE_SIZE = 10000
11
+ SHOW_COMMAND_MAX_PAGE_SIZE = 10000
12
12
  SHOW_STREAM_MAX_PAGE_SIZE = 10000
13
13
 
14
14
 
@@ -38,12 +38,23 @@ class SnowflakeQuery:
38
38
  SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(),
39
39
  SnowflakeObjectDomain.ICEBERG_TABLE.capitalize(),
40
40
  SnowflakeObjectDomain.STREAM.capitalize(),
41
+ SnowflakeObjectDomain.DYNAMIC_TABLE.capitalize(),
41
42
  }
42
43
 
43
44
  ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
44
45
  ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
45
46
  )
46
47
 
48
+ # Domains that can be downstream tables in lineage
49
+ DOWNSTREAM_TABLE_DOMAINS = {
50
+ SnowflakeObjectDomain.TABLE.capitalize(),
51
+ SnowflakeObjectDomain.DYNAMIC_TABLE.capitalize(),
52
+ }
53
+
54
+ DOWNSTREAM_TABLE_DOMAINS_FILTER = "({})".format(
55
+ ",".join(f"'{domain}'" for domain in DOWNSTREAM_TABLE_DOMAINS)
56
+ )
57
+
47
58
  @staticmethod
48
59
  def current_account() -> str:
49
60
  return "select CURRENT_ACCOUNT()"
@@ -235,7 +246,7 @@ class SnowflakeQuery:
235
246
  @staticmethod
236
247
  def show_views_for_database(
237
248
  db_name: str,
238
- limit: int = SHOW_VIEWS_MAX_PAGE_SIZE,
249
+ limit: int = SHOW_COMMAND_MAX_PAGE_SIZE,
239
250
  view_pagination_marker: Optional[str] = None,
240
251
  ) -> str:
241
252
  # While there is an information_schema.views view, that only shows the view definition if the role
@@ -244,7 +255,7 @@ class SnowflakeQuery:
244
255
 
245
256
  # SHOW VIEWS can return a maximum of 10000 rows.
246
257
  # https://docs.snowflake.com/en/sql-reference/sql/show-views#usage-notes
247
- assert limit <= SHOW_VIEWS_MAX_PAGE_SIZE
258
+ assert limit <= SHOW_COMMAND_MAX_PAGE_SIZE
248
259
 
249
260
  # To work around this, we paginate through the results using the FROM clause.
250
261
  from_clause = (
@@ -255,6 +266,33 @@ SHOW VIEWS IN DATABASE "{db_name}"
255
266
  LIMIT {limit} {from_clause};
256
267
  """
257
268
 
269
+ @staticmethod
270
+ def get_views_for_database(db_name: str) -> str:
271
+ # We've seen some issues with the `SHOW VIEWS` query,
272
+ # particularly when it requires pagination.
273
+ # This is an experimental alternative query that might be more reliable.
274
+ return f"""\
275
+ SELECT
276
+ TABLE_CATALOG as "VIEW_CATALOG",
277
+ TABLE_SCHEMA as "VIEW_SCHEMA",
278
+ TABLE_NAME as "VIEW_NAME",
279
+ COMMENT,
280
+ VIEW_DEFINITION,
281
+ CREATED,
282
+ LAST_ALTERED,
283
+ IS_SECURE
284
+ FROM "{db_name}".information_schema.views
285
+ WHERE TABLE_CATALOG = '{db_name}'
286
+ AND TABLE_SCHEMA != 'INFORMATION_SCHEMA'
287
+ """
288
+
289
+ @staticmethod
290
+ def get_views_for_schema(db_name: str, schema_name: str) -> str:
291
+ return f"""\
292
+ {SnowflakeQuery.get_views_for_database(db_name).rstrip()}
293
+ AND TABLE_SCHEMA = '{schema_name}'
294
+ """
295
+
258
296
  @staticmethod
259
297
  def get_secure_view_definitions() -> str:
260
298
  # https://docs.snowflake.com/en/sql-reference/account-usage/views
@@ -686,7 +724,7 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
686
724
  AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
687
725
  AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
688
726
  AND upstream_table_domain in {allowed_upstream_table_domains}
689
- AND downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}'
727
+ AND downstream_table_domain in {SnowflakeQuery.DOWNSTREAM_TABLE_DOMAINS_FILTER}
690
728
  {("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
691
729
  ),
692
730
  column_upstream_jobs AS (
@@ -843,7 +881,7 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
843
881
  AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
844
882
  AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
845
883
  AND upstream_table_domain in {allowed_upstream_table_domains}
846
- AND downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}'
884
+ AND downstream_table_domain in {SnowflakeQuery.DOWNSTREAM_TABLE_DOMAINS_FILTER}
847
885
  {("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
848
886
  ),
849
887
  table_upstream_jobs_unique AS (
@@ -940,3 +978,37 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
940
978
  f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
941
979
  )
942
980
  return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
981
+
982
+ @staticmethod
983
+ def show_dynamic_tables_for_database(
984
+ db_name: str,
985
+ limit: int = SHOW_COMMAND_MAX_PAGE_SIZE,
986
+ dynamic_table_pagination_marker: Optional[str] = None,
987
+ ) -> str:
988
+ """Get dynamic table definitions using SHOW DYNAMIC TABLES."""
989
+ assert limit <= SHOW_COMMAND_MAX_PAGE_SIZE
990
+
991
+ from_clause = (
992
+ f"""FROM '{dynamic_table_pagination_marker}'"""
993
+ if dynamic_table_pagination_marker
994
+ else ""
995
+ )
996
+ return f"""\
997
+ SHOW DYNAMIC TABLES IN DATABASE "{db_name}"
998
+ LIMIT {limit} {from_clause};
999
+ """
1000
+
1001
+ @staticmethod
1002
+ def get_dynamic_table_graph_history(db_name: str) -> str:
1003
+ """Get dynamic table dependency information from information schema."""
1004
+ return f"""
1005
+ SELECT
1006
+ name,
1007
+ inputs,
1008
+ target_lag_type,
1009
+ target_lag_sec,
1010
+ scheduling_state,
1011
+ alter_trigger
1012
+ FROM TABLE("{db_name}".INFORMATION_SCHEMA.DYNAMIC_TABLE_GRAPH_HISTORY())
1013
+ ORDER BY name
1014
+ """
@@ -9,7 +9,6 @@ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
9
9
  from datahub.ingestion.source.state.stateful_ingestion_base import (
10
10
  StatefulIngestionReport,
11
11
  )
12
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
13
12
  from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
14
13
  from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
15
14
  from datahub.utilities.lossy_collections import LossyDict
@@ -96,7 +95,6 @@ class SnowflakeV2Report(
96
95
  SnowflakeUsageReport,
97
96
  StatefulIngestionReport,
98
97
  ClassificationReportMixin,
99
- IngestionStageReport,
100
98
  ):
101
99
  account_locator: Optional[str] = None
102
100
  region: Optional[str] = None
@@ -128,6 +126,7 @@ class SnowflakeV2Report(
128
126
  # "Information schema query returned too much data. Please repeat query with more selective predicates.""
129
127
  # This will result in overall increase in time complexity
130
128
  num_get_tables_for_schema_queries: int = 0
129
+ num_get_views_for_schema_queries: int = 0
131
130
 
132
131
  # these will be non-zero if the user choses to enable the extract_tags = "with_lineage" option, which requires
133
132
  # individual queries per object (database, schema, table) and an extra query per table to get the tags on the columns.
@@ -1,18 +1,19 @@
1
1
  import logging
2
- import os
3
2
  from collections import defaultdict
4
3
  from dataclasses import dataclass, field
5
4
  from datetime import datetime
6
- from typing import Callable, Dict, Iterable, List, MutableMapping, Optional
5
+ from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple
7
6
 
7
+ from datahub.configuration.env_vars import get_snowflake_schema_parallelism
8
8
  from datahub.ingestion.api.report import SupportsAsObj
9
9
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
10
10
  from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
11
11
  from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection
12
12
  from datahub.ingestion.source.snowflake.snowflake_query import (
13
- SHOW_VIEWS_MAX_PAGE_SIZE,
13
+ SHOW_COMMAND_MAX_PAGE_SIZE,
14
14
  SnowflakeQuery,
15
15
  )
16
+ from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
16
17
  from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
17
18
  from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
18
19
  from datahub.utilities.file_backed_collections import FileBackedDict
@@ -21,7 +22,7 @@ from datahub.utilities.serialized_lru_cache import serialized_lru_cache
21
22
 
22
23
  logger: logging.Logger = logging.getLogger(__name__)
23
24
 
24
- SCHEMA_PARALLELISM = int(os.getenv("DATAHUB_SNOWFLAKE_SCHEMA_PARALLELISM", 20))
25
+ SCHEMA_PARALLELISM = get_snowflake_schema_parallelism()
25
26
 
26
27
 
27
28
  @dataclass
@@ -103,6 +104,17 @@ class SnowflakeTable(BaseTable):
103
104
  return DatasetSubTypes.TABLE
104
105
 
105
106
 
107
@dataclass
class SnowflakeDynamicTable(SnowflakeTable):
    """A ``SnowflakeTable`` that represents a dynamic table.

    Adds two optional fields on top of ``SnowflakeTable`` and reports the
    ``DYNAMIC_TABLE`` dataset subtype.
    """

    # SQL query that defines the dynamic table's content
    definition: Optional[str] = None
    # Refresh frequency (e.g., "1 HOUR", "30 MINUTES")
    target_lag: Optional[str] = None

    def get_subtype(self) -> DatasetSubTypes:
        """Return the dataset subtype used for dynamic tables."""
        return DatasetSubTypes.DYNAMIC_TABLE
116
+
117
+
106
118
  @dataclass
107
119
  class SnowflakeView(BaseView):
108
120
  materialized: bool = False
@@ -226,10 +238,17 @@ class _SnowflakeTagCache:
226
238
 
227
239
 
228
240
  class SnowflakeDataDictionary(SupportsAsObj):
229
- def __init__(self, connection: SnowflakeConnection) -> None:
241
+ def __init__(
242
+ self,
243
+ connection: SnowflakeConnection,
244
+ report: SnowflakeV2Report,
245
+ fetch_views_from_information_schema: bool = False,
246
+ ) -> None:
230
247
  self.connection = connection
248
+ self.report = report
249
+ self._fetch_views_from_information_schema = fetch_views_from_information_schema
231
250
 
232
- def as_obj(self) -> Dict[str, Dict[str, int]]:
251
+ def as_obj(self) -> Dict[str, Any]:
233
252
  # TODO: Move this into a proper report type that gets computed.
234
253
 
235
254
  # Reports how many times we reset in-memory `functools.lru_cache` caches of data,
@@ -245,7 +264,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
245
264
  self.get_fk_constraints_for_schema,
246
265
  ]
247
266
 
248
- report = {}
267
+ report: Dict[str, Any] = {
268
+ "fetch_views_from_information_schema": self._fetch_views_from_information_schema,
269
+ }
249
270
  for func in lru_cache_functions:
250
271
  report[func.__name__] = func.cache_info()._asdict() # type: ignore
251
272
  return report
@@ -355,8 +376,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
355
376
  if table["TABLE_SCHEMA"] not in tables:
356
377
  tables[table["TABLE_SCHEMA"]] = []
357
378
 
379
+ is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
380
+ table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
381
+
358
382
  tables[table["TABLE_SCHEMA"]].append(
359
- SnowflakeTable(
383
+ table_cls(
360
384
  name=table["TABLE_NAME"],
361
385
  type=table["TABLE_TYPE"],
362
386
  created=table["CREATED"],
@@ -365,11 +389,15 @@ class SnowflakeDataDictionary(SupportsAsObj):
365
389
  rows_count=table["ROW_COUNT"],
366
390
  comment=table["COMMENT"],
367
391
  clustering_key=table["CLUSTERING_KEY"],
368
- is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
392
+ is_dynamic=is_dynamic,
369
393
  is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
370
394
  is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
371
395
  )
372
396
  )
397
+
398
+ # Populate dynamic table definitions
399
+ self.populate_dynamic_table_definitions(tables, db_name)
400
+
373
401
  return tables
374
402
 
375
403
  def get_tables_for_schema(
@@ -382,8 +410,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
382
410
  )
383
411
 
384
412
  for table in cur:
413
+ is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
414
+ table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
415
+
385
416
  tables.append(
386
- SnowflakeTable(
417
+ table_cls(
387
418
  name=table["TABLE_NAME"],
388
419
  type=table["TABLE_TYPE"],
389
420
  created=table["CREATED"],
@@ -392,16 +423,31 @@ class SnowflakeDataDictionary(SupportsAsObj):
392
423
  rows_count=table["ROW_COUNT"],
393
424
  comment=table["COMMENT"],
394
425
  clustering_key=table["CLUSTERING_KEY"],
395
- is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
426
+ is_dynamic=is_dynamic,
396
427
  is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
397
428
  is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
398
429
  )
399
430
  )
431
+
432
+ # Populate dynamic table definitions for just this schema
433
+ schema_tables = {schema_name: tables}
434
+ self.populate_dynamic_table_definitions(schema_tables, db_name)
435
+
400
436
  return tables
401
437
 
402
438
  @serialized_lru_cache(maxsize=1)
403
- def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]]:
404
- page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
439
+ def get_views_for_database(
440
+ self, db_name: str
441
+ ) -> Optional[Dict[str, List[SnowflakeView]]]:
442
+ if self._fetch_views_from_information_schema:
443
+ return self._get_views_for_database_using_information_schema(db_name)
444
+ else:
445
+ return self._get_views_for_database_using_show(db_name)
446
+
447
+ def _get_views_for_database_using_show(
448
+ self, db_name: str
449
+ ) -> Dict[str, List[SnowflakeView]]:
450
+ page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
405
451
 
406
452
  views: Dict[str, List[SnowflakeView]] = {}
407
453
 
@@ -431,10 +477,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
431
477
  SnowflakeView(
432
478
  name=view_name,
433
479
  created=view["created_on"],
434
- # last_altered=table["last_altered"],
435
480
  comment=view["comment"],
436
481
  view_definition=view["text"],
437
- last_altered=view["created_on"],
482
+ last_altered=view["created_on"], # TODO: This is not correct.
438
483
  materialized=(
439
484
  view.get("is_materialized", "false").lower() == "true"
440
485
  ),
@@ -449,6 +494,163 @@ class SnowflakeDataDictionary(SupportsAsObj):
449
494
  )
450
495
  view_pagination_marker = view_name
451
496
 
497
+ # Because this is in a cached function, this will only log once per database.
498
+ view_counts = {schema_name: len(views[schema_name]) for schema_name in views}
499
+ logger.info(
500
+ f"Finished fetching views in {db_name}; counts by schema {view_counts}"
501
+ )
502
+ return views
503
+
504
def _map_view(self, db_name: str, row: Dict[str, Any]) -> Tuple[str, SnowflakeView]:
    """Convert one row of a views query into a ``SnowflakeView``.

    Args:
        db_name: Database the row belongs to; only used in the log message.
        row: Result row. ``VIEW_SCHEMA``, ``VIEW_NAME``, ``CREATED``,
            ``COMMENT`` and ``LAST_ALTERED`` are required keys;
            ``VIEW_DEFINITION`` and ``IS_SECURE`` are optional.

    Returns:
        A ``(schema_name, view)`` tuple, where ``schema_name`` is the row's
        ``VIEW_SCHEMA`` value.

    Raises:
        KeyError: If one of the required keys is missing from ``row``.
    """
    schema_name = row["VIEW_SCHEMA"]
    view_definition = row.get("VIEW_DEFINITION")
    # Only the first 50 characters of the definition are logged.
    fragment_view_definition = (
        view_definition[:50].strip() if view_definition else None
    )
    logger.info(
        f"Mapping view {db_name}.{schema_name}.{row['VIEW_NAME']} with view definition: {fragment_view_definition}..."
    )

    # The flag may be missing, NULL, or a textual value. Calling .lower()
    # directly on the raw value fails for NULL, and comparing only against
    # "true" ignores a "YES" flag.
    # NOTE(review): INFORMATION_SCHEMA.VIEWS appears to report IS_SECURE as
    # YES/NO - confirm against the query that feeds this method.
    is_secure = str(row.get("IS_SECURE") or "").strip().lower() in ("true", "yes")

    return schema_name, SnowflakeView(
        name=row["VIEW_NAME"],
        created=row["CREATED"],
        comment=row["COMMENT"],
        view_definition=view_definition,
        last_altered=row["LAST_ALTERED"],
        is_secure=is_secure,
        # TODO: This doesn't work for materialized views.
        materialized=False,
    )
524
+
525
def _maybe_populate_empty_view_definitions(
    self,
    db_name: str,
    schema_name: str,
    views_with_empty_definition: List[SnowflakeView],
) -> List[SnowflakeView]:
    """Fill in missing view definitions using ``SHOW VIEWS ... LIKE`` queries.

    The view names are grouped into prefix batches via ``build_prefix_batches``
    and one ``SHOW VIEWS LIKE '<prefix>%'`` query is run per batch. Whenever a
    returned row names one of the targeted views and carries a non-empty
    ``text`` value, that view's ``view_definition`` is set in place.

    Args:
        db_name: Database containing the views.
        schema_name: Schema containing the views.
        views_with_empty_definition: Views whose definition should be looked
            up. The objects are mutated in place.

    Returns:
        The same list that was passed in (or a new empty list when it is
        empty). If a query fails, the failure is logged, the remaining batches
        are skipped, and the list is returned with whatever was filled so far.
    """
    if not views_with_empty_definition:
        return []

    view_names = [view.name for view in views_with_empty_definition]
    batches = [
        batch[0]
        for batch in build_prefix_batches(
            view_names, max_batch_size=1000, max_groups_in_batch=1
        )
        if batch
        # Skip empty batch if so, also max_groups_in_batch=1 makes it safe to access batch[0]
    ]

    view_map: Dict[str, SnowflakeView] = {
        view.name: view for view in views_with_empty_definition
    }
    views_found_count = 0

    logger.info(
        f"Fetching definitions for {len(view_map)} views in {db_name}.{schema_name} "
        f"using batched 'SHOW VIEWS ... LIKE ...' queries. Found {len(batches)} batch(es)."
    )

    for batch_index, prefix_group in enumerate(batches):
        # The prefix is placed inside a single-quoted SQL literal, so any
        # single quote in it is doubled; otherwise a view name containing a
        # quote would terminate the literal and make the statement invalid.
        like_prefix = str(prefix_group.prefix).replace("'", "''")
        query = f'SHOW VIEWS LIKE \'{like_prefix}%\' IN SCHEMA "{db_name}"."{schema_name}"'
        logger.info(f"Processing batch {batch_index + 1}/{len(batches)}: {query}")

        try:
            cur = self.connection.query(query)
            for row in cur:
                view_name = row["name"]
                # The LIKE prefix can match views that were not targeted.
                if view_name in view_map:
                    view_definition = row.get("text")
                    if view_definition:  # Ensure definition is not None or empty
                        view_map[view_name].view_definition = view_definition
                        views_found_count += 1
                        logger.debug(
                            f"Fetched view definition for {db_name}.{schema_name}.{view_name}"
                        )
                        # If all targeted views are found, we could theoretically break early,
                        # but SHOW VIEWS doesn't guarantee order, so we must process all results.
                    else:
                        logger.warning(
                            f"'text' field missing or empty in SHOW VIEWS result for {db_name}.{schema_name}.{view_name}"
                        )

        except Exception as e:
            logger.error(
                f"Failed to execute query for batch {batch_index + 1} ('{query}') for {db_name}.{schema_name} or process its results.",
                exc_info=e,
            )
            # Returning the original list; some views might still be missing definitions.
            # This also means subsequent batches for this schema (in this call) are skipped.
            return views_with_empty_definition

    logger.info(
        f"Finished processing 'SHOW VIEWS' batches for {db_name}.{schema_name}. "
        f"Fetched definitions for {views_found_count} out of {len(view_map)} targeted views."
    )

    if views_found_count < len(view_map):
        missing_count = len(view_map) - views_found_count
        logger.warning(
            f"Could not fetch definitions for {missing_count} views in {db_name}.{schema_name} after processing all batches."
        )
    # The SnowflakeView objects in the original list were modified in place via view_map
    return views_with_empty_definition
598
+
599
def _get_views_for_database_using_information_schema(
    self, db_name: str
) -> Optional[Dict[str, List[SnowflakeView]]]:
    """Fetch all views of a database with the ``get_views_for_database`` query.

    Rows are mapped with ``_map_view``. Views that come back without a
    definition are collected per schema, passed through
    ``_maybe_populate_empty_view_definitions``, and appended after that
    schema's other views.

    Args:
        db_name: Database to list views for.

    Returns:
        Views grouped by schema name, or ``None`` when the query itself raises.
    """
    try:
        rows = self.connection.query(SnowflakeQuery.get_views_for_database(db_name))
    except Exception as e:
        logger.debug(f"Failed to get all views for database {db_name}", exc_info=e)
        # Error - Information schema query returned too much data. Please repeat query with more selective predicates.
        return None

    views_by_schema: Dict[str, List[SnowflakeView]] = {}
    pending_by_schema: Dict[str, List[SnowflakeView]] = {}

    for row in rows:
        schema_name, view = self._map_view(db_name, row)
        # Route the view to the bucket matching whether it has a definition.
        if view.view_definition in (None, ""):
            bucket = pending_by_schema
        else:
            bucket = views_by_schema
        bucket.setdefault(schema_name, []).append(view)

    for schema_name, pending_views in pending_by_schema.items():
        resolved_views = self._maybe_populate_empty_view_definitions(
            db_name, schema_name, pending_views
        )
        views_by_schema.setdefault(schema_name, []).extend(resolved_views)

    return views_by_schema
628
+
629
def get_views_for_schema_using_information_schema(
    self, *, schema_name: str, db_name: str
) -> List[SnowflakeView]:
    """Fetch the views of one schema with the ``get_views_for_schema`` query.

    Rows are mapped with ``_map_view``. Views that come back without a
    definition are passed through ``_maybe_populate_empty_view_definitions``
    and appended after the views that already had one.

    Args:
        schema_name: Schema to list views for.
        db_name: Database that contains the schema.

    Returns:
        The views of the schema. Errors raised by the query are not caught
        here.
    """
    cur = self.connection.query(
        SnowflakeQuery.get_views_for_schema(
            db_name=db_name, schema_name=schema_name
        ),
    )

    views: List[SnowflakeView] = []
    views_with_empty_definition: List[SnowflakeView] = []

    for row in cur:
        # The schema reported by the row is deliberately discarded so the
        # `schema_name` argument is not rebound; the follow-up lookup below
        # must use the schema the caller asked for.
        _, view = self._map_view(db_name, row)
        if view.view_definition is None or view.view_definition == "":
            views_with_empty_definition.append(view)
        else:
            views.append(view)

    if views_with_empty_definition:
        updated_empty_views = self._maybe_populate_empty_view_definitions(
            db_name, schema_name, views_with_empty_definition
        )
        views.extend(updated_empty_views)

    return views
453
655
 
454
656
  @serialized_lru_cache(maxsize=SCHEMA_PARALLELISM)
@@ -660,7 +862,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
660
862
  def get_streams_for_database(
661
863
  self, db_name: str
662
864
  ) -> Dict[str, List[SnowflakeStream]]:
663
- page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
865
+ page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
664
866
 
665
867
  streams: Dict[str, List[SnowflakeStream]] = {}
666
868
 
@@ -743,3 +945,137 @@ class SnowflakeDataDictionary(SupportsAsObj):
743
945
  )
744
946
  )
745
947
  return procedures
948
+
949
@serialized_lru_cache(maxsize=1)
def get_dynamic_table_graph_info(self, db_name: str) -> Dict[str, Dict[str, Any]]:
    """Collect graph history details for the dynamic tables of a database.

    Runs the ``get_dynamic_table_graph_history`` query and stores, under each
    row's ``NAME`` value, a dict with the keys ``inputs``, ``target_lag_type``,
    ``target_lag_sec``, ``scheduling_state`` and ``alter_trigger``.

    Args:
        db_name: Database to query.

    Returns:
        Mapping from the row's ``NAME`` value to its details. If anything
        raises, a warning is added to the report and the entries gathered so
        far (possibly none) are returned.
    """
    # Output keys; each one is read from the row under its upper-cased name.
    detail_keys = (
        "inputs",
        "target_lag_type",
        "target_lag_sec",
        "scheduling_state",
        "alter_trigger",
    )
    info_by_name: Dict[str, Dict[str, Any]] = {}
    try:
        rows = self.connection.query(
            SnowflakeQuery.get_dynamic_table_graph_history(db_name)
        )
        for row in rows:
            info_by_name[row["NAME"]] = {
                key: row.get(key.upper()) for key in detail_keys
            }
        logger.debug(
            f"Successfully retrieved graph info for {len(info_by_name)} dynamic tables in {db_name}"
        )
    except Exception as e:
        self.report.warning(
            "Failed to get dynamic table graph history",
            db_name,
            exc=e,
        )

    return info_by_name
977
+
978
@serialized_lru_cache(maxsize=1)
def get_dynamic_tables_with_definitions(
    self, db_name: str
) -> Dict[str, List[SnowflakeDynamicTable]]:
    """Get dynamic tables with their definitions using SHOW DYNAMIC TABLES.

    Results are fetched page by page (``SHOW_COMMAND_MAX_PAGE_SIZE`` rows per
    page); when a page comes back full, the last table name seen is used as
    the marker for the next page. Each row becomes a ``SnowflakeDynamicTable``
    whose definition is the row's ``text`` value and whose target lag is the
    row's ``target_lag`` value or, when that is empty, a value built from
    ``get_dynamic_table_graph_info``.

    Args:
        db_name: Database to list dynamic tables for.

    Returns:
        Dynamic tables grouped by schema name. If a page fails, the error is
        logged at debug level and the tables gathered so far are returned.
    """
    page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
    dynamic_tables: Dict[str, List[SnowflakeDynamicTable]] = {}

    # Get graph/dependency information (pass db_name)
    dt_graph_info = self.get_dynamic_table_graph_info(db_name)

    first_iteration = True
    dt_pagination_marker: Optional[str] = None

    while first_iteration or dt_pagination_marker is not None:
        try:
            cur = self.connection.query(
                SnowflakeQuery.show_dynamic_tables_for_database(
                    db_name,
                    limit=page_limit,
                    dynamic_table_pagination_marker=dt_pagination_marker,
                )
            )

            first_iteration = False
            dt_pagination_marker = None
            result_set_size = 0

            for dt in cur:
                result_set_size += 1

                dt_name = dt["name"]
                schema_name = dt["schema_name"]

                if schema_name not in dynamic_tables:
                    dynamic_tables[schema_name] = []

                # Get definition from SHOW result
                definition = dt.get("text")

                # Get target lag from SHOW result or graph info
                target_lag = dt.get("target_lag")
                if not target_lag and dt_graph_info:
                    qualified_name = f"{db_name}.{schema_name}.{dt_name}"
                    # The graph info is keyed by the NAME column of the graph
                    # history rows. Try the qualified name first, then the
                    # bare table name, so the fallback works either way.
                    # NOTE(review): NAME presumably holds the unqualified
                    # table name - confirm against Snowflake's docs.
                    graph_info = (
                        dt_graph_info.get(qualified_name)
                        or dt_graph_info.get(dt_name)
                        or {}
                    )
                    if graph_info.get("target_lag_type") and graph_info.get(
                        "target_lag_sec"
                    ):
                        target_lag = f"{graph_info['target_lag_sec']} {graph_info['target_lag_type']}"

                dynamic_tables[schema_name].append(
                    SnowflakeDynamicTable(
                        name=dt_name,
                        created=dt["created_on"],
                        # The creation time is reused as the last-altered time.
                        last_altered=dt.get("created_on"),
                        size_in_bytes=dt.get("bytes", 0),
                        rows_count=dt.get("rows", 0),
                        comment=dt.get("comment"),
                        definition=definition,
                        target_lag=target_lag,
                        is_dynamic=True,
                        type="DYNAMIC TABLE",
                    )
                )

            # A full page means there may be more rows; continue after the
            # last table name seen.
            if result_set_size >= page_limit:
                logger.info(
                    f"Fetching next page of dynamic tables for {db_name} - after {dt_name}"
                )
                dt_pagination_marker = dt_name

        except Exception as e:
            logger.debug(
                f"Failed to get dynamic tables for database {db_name}: {e}"
            )
            break

    return dynamic_tables
1056
+
1057
def populate_dynamic_table_definitions(
    self, tables: Dict[str, List[SnowflakeTable]], db_name: str
) -> None:
    """Populate dynamic table definitions for tables that are marked as dynamic.

    For every ``SnowflakeDynamicTable`` in ``tables`` whose ``definition`` is
    ``None``, the table with the same name in the same schema is looked up in
    the result of ``get_dynamic_tables_with_definitions`` and its
    ``definition`` and ``target_lag`` are copied over. Tables without a match
    are left untouched.

    Args:
        tables: Tables grouped by schema name; matching entries are mutated in
            place.
        db_name: Database the tables belong to.

    Returns:
        None. Any exception is logged at debug level and swallowed.
    """
    try:
        # Get dynamic tables with definitions from SHOW command
        dt_with_definitions = self.get_dynamic_tables_with_definitions(db_name)

        for schema_name, table_list in tables.items():
            pending = [
                table
                for table in table_list
                if isinstance(table, SnowflakeDynamicTable)
                and table.definition is None
            ]
            if not pending:
                continue

            # Index the SHOW results by name once per schema instead of
            # rescanning the list for every table. setdefault keeps the first
            # entry for a name, matching a first-match linear search.
            show_dt_by_name: Dict[str, SnowflakeDynamicTable] = {}
            for show_dt in dt_with_definitions.get(schema_name, []):
                show_dt_by_name.setdefault(show_dt.name, show_dt)

            for table in pending:
                match = show_dt_by_name.get(table.name)
                if match is not None:
                    table.definition = match.definition
                    table.target_lag = match.target_lag
    except Exception as e:
        logger.debug(
            f"Failed to populate dynamic table definitions for {db_name}: {e}"
        )