acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,8 @@ class SnowflakeCloudProvider(StrEnum):
9
9
 
10
10
  SNOWFLAKE_DEFAULT_CLOUD = SnowflakeCloudProvider.AWS
11
11
 
12
+ DEFAULT_SNOWFLAKE_DOMAIN = "snowflakecomputing.com"
13
+
12
14
 
13
15
  class SnowflakeEdition(StrEnum):
14
16
  STANDARD = "Standard"
@@ -54,6 +56,8 @@ class SnowflakeObjectDomain(StrEnum):
54
56
  COLUMN = "column"
55
57
  ICEBERG_TABLE = "iceberg table"
56
58
  STREAM = "stream"
59
+ PROCEDURE = "procedure"
60
+ DYNAMIC_TABLE = "dynamic table"
57
61
 
58
62
 
59
63
  GENERIC_PERMISSION_ERROR_KEY = "permission-error"
@@ -1,12 +1,13 @@
1
1
  import logging
2
2
  from collections import defaultdict
3
3
  from dataclasses import dataclass
4
+ from enum import Enum
4
5
  from typing import Dict, List, Optional, Set
5
6
 
6
7
  import pydantic
7
- from pydantic import Field, SecretStr, root_validator, validator
8
+ from pydantic import Field, root_validator, validator
8
9
 
9
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
10
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
10
11
  from datahub.configuration.pattern_utils import UUID_REGEX
11
12
  from datahub.configuration.source_common import (
12
13
  EnvConfigMixin,
@@ -22,6 +23,7 @@ from datahub.ingestion.api.incremental_properties_helper import (
22
23
  from datahub.ingestion.glossary.classification_mixin import (
23
24
  ClassificationSourceConfigMixin,
24
25
  )
26
+ from datahub.ingestion.source.snowflake.constants import SnowflakeEdition
25
27
  from datahub.ingestion.source.snowflake.snowflake_connection import (
26
28
  SnowflakeConnectionConfig,
27
29
  )
@@ -29,6 +31,7 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterCo
29
31
  from datahub.ingestion.source.state.stateful_ingestion_base import (
30
32
  StatefulLineageConfigMixin,
31
33
  StatefulProfilingConfigMixin,
34
+ StatefulTimeWindowConfigMixin,
32
35
  StatefulUsageConfigMixin,
33
36
  )
34
37
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
@@ -48,9 +51,15 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
48
51
  rf".*\.SEGMENT_{UUID_REGEX}", # segment
49
52
  rf".*\.STAGING_.*_{UUID_REGEX}", # stitch
50
53
  r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}", # great expectations
54
+ r".*\.SNOWPARK_TEMP_TABLE_.+", # snowpark
51
55
  ]
52
56
 
53
57
 
58
+ class QueryDedupStrategyType(Enum):
59
+ STANDARD = "STANDARD"
60
+ NONE = "NONE"
61
+
62
+
54
63
  class TagOption(StrEnum):
55
64
  with_lineage = "with_lineage"
56
65
  without_lineage = "without_lineage"
@@ -59,13 +68,10 @@ class TagOption(StrEnum):
59
68
 
60
69
  @dataclass(frozen=True)
61
70
  class DatabaseId:
62
- database: str = Field(
63
- description="Database created from share in consumer account."
64
- )
65
- platform_instance: Optional[str] = Field(
66
- default=None,
67
- description="Platform instance of consumer snowflake account.",
68
- )
71
+ # Database created from share in consumer account
72
+ database: str
73
+ # Platform instance of consumer snowflake account
74
+ platform_instance: Optional[str] = None
69
75
 
70
76
 
71
77
  class SnowflakeShareConfig(ConfigModel):
@@ -100,7 +106,15 @@ class SnowflakeFilterConfig(SQLFilterConfig):
100
106
 
101
107
  stream_pattern: AllowDenyPattern = Field(
102
108
  default=AllowDenyPattern.allow_all(),
103
- description="Regex patterns for streams to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
109
+ description="Regex patterns for streams to filter in ingestion. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
110
+ )
111
+
112
+ procedure_pattern: AllowDenyPattern = Field(
113
+ default=AllowDenyPattern.allow_all(),
114
+ description="Regex patterns for procedures to filter in ingestion. "
115
+ "Specify regex to match the entire procedure name in database.schema.procedure format. "
116
+ "e.g. to match all procedures starting with customer in Customer database and public schema,"
117
+ " use the regex 'Customer.public.customer.*'",
104
118
  )
105
119
 
106
120
  match_fully_qualified_names: bool = Field(
@@ -145,14 +159,11 @@ class SnowflakeIdentifierConfig(
145
159
 
146
160
  email_domain: Optional[str] = pydantic.Field(
147
161
  default=None,
148
- description="Email domain of your organization so users can be displayed on UI appropriately.",
162
+ description="Email domain of your organization so users can be displayed on UI appropriately. This is used only if we cannot infer email ID.",
149
163
  )
150
164
 
151
- email_as_user_identifier: bool = Field(
152
- default=True,
153
- description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
154
- "provided, generates email addresses for snowflake users with unset emails, based on their "
155
- "username.",
165
+ _email_as_user_identifier = pydantic_removed_field(
166
+ "email_as_user_identifier",
156
167
  )
157
168
 
158
169
 
@@ -189,6 +200,7 @@ class SnowflakeV2Config(
189
200
  SnowflakeUsageConfig,
190
201
  StatefulLineageConfigMixin,
191
202
  StatefulUsageConfigMixin,
203
+ StatefulTimeWindowConfigMixin,
192
204
  StatefulProfilingConfigMixin,
193
205
  ClassificationSourceConfigMixin,
194
206
  IncrementalPropertiesConfigMixin,
@@ -203,6 +215,16 @@ class SnowflakeV2Config(
203
215
  description="If enabled, populates the ingested views' definitions.",
204
216
  )
205
217
 
218
+ fetch_views_from_information_schema: bool = Field(
219
+ default=False,
220
+ description="If enabled, uses information_schema.views to fetch view definitions instead of SHOW VIEWS command. "
221
+ "This alternative method can be more reliable for databases with large numbers of views (> 10K views), as the "
222
+ "SHOW VIEWS approach has proven unreliable and can lead to missing views in such scenarios. However, this method "
223
+ "requires OWNERSHIP privileges on views to retrieve their definitions. For views without ownership permissions "
224
+ "(where VIEW_DEFINITION is null/empty), the system will automatically fall back to using batched SHOW VIEWS queries "
225
+ "to populate the missing definitions.",
226
+ )
227
+
206
228
  include_technical_schema: bool = Field(
207
229
  default=True,
208
230
  description="If enabled, populates the snowflake technical schema and descriptions.",
@@ -223,7 +245,7 @@ class SnowflakeV2Config(
223
245
  )
224
246
 
225
247
  use_queries_v2: bool = Field(
226
- default=False,
248
+ default=True,
227
249
  description="If enabled, uses the new queries extractor to extract queries from snowflake.",
228
250
  )
229
251
  include_queries: bool = Field(
@@ -241,6 +263,11 @@ class SnowflakeV2Config(
241
263
  "This is useful if you have a large number of schemas and want to avoid bulk fetching the schema for each table/view.",
242
264
  )
243
265
 
266
+ query_dedup_strategy: QueryDedupStrategyType = Field(
267
+ default=QueryDedupStrategyType.STANDARD,
268
+ description=f"Experimental: Choose the strategy for query deduplication (default value is appropriate for most use-cases; make sure you understand performance implications before changing it). Allowed values are: {', '.join([s.name for s in QueryDedupStrategyType])}",
269
+ )
270
+
244
271
  _check_role_grants_removed = pydantic_removed_field("check_role_grants")
245
272
  _provision_role_removed = pydantic_removed_field("provision_role")
246
273
 
@@ -254,10 +281,11 @@ class SnowflakeV2Config(
254
281
  description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
255
282
  )
256
283
 
257
- structured_properties_template_cache_invalidation_interval: int = Field(
258
- hidden_from_docs=True,
259
- default=60,
260
- description="Interval in seconds to invalidate the structured properties template cache.",
284
+ structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
285
+ Field(
286
+ default=60,
287
+ description="Interval in seconds to invalidate the structured properties template cache.",
288
+ )
261
289
  )
262
290
 
263
291
  include_external_url: bool = Field(
@@ -284,10 +312,16 @@ class SnowflakeV2Config(
284
312
  description="If enabled, streams will be ingested as separate entities from tables/views.",
285
313
  )
286
314
 
315
+ include_procedures: bool = Field(
316
+ default=True,
317
+ description="If enabled, procedures will be ingested as pipelines/tasks.",
318
+ )
319
+
287
320
  structured_property_pattern: AllowDenyPattern = Field(
288
321
  default=AllowDenyPattern.allow_all(),
289
322
  description=(
290
323
  "List of regex patterns for structured properties to include in ingestion."
324
+ " Applied to tags with form `<database>.<schema>.<tag_name>`."
291
325
  " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
292
326
  ),
293
327
  )
@@ -300,7 +334,7 @@ class SnowflakeV2Config(
300
334
  "to ignore the temporary staging tables created by known ETL tools.",
301
335
  )
302
336
 
303
- rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(
337
+ rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field( # type: ignore[pydantic-field]
304
338
  "upstreams_deny_pattern", "temporary_tables_pattern"
305
339
  )
306
340
 
@@ -312,6 +346,17 @@ class SnowflakeV2Config(
312
346
  " Map of share name -> details of share.",
313
347
  )
314
348
 
349
+ known_snowflake_edition: Optional[SnowflakeEdition] = Field(
350
+ default=None,
351
+ description="Explicitly specify the Snowflake edition (STANDARD or ENTERPRISE). If unset, the edition will be inferred automatically using 'SHOW TAGS'.",
352
+ )
353
+
354
+ # Allows empty containers to be ingested before datasets are added, avoiding permission errors
355
+ warn_no_datasets: HiddenFromDocs[bool] = Field(
356
+ default=False,
357
+ description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
358
+ )
359
+
315
360
  include_assertion_results: bool = Field(
316
361
  default=False,
317
362
  description="Whether to ingest assertion run results for assertions created using Datahub"
@@ -320,11 +365,32 @@ class SnowflakeV2Config(
320
365
 
321
366
  pushdown_deny_usernames: List[str] = Field(
322
367
  default=[],
323
- description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
368
+ description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
324
369
  "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
325
370
  "Only applicable if `use_queries_v2` is enabled.",
326
371
  )
327
372
 
373
+ pushdown_allow_usernames: List[str] = Field(
374
+ default=[],
375
+ description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
376
+ "This is primarily useful for improving performance by filtering in only specific users. "
377
+ "Only applicable if `use_queries_v2` is enabled. If not specified, all users not in deny list are included.",
378
+ )
379
+
380
+ push_down_database_pattern_access_history: bool = Field(
381
+ default=False,
382
+ description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
383
+ "This filters on the accessed objects in access_history.",
384
+ )
385
+
386
+ additional_database_names_allowlist: List[str] = Field(
387
+ default=[],
388
+ description="Additional database names (no pattern matching) to be included in the access_history filter. "
389
+ "Only applies if push_down_database_pattern_access_history=True. "
390
+ "These databases will be included in the filter being pushed down regardless of database_pattern settings."
391
+ "This may be required in the case of _eg_ temporary tables being created in a different database than the ones in the database_name patterns.",
392
+ )
393
+
328
394
  @validator("convert_urns_to_lowercase")
329
395
  def validate_convert_urns_to_lowercase(cls, v):
330
396
  if not v:
@@ -371,17 +437,6 @@ class SnowflakeV2Config(
371
437
 
372
438
  return values
373
439
 
374
- def get_sql_alchemy_url(
375
- self,
376
- database: Optional[str] = None,
377
- username: Optional[str] = None,
378
- password: Optional[SecretStr] = None,
379
- role: Optional[str] = None,
380
- ) -> str:
381
- return SnowflakeConnectionConfig.get_sql_alchemy_url(
382
- self, database=database, username=username, password=password, role=role
383
- )
384
-
385
440
  @validator("shares")
386
441
  def validate_shares(
387
442
  cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
@@ -424,6 +479,20 @@ class SnowflakeV2Config(
424
479
 
425
480
  return shares
426
481
 
482
+ @root_validator(pre=False, skip_on_failure=True)
483
+ def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
484
+ if values.get("use_queries_v2"):
485
+ if values.get("enable_stateful_lineage_ingestion") or values.get(
486
+ "enable_stateful_usage_ingestion"
487
+ ):
488
+ logger.warning(
489
+ "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
490
+ "when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
491
+ "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
492
+ "for the unified time window extraction (lineage + usage + operations + queries)."
493
+ )
494
+ return values
495
+
427
496
  def outbounds(self) -> Dict[str, Set[DatabaseId]]:
428
497
  """
429
498
  Returns mapping of
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import threading
2
3
  from typing import Any, Dict, Optional
3
4
 
4
5
  import pydantic
@@ -14,20 +15,26 @@ from snowflake.connector.network import (
14
15
  OAUTH_AUTHENTICATOR,
15
16
  )
16
17
 
17
- from datahub.configuration.common import ConfigModel, ConfigurationError, MetaError
18
+ from datahub.configuration.common import (
19
+ ConfigModel,
20
+ ConfigurationError,
21
+ HiddenFromDocs,
22
+ MetaError,
23
+ )
18
24
  from datahub.configuration.connection_resolver import auto_connection_resolver
19
25
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
20
26
  from datahub.ingestion.api.closeable import Closeable
21
27
  from datahub.ingestion.source.snowflake.constants import (
22
28
  CLIENT_PREFETCH_THREADS,
23
29
  CLIENT_SESSION_KEEP_ALIVE,
30
+ DEFAULT_SNOWFLAKE_DOMAIN,
24
31
  )
25
32
  from datahub.ingestion.source.snowflake.oauth_config import (
26
33
  OAuthConfiguration,
27
34
  OAuthIdentityProvider,
28
35
  )
29
36
  from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator
30
- from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri
37
+ from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
31
38
  from datahub.utilities.config_clean import (
32
39
  remove_protocol,
33
40
  remove_suffix,
@@ -46,8 +53,6 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
46
53
  "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
47
54
  }
48
55
 
49
- _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
50
-
51
56
 
52
57
  class SnowflakePermissionError(MetaError):
53
58
  """A permission error has happened"""
@@ -63,7 +68,7 @@ class SnowflakeConnectionConfig(ConfigModel):
63
68
  description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.",
64
69
  )
65
70
 
66
- scheme: str = "snowflake"
71
+ scheme: HiddenFromDocs[str] = "snowflake"
67
72
  username: Optional[str] = pydantic.Field(
68
73
  default=None, description="Snowflake username."
69
74
  )
@@ -109,18 +114,25 @@ class SnowflakeConnectionConfig(ConfigModel):
109
114
  default=None,
110
115
  description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
111
116
  )
117
+ snowflake_domain: str = pydantic.Field(
118
+ default=DEFAULT_SNOWFLAKE_DOMAIN,
119
+ description="Snowflake domain. Use 'snowflakecomputing.com' for most regions or 'snowflakecomputing.cn' for China (cn-northwest-1) region.",
120
+ )
112
121
 
113
122
  def get_account(self) -> str:
114
123
  assert self.account_id
115
124
  return self.account_id
116
125
 
117
- rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")
126
+ rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id") # type: ignore[pydantic-field]
118
127
 
119
128
  @pydantic.validator("account_id")
120
- def validate_account_id(cls, account_id: str) -> str:
129
+ def validate_account_id(cls, account_id: str, values: Dict) -> str:
121
130
  account_id = remove_protocol(account_id)
122
131
  account_id = remove_trailing_slashes(account_id)
123
- account_id = remove_suffix(account_id, _SNOWFLAKE_HOST_SUFFIX)
132
+ # Get the domain from config, fallback to default
133
+ domain = values.get("snowflake_domain", DEFAULT_SNOWFLAKE_DOMAIN)
134
+ snowflake_host_suffix = f".{domain}"
135
+ account_id = remove_suffix(account_id, snowflake_host_suffix)
124
136
  return account_id
125
137
 
126
138
  @pydantic.validator("authentication_type", always=True)
@@ -192,23 +204,11 @@ class SnowflakeConnectionConfig(ConfigModel):
192
204
  "but should be set when using use_certificate false for oauth_config"
193
205
  )
194
206
 
195
- def get_sql_alchemy_url(
196
- self,
197
- database: Optional[str] = None,
198
- username: Optional[str] = None,
199
- password: Optional[pydantic.SecretStr] = None,
200
- role: Optional[str] = None,
201
- ) -> str:
202
- if username is None:
203
- username = self.username
204
- if password is None:
205
- password = self.password
206
- if role is None:
207
- role = self.role
207
+ def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
208
208
  return make_sqlalchemy_uri(
209
209
  self.scheme,
210
- username,
211
- password.get_secret_value() if password else None,
210
+ self.username,
211
+ self.password.get_secret_value() if self.password else None,
212
212
  self.account_id,
213
213
  f'"{database}"' if database is not None else database,
214
214
  uri_opts={
@@ -217,7 +217,7 @@ class SnowflakeConnectionConfig(ConfigModel):
217
217
  for (key, value) in {
218
218
  "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type),
219
219
  "warehouse": self.warehouse,
220
- "role": role,
220
+ "role": self.role,
221
221
  "application": _APPLICATION_NAME,
222
222
  }.items()
223
223
  if value
@@ -322,6 +322,7 @@ class SnowflakeConnectionConfig(ConfigModel):
322
322
  warehouse=self.warehouse,
323
323
  authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
324
324
  application=_APPLICATION_NAME,
325
+ host=f"{self.account_id}.{self.snowflake_domain}",
325
326
  **connect_args,
326
327
  )
327
328
 
@@ -335,6 +336,7 @@ class SnowflakeConnectionConfig(ConfigModel):
335
336
  role=self.role,
336
337
  authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
337
338
  application=_APPLICATION_NAME,
339
+ host=f"{self.account_id}.{self.snowflake_domain}",
338
340
  **connect_args,
339
341
  )
340
342
 
@@ -348,6 +350,7 @@ class SnowflakeConnectionConfig(ConfigModel):
348
350
  warehouse=self.warehouse,
349
351
  role=self.role,
350
352
  application=_APPLICATION_NAME,
353
+ host=f"{self.account_id}.{self.snowflake_domain}",
351
354
  **connect_args,
352
355
  )
353
356
  elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
@@ -359,6 +362,7 @@ class SnowflakeConnectionConfig(ConfigModel):
359
362
  warehouse=self.warehouse,
360
363
  role=self.role,
361
364
  application=_APPLICATION_NAME,
365
+ host=f"{self.account_id}.{self.snowflake_domain}",
362
366
  **connect_args,
363
367
  )
364
368
  elif self.authentication_type == "OAUTH_AUTHENTICATOR":
@@ -374,6 +378,7 @@ class SnowflakeConnectionConfig(ConfigModel):
374
378
  role=self.role,
375
379
  authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
376
380
  application=_APPLICATION_NAME,
381
+ host=f"{self.account_id}.{self.snowflake_domain}",
377
382
  **connect_args,
378
383
  )
379
384
  else:
@@ -402,13 +407,30 @@ class SnowflakeConnection(Closeable):
402
407
  def __init__(self, connection: NativeSnowflakeConnection):
403
408
  self._connection = connection
404
409
 
410
+ self._query_num_lock = threading.Lock()
411
+ self._query_num = 1
412
+
405
413
  def native_connection(self) -> NativeSnowflakeConnection:
406
414
  return self._connection
407
415
 
416
+ def get_query_no(self) -> int:
417
+ with self._query_num_lock:
418
+ no = self._query_num
419
+ self._query_num += 1
420
+ return no
421
+
408
422
  def query(self, query: str) -> Any:
409
423
  try:
410
- logger.info(f"Query: {query}", stacklevel=2)
424
+ # We often run multiple queries in parallel across multiple threads,
425
+ # so we need to number them to help with log readability.
426
+ query_num = self.get_query_no()
427
+ logger.info(f"Query #{query_num}: {query.rstrip()}", stacklevel=2)
411
428
  resp = self._connection.cursor(DictCursor).execute(query)
429
+ if resp is not None and resp.rowcount is not None:
430
+ logger.info(
431
+ f"Query #{query_num} got {resp.rowcount} row(s) back from Snowflake",
432
+ stacklevel=2,
433
+ )
412
434
  return resp
413
435
 
414
436
  except Exception as e:
@@ -2,7 +2,17 @@ import json
2
2
  import logging
3
3
  from dataclasses import dataclass
4
4
  from datetime import datetime
5
- from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Any,
8
+ Collection,
9
+ Iterable,
10
+ List,
11
+ Optional,
12
+ Set,
13
+ Tuple,
14
+ Type,
15
+ )
6
16
 
7
17
  from pydantic import BaseModel, Field, validator
8
18
 
@@ -44,6 +54,9 @@ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
44
54
  from datahub.utilities.perf_timer import PerfTimer
45
55
  from datahub.utilities.time import ts_millis_to_datetime
46
56
 
57
+ if TYPE_CHECKING:
58
+ from pydantic.deprecated.class_validators import V1Validator
59
+
47
60
  logger: logging.Logger = logging.getLogger(__name__)
48
61
 
49
62
  EXTERNAL_LINEAGE = "external_lineage"
@@ -51,7 +64,7 @@ TABLE_LINEAGE = "table_lineage"
51
64
  VIEW_LINEAGE = "view_lineage"
52
65
 
53
66
 
54
- def pydantic_parse_json(field: str) -> classmethod:
67
+ def pydantic_parse_json(field: str) -> "V1Validator":
55
68
  def _parse_from_json(cls: Type, v: Any) -> dict:
56
69
  if isinstance(v, str):
57
70
  return json.loads(v)
@@ -72,7 +85,7 @@ class ColumnUpstreamJob(BaseModel):
72
85
 
73
86
 
74
87
  class ColumnUpstreamLineage(BaseModel):
75
- column_name: Optional[str]
88
+ column_name: Optional[str] = None
76
89
  upstreams: List[ColumnUpstreamJob] = Field(default_factory=list)
77
90
 
78
91
 
@@ -91,9 +104,9 @@ class Query(BaseModel):
91
104
  class UpstreamLineageEdge(BaseModel):
92
105
  DOWNSTREAM_TABLE_NAME: str
93
106
  DOWNSTREAM_TABLE_DOMAIN: str
94
- UPSTREAM_TABLES: Optional[List[UpstreamTableNode]]
95
- UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]]
96
- QUERIES: Optional[List[Query]]
107
+ UPSTREAM_TABLES: Optional[List[UpstreamTableNode]] = None
108
+ UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]] = None
109
+ QUERIES: Optional[List[Query]] = None
97
110
 
98
111
  _json_upstream_tables = pydantic_parse_json("UPSTREAM_TABLES")
99
112
  _json_upstream_columns = pydantic_parse_json("UPSTREAM_COLUMNS")
@@ -360,6 +373,12 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
360
373
  self, db_row: dict
361
374
  ) -> Optional[UpstreamLineageEdge]:
362
375
  try:
376
+ _queries = db_row.get("QUERIES")
377
+ if _queries == "[\n {}\n]":
378
+ # We are creating an empty object in the list when there are no queries
379
+ # To avoid that causing a pydantic error we are setting it to an empty list
380
+ # instead of a list with an empty object
381
+ db_row["QUERIES"] = "[]"
363
382
  return UpstreamLineageEdge.parse_obj(db_row)
364
383
  except Exception as e:
365
384
  self.report.num_upstream_lineage_edge_parsing_failed += 1
@@ -135,12 +135,7 @@ class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
135
135
  ) -> "DatahubGEProfiler":
136
136
  assert db_name
137
137
 
138
- url = self.config.get_sql_alchemy_url(
139
- database=db_name,
140
- username=self.config.username,
141
- password=self.config.password,
142
- role=self.config.role,
143
- )
138
+ url = self.config.get_sql_alchemy_url(database=db_name)
144
139
 
145
140
  logger.debug(f"sql_alchemy_url={url}")
146
141