acryl-datahub: 1.0.0rc18 (py3-none-any.whl) → 1.3.0.1rc9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,7 @@ import re
9
9
  from dataclasses import dataclass
10
10
  from typing import Dict, Iterable, List, Optional, Union
11
11
 
12
+ from datahub.configuration.time_window_config import BaseTimeWindowConfig
12
13
  from datahub.ingestion.api.common import PipelineContext
13
14
  from datahub.ingestion.api.decorators import (
14
15
  SupportStatus,
@@ -31,6 +32,7 @@ from datahub.ingestion.api.source import (
31
32
  )
32
33
  from datahub.ingestion.api.source_helpers import auto_workunit
33
34
  from datahub.ingestion.api.workunit import MetadataWorkUnit
35
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
34
36
  from datahub.ingestion.source.snowflake.constants import (
35
37
  GENERIC_PERMISSION_ERROR_KEY,
36
38
  SnowflakeEdition,
@@ -71,6 +73,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
71
73
  from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
72
74
  from datahub.ingestion.source.state.redundant_run_skip_handler import (
73
75
  RedundantLineageRunSkipHandler,
76
+ RedundantQueriesRunSkipHandler,
74
77
  RedundantUsageRunSkipHandler,
75
78
  )
76
79
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -96,7 +99,14 @@ logger: logging.Logger = logging.getLogger(__name__)
96
99
  @support_status(SupportStatus.CERTIFIED)
97
100
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
98
101
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
99
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
102
+ @capability(
103
+ SourceCapability.CONTAINERS,
104
+ "Enabled by default",
105
+ subtype_modifier=[
106
+ SourceCapabilityModifier.DATABASE,
107
+ SourceCapabilityModifier.SCHEMA,
108
+ ],
109
+ )
100
110
  @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
101
111
  @capability(
102
112
  SourceCapability.DATA_PROFILING,
@@ -117,7 +127,7 @@ logger: logging.Logger = logging.getLogger(__name__)
117
127
  )
118
128
  @capability(
119
129
  SourceCapability.DELETION_DETECTION,
120
- "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
130
+ "Enabled by default via stateful ingestion",
121
131
  supported=True,
122
132
  )
123
133
  @capability(
@@ -130,6 +140,7 @@ logger: logging.Logger = logging.getLogger(__name__)
130
140
  "Optionally enabled via `classification.enabled`",
131
141
  supported=True,
132
142
  )
143
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
133
144
  class SnowflakeV2Source(
134
145
  SnowflakeCommonMixin,
135
146
  StatefulIngestionSourceBase,
@@ -161,7 +172,11 @@ class SnowflakeV2Source(
161
172
  )
162
173
 
163
174
  # For database, schema, tables, views, etc
164
- self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
175
+ self.data_dictionary = SnowflakeDataDictionary(
176
+ connection=self.connection,
177
+ report=self.report,
178
+ fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
179
+ )
165
180
  self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
166
181
 
167
182
  self.discovered_datasets: Optional[List[str]] = None
@@ -185,6 +200,7 @@ class SnowflakeV2Source(
185
200
  ),
186
201
  generate_usage_statistics=False,
187
202
  generate_operations=False,
203
+ generate_queries=self.config.include_queries,
188
204
  format_queries=self.config.format_sql_queries,
189
205
  is_temp_table=self._is_temp_table,
190
206
  is_allowed_table=self._is_allowed_table,
@@ -192,7 +208,7 @@ class SnowflakeV2Source(
192
208
  )
193
209
  self.report.sql_aggregator = self.aggregator.report
194
210
 
195
- if self.config.include_table_lineage:
211
+ if self.config.include_table_lineage and not self.config.use_queries_v2:
196
212
  redundant_lineage_run_skip_handler: Optional[
197
213
  RedundantLineageRunSkipHandler
198
214
  ] = None
@@ -310,6 +326,7 @@ class SnowflakeV2Source(
310
326
  SourceCapability.PLATFORM_INSTANCE,
311
327
  SourceCapability.DOMAINS,
312
328
  SourceCapability.DELETION_DETECTION,
329
+ SourceCapability.TEST_CONNECTION,
313
330
  )
314
331
  ]
315
332
 
@@ -515,6 +532,7 @@ class SnowflakeV2Source(
515
532
  snowsight_url_builder=snowsight_url_builder,
516
533
  filters=self.filters,
517
534
  identifiers=self.identifiers,
535
+ fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
518
536
  )
519
537
 
520
538
  with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
@@ -551,11 +569,15 @@ class SnowflakeV2Source(
551
569
  and len(discovered_views) == 0
552
570
  and len(discovered_streams) == 0
553
571
  ):
554
- self.structured_reporter.failure(
555
- GENERIC_PERMISSION_ERROR_KEY,
556
- "No tables/views/streams found. Please check permissions.",
557
- )
558
- return
572
+ if self.config.warn_no_datasets:
573
+ self.structured_reporter.warning(
574
+ "No tables/views/streams found. Verify dataset permissions if Snowflake source is not empty.",
575
+ )
576
+ else:
577
+ self.structured_reporter.failure(
578
+ GENERIC_PERMISSION_ERROR_KEY,
579
+ "No tables/views/streams found. Verify dataset permissions in Snowflake.",
580
+ )
559
581
 
560
582
  self.discovered_datasets = (
561
583
  discovered_tables + discovered_views + discovered_streams
@@ -568,10 +590,26 @@ class SnowflakeV2Source(
568
590
  with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
569
591
  schema_resolver = self.aggregator._schema_resolver
570
592
 
593
+ redundant_queries_run_skip_handler: Optional[
594
+ RedundantQueriesRunSkipHandler
595
+ ] = None
596
+ if self.config.enable_stateful_time_window:
597
+ redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
598
+ source=self,
599
+ config=self.config,
600
+ pipeline_name=self.ctx.pipeline_name,
601
+ run_id=self.ctx.run_id,
602
+ )
603
+
571
604
  queries_extractor = SnowflakeQueriesExtractor(
572
605
  connection=self.connection,
606
+ # TODO: this should be its own section in main recipe
573
607
  config=SnowflakeQueriesExtractorConfig(
574
- window=self.config,
608
+ window=BaseTimeWindowConfig(
609
+ start_time=self.config.start_time,
610
+ end_time=self.config.end_time,
611
+ bucket_duration=self.config.bucket_duration,
612
+ ),
575
613
  temporary_tables_pattern=self.config.temporary_tables_pattern,
576
614
  include_lineage=self.config.include_table_lineage,
577
615
  include_usage_statistics=self.config.include_usage_stats,
@@ -580,10 +618,15 @@ class SnowflakeV2Source(
580
618
  include_query_usage_statistics=self.config.include_query_usage_statistics,
581
619
  user_email_pattern=self.config.user_email_pattern,
582
620
  pushdown_deny_usernames=self.config.pushdown_deny_usernames,
621
+ pushdown_allow_usernames=self.config.pushdown_allow_usernames,
622
+ query_dedup_strategy=self.config.query_dedup_strategy,
623
+ push_down_database_pattern_access_history=self.config.push_down_database_pattern_access_history,
624
+ additional_database_names_allowlist=self.config.additional_database_names_allowlist,
583
625
  ),
584
626
  structured_report=self.report,
585
627
  filters=self.filters,
586
628
  identifiers=self.identifiers,
629
+ redundant_run_skip_handler=redundant_queries_run_skip_handler,
587
630
  schema_resolver=schema_resolver,
588
631
  discovered_tables=self.discovered_datasets,
589
632
  graph=self.ctx.graph,
@@ -721,6 +764,7 @@ class SnowflakeV2Source(
721
764
  # For privatelink, account identifier ends with .privatelink
722
765
  # See https://docs.snowflake.com/en/user-guide/organizations-connect.html#private-connectivity-urls
723
766
  privatelink=self.config.account_id.endswith(".privatelink"),
767
+ snowflake_domain=self.config.snowflake_domain,
724
768
  )
725
769
 
726
770
  except Exception as e:
@@ -732,6 +776,8 @@ class SnowflakeV2Source(
732
776
  return None
733
777
 
734
778
  def is_standard_edition(self) -> bool:
779
+ if self.config.known_snowflake_edition is not None:
780
+ return self.config.known_snowflake_edition == SnowflakeEdition.STANDARD
735
781
  try:
736
782
  self.connection.query(SnowflakeQuery.show_tags())
737
783
  return False
@@ -0,0 +1,143 @@
1
+ import dataclasses
2
+ from dataclasses import dataclass
3
+ from datetime import datetime
4
+ from typing import Any, Iterable, List, Optional
5
+
6
+ from datahub.ingestion.api.closeable import Closeable
7
+ from datahub.metadata.urns import CorpUserUrn
8
+ from datahub.sql_parsing.sql_parsing_aggregator import (
9
+ PreparsedQuery,
10
+ UrnStr,
11
+ )
12
+ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
13
+ from datahub.utilities.file_backed_collections import FileBackedDict
14
+
15
+
16
+ @dataclasses.dataclass
17
+ class StoredProcCall:
18
+ snowflake_root_query_id: str
19
+
20
+ # Query text will typically be something like:
21
+ # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
22
+ query_text: str
23
+
24
+ timestamp: datetime
25
+ user: CorpUserUrn
26
+ default_db: str
27
+ default_schema: str
28
+
29
+
30
+ @dataclass
31
+ class StoredProcExecutionLineage:
32
+ call: StoredProcCall
33
+
34
+ inputs: List[UrnStr]
35
+ outputs: List[UrnStr]
36
+
37
+
38
+ @dataclass
39
+ class StoredProcLineageReport:
40
+ num_stored_proc_calls: int = 0
41
+ num_related_queries: int = 0
42
+ num_related_queries_without_proc_call: int = 0
43
+
44
+ # Incremented at generation/build time.
45
+ num_stored_proc_lineage_entries: int = 0
46
+ num_stored_proc_calls_with_no_inputs: int = 0
47
+ num_stored_proc_calls_with_no_outputs: int = 0
48
+
49
+
50
+ class StoredProcLineageTracker(Closeable):
51
+ """
52
+ Tracks table-level lineage for Snowflake stored procedures.
53
+
54
+ Stored procedures in Snowflake trigger multiple SQL queries during execution.
55
+ Snowflake assigns each stored procedure call a unique query_id and uses this as the
56
+ root_query_id for all subsequent queries executed within that procedure. This allows
57
+ us to trace which queries belong to a specific stored procedure execution and build
58
+ table-level lineage by aggregating inputs/outputs from all related queries.
59
+ """
60
+
61
+ def __init__(self, platform: str, shared_connection: Optional[Any] = None):
62
+ self.platform = platform
63
+ self.report = StoredProcLineageReport()
64
+
65
+ # { root_query_id -> StoredProcExecutionLineage }
66
+ self._stored_proc_execution_lineage: FileBackedDict[
67
+ StoredProcExecutionLineage
68
+ ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")
69
+
70
+ def add_stored_proc_call(self, call: StoredProcCall) -> None:
71
+ """Add a stored procedure call to track."""
72
+ self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
73
+ StoredProcExecutionLineage(
74
+ call=call,
75
+ # Will be populated by subsequent queries.
76
+ inputs=[],
77
+ outputs=[],
78
+ )
79
+ )
80
+ self.report.num_stored_proc_calls += 1
81
+
82
+ def add_related_query(self, query: PreparsedQuery) -> bool:
83
+ """Add a query that might be related to a stored procedure execution.
84
+
85
+ Returns True if the query was added to a stored procedure execution, False otherwise.
86
+ """
87
+ snowflake_root_query_id = (query.extra_info or {}).get(
88
+ "snowflake_root_query_id"
89
+ )
90
+
91
+ if snowflake_root_query_id:
92
+ if snowflake_root_query_id not in self._stored_proc_execution_lineage:
93
+ self.report.num_related_queries_without_proc_call += 1
94
+ return False
95
+
96
+ stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
97
+ snowflake_root_query_id
98
+ )
99
+ stored_proc_execution.inputs.extend(query.upstreams)
100
+ if query.downstream is not None:
101
+ stored_proc_execution.outputs.append(query.downstream)
102
+ self.report.num_related_queries += 1
103
+ return True
104
+
105
+ return False
106
+
107
+ def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
108
+ # For stored procedures, we can only get table-level lineage from the audit log.
109
+ # We represent these as PreparsedQuery objects for now. Eventually we'll want to
110
+ # create dataJobInputOutput lineage instead.
111
+
112
+ for stored_proc_execution in self._stored_proc_execution_lineage.values():
113
+ if not stored_proc_execution.inputs:
114
+ self.report.num_stored_proc_calls_with_no_inputs += 1
115
+ continue
116
+
117
+ if not stored_proc_execution.outputs:
118
+ self.report.num_stored_proc_calls_with_no_outputs += 1
119
+ # Still continue to generate lineage for cases where we have inputs but no outputs
120
+
121
+ for downstream in stored_proc_execution.outputs:
122
+ stored_proc_query_id = get_query_fingerprint(
123
+ stored_proc_execution.call.query_text,
124
+ self.platform,
125
+ fast=True,
126
+ secondary_id=downstream,
127
+ )
128
+
129
+ lineage_entry = PreparsedQuery(
130
+ query_id=stored_proc_query_id,
131
+ query_text=stored_proc_execution.call.query_text,
132
+ upstreams=stored_proc_execution.inputs,
133
+ downstream=downstream,
134
+ query_count=0,
135
+ user=stored_proc_execution.call.user,
136
+ timestamp=stored_proc_execution.call.timestamp,
137
+ )
138
+
139
+ self.report.num_stored_proc_lineage_entries += 1
140
+ yield lineage_entry
141
+
142
+ def close(self) -> None:
143
+ self._stored_proc_execution_lineage.close()
@@ -16,6 +16,7 @@ from sqlalchemy.engine.reflection import Inspector
16
16
  from sqlalchemy.types import TypeEngine
17
17
  from sqlalchemy_bigquery import STRUCT
18
18
 
19
+ from datahub.configuration.common import HiddenFromDocs
19
20
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
20
21
  from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey
21
22
  from datahub.ingestion.api.decorators import (
@@ -29,26 +30,38 @@ from datahub.ingestion.api.decorators import (
29
30
  from datahub.ingestion.api.source import StructuredLogLevel
30
31
  from datahub.ingestion.api.workunit import MetadataWorkUnit
31
32
  from datahub.ingestion.source.aws.s3_util import make_s3_urn
32
- from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
33
+ from datahub.ingestion.source.common.subtypes import (
34
+ DatasetContainerSubTypes,
35
+ SourceCapabilityModifier,
36
+ )
33
37
  from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
38
+ from datahub.ingestion.source.sql.athena_properties_extractor import (
39
+ AthenaPropertiesExtractor,
40
+ )
34
41
  from datahub.ingestion.source.sql.sql_common import (
35
42
  SQLAlchemySource,
36
43
  register_custom_type,
37
44
  )
38
- from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
45
+ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
39
46
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
40
47
  from datahub.ingestion.source.sql.sql_utils import (
41
48
  add_table_to_schema_container,
42
49
  gen_database_container,
43
50
  gen_database_key,
44
51
  )
52
+ from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
45
53
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
46
- from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
54
+ from datahub.metadata.schema_classes import (
55
+ ArrayTypeClass,
56
+ MapTypeClass,
57
+ RecordTypeClass,
58
+ )
47
59
  from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
48
60
  from datahub.utilities.sqlalchemy_type_converter import (
49
61
  MapType,
50
62
  get_schema_fields_for_sqlalchemy_column,
51
63
  )
64
+ from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
52
65
 
53
66
  try:
54
67
  from typing_extensions import override
@@ -61,6 +74,11 @@ except ImportError:
61
74
 
62
75
  logger = logging.getLogger(__name__)
63
76
 
77
+ # Precompiled regex for SQL identifier validation
78
+ # Athena identifiers can only contain lowercase letters, numbers, underscore, and period (for complex types)
79
+ # Note: Athena automatically converts uppercase to lowercase, but we're being strict for security
80
+ _IDENTIFIER_PATTERN = re.compile(r"^[a-zA-Z0-9_.]+$")
81
+
64
82
  assert STRUCT, "required type modules are not available"
65
83
  register_custom_type(STRUCT, RecordTypeClass)
66
84
  register_custom_type(MapType, MapTypeClass)
@@ -234,7 +252,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
234
252
 
235
253
 
236
254
  class AthenaConfig(SQLCommonConfig):
237
- scheme: str = "awsathena+rest"
255
+ scheme: HiddenFromDocs[str] = "awsathena+rest"
238
256
  username: Optional[str] = pydantic.Field(
239
257
  default=None,
240
258
  description="Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html",
@@ -280,12 +298,22 @@ class AthenaConfig(SQLCommonConfig):
280
298
  description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
281
299
  )
282
300
 
301
+ extract_partitions_using_create_statements: bool = pydantic.Field(
302
+ default=False,
303
+ description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
304
+ )
305
+
283
306
  _s3_staging_dir_population = pydantic_renamed_field(
284
307
  old_name="s3_staging_dir",
285
308
  new_name="query_result_location",
286
309
  print_warning=True,
287
310
  )
288
311
 
312
+ emit_schema_fieldpaths_as_v1: bool = pydantic.Field(
313
+ default=False,
314
+ description="Convert simple field paths to DataHub field path v1 format. Simple column paths are those that do not contain any nested fields.",
315
+ )
316
+
289
317
  profiling: AthenaProfilingConfig = AthenaProfilingConfig()
290
318
 
291
319
  def get_sql_alchemy_url(self):
@@ -320,8 +348,24 @@ class Partitionitem:
320
348
  @capability(
321
349
  SourceCapability.DATA_PROFILING,
322
350
  "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
351
+ subtype_modifier=[SourceCapabilityModifier.TABLE],
352
+ )
353
+ @capability(
354
+ SourceCapability.LINEAGE_COARSE,
355
+ "Supported for S3 tables",
356
+ subtype_modifier=[
357
+ SourceCapabilityModifier.VIEW,
358
+ SourceCapabilityModifier.TABLE,
359
+ ],
360
+ )
361
+ @capability(
362
+ SourceCapability.LINEAGE_FINE,
363
+ "Supported for S3 tables",
364
+ subtype_modifier=[
365
+ SourceCapabilityModifier.VIEW,
366
+ SourceCapabilityModifier.TABLE,
367
+ ],
323
368
  )
324
- @capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
325
369
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
326
370
  class AthenaSource(SQLAlchemySource):
327
371
  """
@@ -472,11 +516,70 @@ class AthenaSource(SQLAlchemySource):
472
516
  return [schema for schema in schemas if schema == athena_config.database]
473
517
  return schemas
474
518
 
519
+ @classmethod
520
+ def _sanitize_identifier(cls, identifier: str) -> str:
521
+ """Sanitize SQL identifiers to prevent injection attacks.
522
+
523
+ Args:
524
+ identifier: The SQL identifier to sanitize
525
+
526
+ Returns:
527
+ Sanitized identifier safe for SQL queries
528
+
529
+ Raises:
530
+ ValueError: If identifier contains unsafe characters
531
+ """
532
+ if not identifier:
533
+ raise ValueError("Identifier cannot be empty")
534
+
535
+ # Allow only alphanumeric characters, underscores, and periods for identifiers
536
+ # This matches Athena's identifier naming rules
537
+ if not _IDENTIFIER_PATTERN.match(identifier):
538
+ raise ValueError(
539
+ f"Identifier '{identifier}' contains unsafe characters. Only alphanumeric characters, underscores, and periods are allowed."
540
+ )
541
+
542
+ return identifier
543
+
475
544
  @classmethod
476
545
  def _casted_partition_key(cls, key: str) -> str:
477
546
  # We need to cast the partition keys to a VARCHAR, since otherwise
478
547
  # Athena may throw an error during concatenation / comparison.
479
- return f"CAST({key} as VARCHAR)"
548
+ sanitized_key = cls._sanitize_identifier(key)
549
+ return f"CAST({sanitized_key} as VARCHAR)"
550
+
551
+ @classmethod
552
+ def _build_max_partition_query(
553
+ cls, schema: str, table: str, partitions: List[str]
554
+ ) -> str:
555
+ """Build SQL query to find the row with maximum partition values.
556
+
557
+ Args:
558
+ schema: Database schema name
559
+ table: Table name
560
+ partitions: List of partition column names
561
+
562
+ Returns:
563
+ SQL query string to find the maximum partition
564
+
565
+ Raises:
566
+ ValueError: If any identifier contains unsafe characters
567
+ """
568
+ # Sanitize all identifiers to prevent SQL injection
569
+ sanitized_schema = cls._sanitize_identifier(schema)
570
+ sanitized_table = cls._sanitize_identifier(table)
571
+ sanitized_partitions = [
572
+ cls._sanitize_identifier(partition) for partition in partitions
573
+ ]
574
+
575
+ casted_keys = [cls._casted_partition_key(key) for key in partitions]
576
+ if len(casted_keys) == 1:
577
+ part_concat = casted_keys[0]
578
+ else:
579
+ separator = "CAST('-' AS VARCHAR)"
580
+ part_concat = f"CONCAT({f', {separator}, '.join(casted_keys)})"
581
+
582
+ return f'select {",".join(sanitized_partitions)} from "{sanitized_schema}"."{sanitized_table}$partitions" where {part_concat} = (select max({part_concat}) from "{sanitized_schema}"."{sanitized_table}$partitions")'
480
583
 
481
584
  @override
482
585
  def get_partitions(
@@ -488,27 +591,37 @@ class AthenaSource(SQLAlchemySource):
488
591
  if not self.cursor:
489
592
  return None
490
593
 
491
- metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
492
- table_name=table, schema_name=schema
493
- )
594
+ if self.config.extract_partitions_using_create_statements:
595
+ try:
596
+ partitions = self._get_partitions_create_table(schema, table)
597
+ except Exception as e:
598
+ logger.warning(
599
+ f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
600
+ exc_info=True,
601
+ )
602
+
603
+ # If we can't get create table statement, we fall back to SQLAlchemy
604
+ partitions = self._get_partitions_sqlalchemy(schema, table)
605
+ else:
606
+ partitions = self._get_partitions_sqlalchemy(schema, table)
494
607
 
495
- partitions = []
496
- for key in metadata.partition_keys:
497
- if key.name:
498
- partitions.append(key.name)
499
608
  if not partitions:
500
609
  return []
501
610
 
611
+ if (
612
+ not self.config.profiling.enabled
613
+ or not self.config.profiling.partition_profiling_enabled
614
+ ):
615
+ return partitions
616
+
502
617
  with self.report.report_exc(
503
618
  message="Failed to extract partition details",
504
619
  context=f"{schema}.{table}",
505
620
  level=StructuredLogLevel.WARN,
506
621
  ):
507
- # We create an artifical concatenated partition key to be able to query max partition easier
508
- part_concat = " || '-' || ".join(
509
- self._casted_partition_key(key) for key in partitions
622
+ max_partition_query = self._build_max_partition_query(
623
+ schema, table, partitions
510
624
  )
511
- max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
512
625
  ret = self.cursor.execute(max_partition_query)
513
626
  max_partition: Dict[str, str] = {}
514
627
  if ret:
@@ -524,6 +637,56 @@ class AthenaSource(SQLAlchemySource):
524
637
 
525
638
  return partitions
526
639
 
640
+ def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
641
+ assert self.cursor
642
+ try:
643
+ res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
644
+ except Exception as e:
645
+ # Athena does not support SHOW CREATE TABLE for views
646
+ # and will throw an error. We need to handle this case
647
+ # and caller needs to fallback to sqlalchemy's get partitions call.
648
+ logger.debug(
649
+ f"Failed to get table properties for {schema}.{table}: {e}",
650
+ exc_info=True,
651
+ )
652
+ raise e
653
+ rows = res.fetchall()
654
+
655
+ # Concatenate all rows into a single string with newlines
656
+ create_table_statement = "\n".join(row[0] for row in rows)
657
+
658
+ try:
659
+ athena_table_info = AthenaPropertiesExtractor.get_table_properties(
660
+ create_table_statement
661
+ )
662
+ except Exception as e:
663
+ logger.debug(
664
+ f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
665
+ exc_info=True,
666
+ )
667
+ raise e
668
+
669
+ partitions = []
670
+ if (
671
+ athena_table_info.partition_info
672
+ and athena_table_info.partition_info.simple_columns
673
+ ):
674
+ partitions = [
675
+ ci.name for ci in athena_table_info.partition_info.simple_columns
676
+ ]
677
+ return partitions
678
+
679
+ def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
680
+ assert self.cursor
681
+ metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
682
+ table_name=table, schema_name=schema
683
+ )
684
+ partitions = []
685
+ for key in metadata.partition_keys:
686
+ if key.name:
687
+ partitions.append(key.name)
688
+ return partitions
689
+
527
690
  # Overwrite to modify the creation of schema fields
528
691
  def get_schema_fields_for_column(
529
692
  self,
@@ -550,6 +713,18 @@ class AthenaSource(SQLAlchemySource):
550
713
  ),
551
714
  )
552
715
 
716
+ # Keeping it as individual check to make it more explicit and easier to understand
717
+ if not self.config.emit_schema_fieldpaths_as_v1:
718
+ return fields
719
+
720
+ if isinstance(
721
+ fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
722
+ ):
723
+ return fields
724
+ else:
725
+ fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
726
+ fields[0].fieldPath
727
+ )
553
728
  return fields
554
729
 
555
730
  def generate_partition_profiler_query(
@@ -563,16 +738,34 @@ class AthenaSource(SQLAlchemySource):
563
738
  ).get(table, None)
564
739
 
565
740
  if partition and partition.max_partition:
566
- max_partition_filters = []
567
- for key, value in partition.max_partition.items():
568
- max_partition_filters.append(
569
- f"{self._casted_partition_key(key)} = '{value}'"
741
+ try:
742
+ # Sanitize identifiers to prevent SQL injection
743
+ sanitized_schema = self._sanitize_identifier(schema)
744
+ sanitized_table = self._sanitize_identifier(table)
745
+
746
+ max_partition_filters = []
747
+ for key, value in partition.max_partition.items():
748
+ # Sanitize partition key and properly escape the value
749
+ sanitized_key = self._sanitize_identifier(key)
750
+ # Escape single quotes in the value to prevent injection
751
+ escaped_value = value.replace("'", "''") if value else ""
752
+ max_partition_filters.append(
753
+ f"{self._casted_partition_key(sanitized_key)} = '{escaped_value}'"
754
+ )
755
+ max_partition = str(partition.max_partition)
756
+ return (
757
+ max_partition,
758
+ f'SELECT * FROM "{sanitized_schema}"."{sanitized_table}" WHERE {" AND ".join(max_partition_filters)}',
570
759
  )
571
- max_partition = str(partition.max_partition)
572
- return (
573
- max_partition,
574
- f'SELECT * FROM "{schema}"."{table}" WHERE {" AND ".join(max_partition_filters)}',
575
- )
760
+ except ValueError as e:
761
+ # If sanitization fails due to malicious identifiers,
762
+ # return None to disable partition profiling for this table
763
+ # rather than crashing the entire ingestion
764
+ logger.warning(
765
+ f"Failed to generate partition profiler query for {schema}.{table} due to unsafe identifiers: {e}. "
766
+ f"Partition profiling disabled for this table."
767
+ )
768
+ return None, None
576
769
  return None, None
577
770
 
578
771
  def close(self):