acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.
Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/sql/mysql.py
@@ -1,12 +1,18 @@
 # This import verifies that the dependencies are available.
+import logging
+from typing import TYPE_CHECKING, Any, List, Optional
 
 import pymysql  # noqa: F401
 from pydantic.fields import Field
-from sqlalchemy import util
+from sqlalchemy import create_engine, event, inspect, util
 from sqlalchemy.dialects.mysql import BIT, base
 from sqlalchemy.dialects.mysql.enumerated import SET
 from sqlalchemy.engine.reflection import Inspector
 
+if TYPE_CHECKING:
+    from sqlalchemy.engine import Engine
+
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -15,16 +21,27 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.source.aws.aws_common import (
+    AwsConnectionConfig,
+    RDSIAMTokenManager,
+)
 from datahub.ingestion.source.sql.sql_common import (
     make_sqlalchemy_type,
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+)
 from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
 )
 from datahub.metadata.schema_classes import BytesTypeClass
+from datahub.utilities.str_enum import StrEnum
+
+logger = logging.getLogger(__name__)
 
 SET.__repr__ = util.generic_repr  # type:ignore
 
@@ -48,16 +65,49 @@ base.ischema_names["polygon"] = POLYGON
 base.ischema_names["decimal128"] = DECIMAL128
 
 
+class MySQLAuthMode(StrEnum):
+    """Authentication mode for MySQL connection."""
+
+    PASSWORD = "PASSWORD"
+    AWS_IAM = "AWS_IAM"
+
+
 class MySQLConnectionConfig(SQLAlchemyConnectionConfig):
     # defaults
     host_port: str = Field(default="localhost:3306", description="MySQL host URL.")
-    scheme: str = "mysql+pymysql"
+    scheme: HiddenFromDocs[str] = "mysql+pymysql"
+
+    # Authentication configuration
+    auth_mode: MySQLAuthMode = Field(
+        default=MySQLAuthMode.PASSWORD,
+        description="Authentication mode to use for the MySQL connection. "
+        "Options are 'PASSWORD' (default) for standard username/password authentication, "
+        "or 'AWS_IAM' for AWS RDS IAM authentication.",
+    )
+    aws_config: AwsConnectionConfig = Field(
+        default_factory=AwsConnectionConfig,
+        description="AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
+        "Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
+        "If not explicitly configured, boto3 will automatically use the default credential chain and region from "
+        "environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata.",
+    )
 
 
 class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
     def get_identifier(self, *, schema: str, table: str) -> str:
         return f"{schema}.{table}"
 
+    include_stored_procedures: bool = Field(
+        default=True,
+        description="Include ingest of stored procedures.",
+    )
+
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion."
+        "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
 
 @platform_name("MySQL")
 @config_class(MySQLConfig)
@@ -65,7 +115,6 @@ class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class MySQLSource(TwoTierSQLAlchemySource):
     """
     This plugin extracts the following:
@@ -75,9 +124,27 @@ class MySQLSource(TwoTierSQLAlchemySource):
     Table, row, and column statistics via optional SQL profiling
     """
 
-    def __init__(self, config, ctx):
+    config: MySQLConfig
+
+    def __init__(self, config: MySQLConfig, ctx: Any):
         super().__init__(config, ctx, self.get_platform())
 
+        self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
+        if config.auth_mode == MySQLAuthMode.AWS_IAM:
+            hostname, port = parse_host_port(config.host_port, default_port=3306)
+            if port is None:
+                raise ValueError("Port must be specified for RDS IAM authentication")
+
+            if not config.username:
+                raise ValueError("username is required for RDS IAM authentication")
+
+            self._rds_iam_token_manager = RDSIAMTokenManager(
+                endpoint=hostname,
+                username=config.username,
+                port=port,
+                aws_config=config.aws_config,
+            )
+
     def get_platform(self):
         return "mysql"
 
@@ -86,6 +153,52 @@ class MySQLSource(TwoTierSQLAlchemySource):
         config = MySQLConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def _setup_rds_iam_event_listener(
+        self, engine: "Engine", database_name: Optional[str] = None
+    ) -> None:
+        """Setup SQLAlchemy event listener to inject RDS IAM tokens."""
+        if not (
+            self.config.auth_mode == MySQLAuthMode.AWS_IAM
+            and self._rds_iam_token_manager
+        ):
+            return
+
+        def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
+            if not self._rds_iam_token_manager:
+                raise RuntimeError("RDS IAM Token Manager is not initialized")
+            cparams["password"] = self._rds_iam_token_manager.get_token()
+            # PyMySQL requires SSL to be enabled for RDS IAM authentication.
+            # Preserve any existing SSL configuration, otherwise enable with default settings.
+            # The {"ssl": True} dict is a workaround to make PyMySQL recognize that SSL
+            # should be enabled, since the library requires a truthy value in the ssl parameter.
+            # See https://pymysql.readthedocs.io/en/latest/modules/connections.html#pymysql.connections.Connection
+            cparams["ssl"] = cparams.get("ssl") or {"ssl": True}
+
+        event.listen(engine, "do_connect", do_connect_listener)  # type: ignore[misc]
+
+    def get_inspectors(self):
+        url = self.config.get_sql_alchemy_url()
+        logger.debug(f"sql_alchemy_url={url}")
+
+        engine = create_engine(url, **self.config.options)
+        self._setup_rds_iam_event_listener(engine)
+
+        with engine.connect() as conn:
+            inspector = inspect(conn)
+            if self.config.database and self.config.database != "":
+                databases = [self.config.database]
+            else:
+                databases = inspector.get_schema_names()
+            for db in databases:
+                if self.config.database_pattern.allowed(db):
+                    url = self.config.get_sql_alchemy_url(current_db=db)
+                    db_engine = create_engine(url, **self.config.options)
+                    self._setup_rds_iam_event_listener(db_engine, database_name=db)
+
+                    with db_engine.connect() as conn:
+                        inspector = inspect(conn)
+                        yield inspector
+
     def add_profile_metadata(self, inspector: Inspector) -> None:
         if not self.config.is_profiling_enabled():
             return
@@ -96,3 +209,40 @@ class MySQLSource(TwoTierSQLAlchemySource):
             self.profile_metadata_info.dataset_name_to_storage_bytes[
                 f"{row.TABLE_SCHEMA}.{row.TABLE_NAME}"
             ] = row.DATA_LENGTH
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        """
+        Get stored procedures for a specific schema.
+        """
+        base_procedures = []
+        with inspector.engine.connect() as conn:
+            procedures = conn.execute(
+                """
+                SELECT ROUTINE_NAME AS name,
+                       ROUTINE_DEFINITION AS definition,
+                       EXTERNAL_LANGUAGE AS language
+                FROM information_schema.ROUTINES
+                WHERE ROUTINE_TYPE = 'PROCEDURE'
+                  AND ROUTINE_SCHEMA = %s
+                """,
+                (schema,),
+            )
+
+            procedure_rows = list(procedures)
+            for row in procedure_rows:
+                base_procedures.append(
+                    BaseProcedure(
+                        name=row.name,
+                        language=row.language,
+                        argument_signature=None,
+                        return_type=None,
+                        procedure_definition=row.definition,
+                        created=None,
+                        last_altered=None,
+                        extra_properties=None,
+                        comment=None,
+                    )
+                )
+        return base_procedures
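
The hunks above add two user-facing features to the MySQL source: RDS IAM authentication (`auth_mode`, `aws_config`) and stored procedure ingestion (`include_stored_procedures`, `procedure_pattern`). Below is a minimal sketch of a recipe that exercises them, run programmatically; the host, region, username, and allow pattern are illustrative placeholders, not values taken from this diff.

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe wiring up the new MySQL config fields shown in the diff above.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "mysql",
            "config": {
                "host_port": "example-db.abc123.us-east-1.rds.amazonaws.com:3306",
                "username": "datahub_reader",
                "auth_mode": "AWS_IAM",  # new enum: PASSWORD (default) or AWS_IAM
                "aws_config": {"aws_region": "us-east-1"},
                "include_stored_procedures": True,
                # Regexes are matched against the full database.schema.procedure_name string.
                "procedure_pattern": {"allow": ["analytics\\..*"]},
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()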
datahub/ingestion/source/sql/oracle.py
@@ -31,11 +31,13 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     make_sqlalchemy_type,
 )
-from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
+from datahub.ingestion.source.sql.sql_config import (
+    BasicSQLAlchemyConfig,
+)
 
 logger = logging.getLogger(__name__)
 
-oracledb.version = "8.3.0"
+oracledb.version = "8.3.0"  # type: ignore[assignment]
 sys.modules["cx_Oracle"] = oracledb
 
 extra_oracle_types = {
@@ -71,10 +73,12 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="Will be set automatically to default value.",
     )
     service_name: Optional[str] = Field(
-        default=None, description="Oracle service name. If using, omit `database`."
+        default=None,
+        description="Oracle service name. If using, omit `database`.",
     )
     database: Optional[str] = Field(
-        default=None, description="If using, omit `service_name`."
+        default=None,
+        description="If using, omit `service_name`.",
     )
     add_database_name_to_urn: Optional[bool] = Field(
         default=False,
@@ -106,10 +110,10 @@ class OracleConfig(BasicSQLAlchemyConfig):
         return v
 
     @pydantic.validator("data_dictionary_mode")
-    def check_data_dictionary_mode(cls, values):
-        if values not in ("ALL", "DBA"):
+    def check_data_dictionary_mode(cls, value):
+        if value not in ("ALL", "DBA"):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
-        return values
+        return value
 
     @pydantic.validator("thick_mode_lib_dir", always=True)
     def check_thick_mode_lib_dir(cls, v, values):
@@ -123,11 +127,15 @@ class OracleConfig(BasicSQLAlchemyConfig):
         )
         return v
 
-    def get_sql_alchemy_url(self):
-        url = super().get_sql_alchemy_url()
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
+
         if self.service_name:
             assert not self.database
             url = f"{url}/?service_name={self.service_name}"
+
         return url
 
     def get_identifier(self, schema: str, table: str) -> str:
@@ -433,7 +441,7 @@ class OracleInspectorObjectWrapper:
             "\nac.constraint_name,"
             "\nac.constraint_type,"
             "\nacc.column_name AS local_column,"
-            "\nac.r_table_name AS remote_table,"
+            "\nac.table_name AS remote_table,"
            "\nrcc.column_name AS remote_column,"
            "\nac.r_owner AS remote_owner,"
            "\nacc.position AS loc_pos,"
@@ -631,7 +639,6 @@ class OracleSource(SQLAlchemySource):
     - Table, row, and column statistics via optional SQL profiling
 
     Using the Oracle source requires that you've also installed the correct drivers; see the [cx_Oracle docs](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html). The easiest one is the [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client.html).
-
     """
 
     config: OracleConfig
@@ -661,6 +668,8 @@ class OracleSource(SQLAlchemySource):
        database name from Connection URL, which does not work when using
        service instead of database.
        In that case, it tries to retrieve the database name by sending a query to the DB.
+
+        Note: This is used as a fallback if database is not specified in the config.
        """
 
        # call default implementation first
@@ -687,7 +696,49 @@
            # To silent the mypy lint error
            yield cast(Inspector, inspector)
 
+    def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
+        """
+        Override the get_db_schema method to ensure proper schema name extraction.
+        This method is used during view lineage extraction to determine the default schema
+        for unqualified table names in view definitions.
+        """
+        try:
+            # Try to get the schema from the dataset identifier
+            parts = dataset_identifier.split(".")
+
+            # Handle the identifier format differently based on add_database_name_to_urn flag
+            if self.config.add_database_name_to_urn:
+                if len(parts) >= 3:
+                    # Format is: database.schema.view when add_database_name_to_urn=True
+                    db_name = parts[-3]
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+                elif len(parts) >= 2:
+                    # Handle the case where database might be missing even with flag enabled
+                    # If we have a database in the config, use that
+                    db_name = str(self.config.database)
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+            else:
+                # Format is: schema.view when add_database_name_to_urn=False
+                if len(parts) >= 2:
+                    # When add_database_name_to_urn is False, don't include database in the result
+                    db_name = None
+                    schema_name = parts[-2]
+                    return db_name, schema_name
+        except Exception as e:
+            logger.warning(
+                f"Error extracting schema from identifier {dataset_identifier}: {e}"
+            )
+
+        # Fall back to parent implementation if our approach fails
+        db_name, schema_name = super().get_db_schema(dataset_identifier)
+        return db_name, schema_name
+
     def get_workunits(self):
+        """
+        Override get_workunits to patch Oracle dialect for custom types.
+        """
         with patch.dict(
             "sqlalchemy.dialects.oracle.base.OracleDialect.ischema_names",
             {klass.__name__: klass for klass in extra_oracle_types},
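
The new `OracleSource.get_db_schema` override above derives the default database and schema from a dataset identifier, whose shape depends on `add_database_name_to_urn`. Here is a standalone sketch of the same parsing rules, using made-up identifiers; this helper is illustrative and not part of the package.

from typing import Optional, Tuple


def split_oracle_identifier(
    dataset_identifier: str, add_database_name_to_urn: bool
) -> Tuple[Optional[str], str]:
    # Mirrors the happy path of the get_db_schema override shown above.
    parts = dataset_identifier.split(".")
    if add_database_name_to_urn:
        # Identifier looks like database.schema.view
        return parts[-3], parts[-2]
    # Identifier looks like schema.view; no database component is returned
    return None, parts[-2]


assert split_oracle_identifier("ORCLPDB.SALES.MONTHLY_REVENUE_V", True) == ("ORCLPDB", "SALES")
assert split_oracle_identifier("SALES.MONTHLY_REVENUE_V", False) == (None, "SALES")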
datahub/ingestion/source/sql/postgres.py
@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
 
 # This import verifies that the dependencies are available.
 import psycopg2  # noqa: F401
@@ -14,9 +14,12 @@ import sqlalchemy.dialects.postgresql as custom_types
 from geoalchemy2 import Geometry  # noqa: F401
 from pydantic import BaseModel
 from pydantic.fields import Field
-from sqlalchemy import create_engine, inspect
+from sqlalchemy import create_engine, event, inspect
 from sqlalchemy.engine.reflection import Inspector
 
+if TYPE_CHECKING:
+    from sqlalchemy.engine import Engine
+
 from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter import mce_builder
 from datahub.emitter.mcp_builder import mcps_from_mce
@@ -30,17 +33,26 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.aws.aws_common import (
+    AwsConnectionConfig,
+    RDSIAMTokenManager,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayTypeClass,
     BytesTypeClass,
     MapTypeClass,
 )
+from datahub.utilities.str_enum import StrEnum
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -97,12 +109,34 @@ class ViewLineageEntry(BaseModel):
     dependent_schema: str
 
 
+class PostgresAuthMode(StrEnum):
+    """Authentication mode for PostgreSQL connection."""
+
+    PASSWORD = "PASSWORD"
+    AWS_IAM = "AWS_IAM"
+
+
 class BasePostgresConfig(BasicSQLAlchemyConfig):
     scheme: str = Field(default="postgresql+psycopg2", description="database scheme")
     schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["information_schema"])
     )
 
+    # Authentication configuration
+    auth_mode: PostgresAuthMode = Field(
+        default=PostgresAuthMode.PASSWORD,
+        description="Authentication mode to use for the PostgreSQL connection. "
+        "Options are 'PASSWORD' (default) for standard username/password authentication, "
+        "or 'AWS_IAM' for AWS RDS IAM authentication.",
+    )
+    aws_config: AwsConnectionConfig = Field(
+        default_factory=AwsConnectionConfig,
+        description="AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
+        "Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
+        "If not explicitly configured, boto3 will automatically use the default credential chain and region from "
+        "environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata.",
+    )
+
 
 class PostgresConfig(BasePostgresConfig):
     database_pattern: AllowDenyPattern = Field(
@@ -124,6 +158,17 @@ class PostgresConfig(BasePostgresConfig):
         ),
     )
 
+    include_stored_procedures: bool = Field(
+        default=True,
+        description="Include ingest of stored procedures.",
+    )
+
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion."
+        "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
 
 @platform_name("Postgres")
 @config_class(PostgresConfig)
@@ -131,12 +176,11 @@ class PostgresConfig(BasePostgresConfig):
 @capability(SourceCapability.DOMAINS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
 class PostgresSource(SQLAlchemySource):
     """
     This plugin extracts the following:
 
-    - Metadata for databases, schemas, views, and tables
+    - Metadata for databases, schemas, views, tables, and stored procedures
     - Column types associated with each table
     - Also supports PostGIS extensions
     - Table, row, and column statistics via optional SQL profiling
@@ -147,6 +191,22 @@ class PostgresSource(SQLAlchemySource):
     def __init__(self, config: PostgresConfig, ctx: PipelineContext):
         super().__init__(config, ctx, self.get_platform())
 
+        self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
+        if config.auth_mode == PostgresAuthMode.AWS_IAM:
+            hostname, port = parse_host_port(config.host_port, default_port=5432)
+            if port is None:
+                raise ValueError("Port must be specified for RDS IAM authentication")
+
+            if not config.username:
+                raise ValueError("username is required for RDS IAM authentication")
+
+            self._rds_iam_token_manager = RDSIAMTokenManager(
+                endpoint=hostname,
+                username=config.username,
+                port=port,
+                aws_config=config.aws_config,
+            )
+
     def get_platform(self):
         return "postgres"
 
@@ -155,13 +215,36 @@ class PostgresSource(SQLAlchemySource):
         config = PostgresConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def _setup_rds_iam_event_listener(
+        self, engine: "Engine", database_name: Optional[str] = None
+    ) -> None:
+        """Setup SQLAlchemy event listener to inject RDS IAM tokens."""
+        if not (
+            self.config.auth_mode == PostgresAuthMode.AWS_IAM
+            and self._rds_iam_token_manager
+        ):
+            return
+
+        def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
+            if not self._rds_iam_token_manager:
+                raise RuntimeError("RDS IAM Token Manager is not initialized")
+            cparams["password"] = self._rds_iam_token_manager.get_token()
+            if cparams.get("sslmode") not in ("require", "verify-ca", "verify-full"):
+                cparams["sslmode"] = "require"
+
+        event.listen(engine, "do_connect", do_connect_listener)  # type: ignore[misc]
+
     def get_inspectors(self) -> Iterable[Inspector]:
         # Note: get_sql_alchemy_url will choose `sqlalchemy_uri` over the passed in database
         url = self.config.get_sql_alchemy_url(
             database=self.config.database or self.config.initial_database
         )
+
         logger.debug(f"sql_alchemy_url={url}")
+
         engine = create_engine(url, **self.config.options)
+        self._setup_rds_iam_event_listener(engine)
+
         with engine.connect() as conn:
             if self.config.database or self.config.sqlalchemy_uri:
                 inspector = inspect(conn)
@@ -169,14 +252,21 @@ class PostgresSource(SQLAlchemySource):
             else:
                 # pg_database catalog - https://www.postgresql.org/docs/current/catalog-pg-database.html
                 # exclude template databases - https://www.postgresql.org/docs/current/manage-ag-templatedbs.html
+                # exclude rdsadmin - AWS RDS administrative database
                 databases = conn.execute(
-                    "SELECT datname from pg_database where datname not in ('template0', 'template1')"
+                    "SELECT datname from pg_database where datname not in ('template0', 'template1', 'rdsadmin')"
                 )
                 for db in databases:
                     if not self.config.database_pattern.allowed(db["datname"]):
                         continue
+
                     url = self.config.get_sql_alchemy_url(database=db["datname"])
-                    with create_engine(url, **self.config.options).connect() as conn:
+                    db_engine = create_engine(url, **self.config.options)
+                    self._setup_rds_iam_event_listener(
+                        db_engine, database_name=db["datname"]
+                    )
+
+                    with db_engine.connect() as conn:
                         inspector = inspect(conn)
                         yield inspector
 
@@ -292,3 +382,49 @@ class PostgresSource(SQLAlchemySource):
                 ] = row.table_size
         except Exception as e:
             logger.error(f"failed to fetch profile metadata: {e}")
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        """
+        Get stored procedures for a specific schema.
+        """
+        base_procedures = []
+        with inspector.engine.connect() as conn:
+            procedures = conn.execute(
+                """
+                SELECT
+                    p.proname AS name,
+                    l.lanname AS language,
+                    pg_get_function_arguments(p.oid) AS arguments,
+                    pg_get_functiondef(p.oid) AS definition,
+                    obj_description(p.oid, 'pg_proc') AS comment
+                FROM
+                    pg_proc p
+                JOIN
+                    pg_namespace n ON n.oid = p.pronamespace
+                JOIN
+                    pg_language l ON l.oid = p.prolang
+                WHERE
+                    p.prokind = 'p'
+                    AND n.nspname = %s;
+                """,
+                (schema,),
+            )
+
+            procedure_rows = list(procedures)
+            for row in procedure_rows:
+                base_procedures.append(
+                    BaseProcedure(
+                        name=row.name,
+                        language=row.language,
+                        argument_signature=row.arguments,
+                        return_type=None,
+                        procedure_definition=row.definition,
+                        created=None,
+                        last_altered=None,
+                        comment=row.comment,
+                        extra_properties=None,
+                    )
+                )
+        return base_procedures
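
Both the MySQL and Postgres changes above filter stored procedures through DataHub's existing `AllowDenyPattern`, matching the configured regexes against the full `database.schema.procedure_name` string. Below is a small sketch of that filtering, reusing the example pattern from the `procedure_pattern` field description; the procedure names are invented.

from datahub.configuration.common import AllowDenyPattern

# Keep procedures starting with "customer" in the Customer database's public schema,
# per the example given in the procedure_pattern description.
pattern = AllowDenyPattern(allow=["Customer.public.customer.*"])

assert pattern.allowed("Customer.public.customer_cleanup")
assert not pattern.allowed("Customer.public.orders_rollup")
assert not pattern.allowed("Billing.public.customer_cleanup")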
datahub/ingestion/source/sql/presto.py
@@ -1,11 +1,14 @@
+import functools
 from textwrap import dedent
-from typing import Optional
+from typing import Dict, Optional
 
 from pydantic.fields import Field
 from pyhive.sqlalchemy_presto import PrestoDialect
 from sqlalchemy import exc, sql
 from sqlalchemy.engine import reflection
+from sqlalchemy.engine.base import Engine
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -85,7 +88,7 @@ PrestoDialect._get_full_table = _get_full_table
 
 class PrestoConfig(TrinoConfig):
     # defaults
-    scheme: str = Field(default="presto", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="presto")
 
 
 @platform_name("Presto", doc_order=1)
@@ -114,3 +117,18 @@ class PrestoSource(TrinoSource):
     def create(cls, config_dict, ctx):
         config = PrestoConfig.parse_obj(config_dict)
         return cls(config, ctx)
+
+
+# Unfortunately, the Presto dialect provide catalog_name as a column
+# therefore we we need some workaround to not fail.
+# This is a workaround to not fail which casuses to only get the table comment as property which is still better than to fail.
+@functools.lru_cache
+def gen_catalog_connector_dict(engine: Engine) -> Dict[str, str]:
+    query = dedent(
+        """
+        SELECT *
+        FROM "system"."metadata"."catalogs"
+        """
+    ).strip()
+    res = engine.execute(sql.text(query))
+    return {row.catalog_name: "" for row in res}
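
The module-level `gen_catalog_connector_dict` helper added above is wrapped in `functools.lru_cache`, so the catalogs query runs once per distinct (hashable) `Engine` object and later calls are served from the cache. Below is a generic sketch of that caching behavior, with a simple stand-in for the expensive lookup.

import functools


@functools.lru_cache
def catalog_lookup(key: str) -> dict:
    # Stand-in for a per-engine catalog query; the body runs once per distinct key.
    print(f"querying catalogs for {key}")
    return {"catalog": key}


catalog_lookup("engine-a")  # executes the body
catalog_lookup("engine-a")  # returned from the cache; no second query
print(catalog_lookup.cache_info())  # CacheInfo(hits=1, misses=1, ...)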