acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,6 @@ import contextlib
3
3
  import dataclasses
4
4
  import functools
5
5
  import logging
6
- import os
7
6
  import threading
8
7
  import uuid
9
8
  from enum import auto
@@ -16,15 +15,18 @@ from datahub.configuration.common import (
16
15
  ConfigurationError,
17
16
  OperationalError,
18
17
  )
18
+ from datahub.configuration.env_vars import (
19
+ get_rest_sink_default_max_threads,
20
+ get_rest_sink_default_mode,
21
+ )
19
22
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
20
23
  from datahub.emitter.mcp_builder import mcps_from_mce
21
24
  from datahub.emitter.rest_emitter import (
22
25
  BATCH_INGEST_MAX_PAYLOAD_LENGTH,
23
- DEFAULT_REST_SINK_ENDPOINT,
24
- DEFAULT_REST_TRACE_MODE,
26
+ DEFAULT_REST_EMITTER_ENDPOINT,
25
27
  DataHubRestEmitter,
28
+ EmitMode,
26
29
  RestSinkEndpoint,
27
- RestTraceMode,
28
30
  )
29
31
  from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
30
32
  from datahub.ingestion.api.sink import (
@@ -34,7 +36,7 @@ from datahub.ingestion.api.sink import (
34
36
  WriteCallback,
35
37
  )
36
38
  from datahub.ingestion.api.workunit import MetadataWorkUnit
37
- from datahub.ingestion.graph.client import DatahubClientConfig
39
+ from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig
38
40
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
39
41
  MetadataChangeEvent,
40
42
  MetadataChangeProposal,
@@ -48,9 +50,7 @@ from datahub.utilities.server_config_util import set_gms_config
48
50
 
49
51
  logger = logging.getLogger(__name__)
50
52
 
51
- _DEFAULT_REST_SINK_MAX_THREADS = int(
52
- os.getenv("DATAHUB_REST_SINK_DEFAULT_MAX_THREADS", 15)
53
- )
53
+ _DEFAULT_REST_SINK_MAX_THREADS = get_rest_sink_default_max_threads()
54
54
 
55
55
 
56
56
  class RestSinkMode(ConfigEnum):
@@ -64,14 +64,14 @@ class RestSinkMode(ConfigEnum):
64
64
 
65
65
 
66
66
  _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
67
- RestSinkMode, os.getenv("DATAHUB_REST_SINK_DEFAULT_MODE", RestSinkMode.ASYNC_BATCH)
67
+ RestSinkMode, get_rest_sink_default_mode() or RestSinkMode.ASYNC_BATCH
68
68
  )
69
69
 
70
70
 
71
71
  class DatahubRestSinkConfig(DatahubClientConfig):
72
72
  mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
73
- endpoint: RestSinkEndpoint = DEFAULT_REST_SINK_ENDPOINT
74
- default_trace_mode: RestTraceMode = DEFAULT_REST_TRACE_MODE
73
+ endpoint: RestSinkEndpoint = DEFAULT_REST_EMITTER_ENDPOINT
74
+ server_config_refresh_interval: Optional[int] = None
75
75
 
76
76
  # These only apply in async modes.
77
77
  max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -92,6 +92,7 @@ class DatahubRestSinkConfig(DatahubClientConfig):
92
92
  @dataclasses.dataclass
93
93
  class DataHubRestSinkReport(SinkReport):
94
94
  mode: Optional[RestSinkMode] = None
95
+ endpoint: Optional[RestSinkEndpoint] = None
95
96
  max_threads: Optional[int] = None
96
97
  gms_version: Optional[str] = None
97
98
  pending_requests: int = 0
@@ -134,18 +135,15 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
134
135
  self._emitter_thread_local = threading.local()
135
136
 
136
137
  try:
137
- gms_config = self.emitter.get_server_config()
138
+ gms_config = self.emitter.server_config
138
139
  except Exception as exc:
139
140
  raise ConfigurationError(
140
141
  f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
141
142
  ) from exc
142
143
 
143
- self.report.gms_version = (
144
- gms_config.get("versions", {})
145
- .get("acryldata/datahub", {})
146
- .get("version", None)
147
- )
144
+ self.report.gms_version = gms_config.service_version
148
145
  self.report.mode = self.config.mode
146
+ self.report.endpoint = self.config.endpoint
149
147
  self.report.max_threads = self.config.max_threads
150
148
  logger.debug("Setting env variables to override config")
151
149
  logger.debug("Setting gms config")
@@ -179,7 +177,8 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
179
177
  client_certificate_path=config.client_certificate_path,
180
178
  disable_ssl_verification=config.disable_ssl_verification,
181
179
  openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
182
- default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
180
+ client_mode=config.client_mode,
181
+ datahub_component=config.datahub_component,
183
182
  )
184
183
 
185
184
  @property
@@ -190,6 +189,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
190
189
  # https://github.com/psf/requests/issues/1871#issuecomment-32751346
191
190
  thread_local = self._emitter_thread_local
192
191
  if not hasattr(thread_local, "emitter"):
192
+ self.config.client_mode = ClientMode.INGESTION
193
193
  thread_local.emitter = DatahubRestSink._make_emitter(self.config)
194
194
  return thread_local.emitter
195
195
 
@@ -253,9 +253,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
253
253
  MetadataChangeProposal,
254
254
  MetadataChangeProposalWrapper,
255
255
  ],
256
+ emit_mode: EmitMode,
256
257
  ) -> None:
257
258
  # TODO: Add timing metrics
258
- self.emitter.emit(record)
259
+ self.emitter.emit(record, emit_mode=emit_mode)
259
260
 
260
261
  def _emit_batch_wrapper(
261
262
  self,
@@ -270,8 +271,10 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
270
271
  ],
271
272
  ) -> None:
272
273
  events: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]] = []
274
+
273
275
  for record in records:
274
276
  event = record[0]
277
+
275
278
  if isinstance(event, MetadataChangeEvent):
276
279
  # Unpack MCEs into MCPs.
277
280
  mcps = mcps_from_mce(event)
@@ -279,7 +282,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
279
282
  else:
280
283
  events.append(event)
281
284
 
282
- chunks = self.emitter.emit_mcps(events)
285
+ chunks = self.emitter.emit_mcps(events, emit_mode=EmitMode.ASYNC)
283
286
  self.report.async_batches_prepared += 1
284
287
  if chunks > 1:
285
288
  self.report.async_batches_split += chunks
@@ -310,6 +313,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
310
313
  partition_key,
311
314
  self._emit_wrapper,
312
315
  record,
316
+ EmitMode.ASYNC,
313
317
  done_callback=functools.partial(
314
318
  self._write_done_callback, record_envelope, write_callback
315
319
  ),
@@ -321,6 +325,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
321
325
  self.executor.submit(
322
326
  partition_key,
323
327
  record,
328
+ EmitMode.ASYNC,
324
329
  done_callback=functools.partial(
325
330
  self._write_done_callback, record_envelope, write_callback
326
331
  ),
@@ -329,7 +334,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
329
334
  else:
330
335
  # execute synchronously
331
336
  try:
332
- self._emit_wrapper(record)
337
+ self._emit_wrapper(record, emit_mode=EmitMode.SYNC_PRIMARY)
333
338
  write_callback.on_success(record_envelope, success_metadata={})
334
339
  except Exception as e:
335
340
  write_callback.on_failure(record_envelope, e, failure_metadata={})
@@ -341,11 +346,14 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
341
346
  ],
342
347
  ) -> None:
343
348
  return self.write_record_async(
344
- RecordEnvelope(item, metadata={}),
345
- NoopWriteCallback(),
349
+ RecordEnvelope(item, metadata={}), NoopWriteCallback()
346
350
  )
347
351
 
348
352
  def close(self):
353
+ # Execute pre-shutdown callbacks first (handled by parent class)
354
+ super().close()
355
+
356
+ # Then perform sink-specific shutdown
349
357
  with self.report.main_thread_blocking_timer:
350
358
  self.executor.shutdown()
351
359
 
@@ -79,6 +79,7 @@ class FileSink(Sink[FileSinkConfig, SinkReport]):
79
79
  write_callback.on_success(record_envelope, {})
80
80
 
81
81
  def close(self):
82
+ super().close()
82
83
  self.file.write("\n]")
83
84
  self.file.close()
84
85
 
@@ -151,7 +151,7 @@ class DataLakeSourceConfig(
151
151
  raise ValueError("platform must not be empty")
152
152
  return platform
153
153
 
154
- @pydantic.root_validator()
154
+ @pydantic.root_validator(skip_on_failure=True)
155
155
  def ensure_profiling_pattern_is_passed_to_profiling(
156
156
  cls, values: Dict[str, Any]
157
157
  ) -> Dict[str, Any]:
@@ -72,7 +72,7 @@ class DataLakeProfilerConfig(ConfigModel):
72
72
  description="Whether to profile for the sample values for all columns.",
73
73
  )
74
74
 
75
- @pydantic.root_validator()
75
+ @pydantic.root_validator(skip_on_failure=True)
76
76
  def ensure_field_level_settings_are_normalized(
77
77
  cls: "DataLakeProfilerConfig", values: Dict[str, Any]
78
78
  ) -> Dict[str, Any]:
@@ -44,7 +44,11 @@ from datahub.ingestion.source.azure.abs_utils import (
44
44
  get_key_prefix,
45
45
  strip_abs_prefix,
46
46
  )
47
- from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
47
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
48
+ from datahub.ingestion.source.data_lake_common.data_lake_utils import (
49
+ ContainerWUCreator,
50
+ add_partition_columns_to_schema,
51
+ )
48
52
  from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet
49
53
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
50
54
  StaleEntityRemovalHandler,
@@ -53,10 +57,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
53
57
  StatefulIngestionSourceBase,
54
58
  )
55
59
  from datahub.metadata.com.linkedin.pegasus2avro.schema import (
56
- SchemaField,
57
- SchemaFieldDataType,
58
60
  SchemaMetadata,
59
- StringTypeClass,
60
61
  )
61
62
  from datahub.metadata.schema_classes import (
62
63
  DataPlatformInstanceClass,
@@ -128,6 +129,14 @@ class TableData:
128
129
  @support_status(SupportStatus.INCUBATING)
129
130
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
130
131
  @capability(SourceCapability.TAGS, "Can extract ABS object/container tags if enabled")
132
+ @capability(
133
+ SourceCapability.CONTAINERS,
134
+ "Extract ABS containers and folders",
135
+ subtype_modifier=[
136
+ SourceCapabilityModifier.FOLDER,
137
+ SourceCapabilityModifier.ABS_CONTAINER,
138
+ ],
139
+ )
131
140
  class ABSSource(StatefulIngestionSourceBase):
132
141
  source_config: DataLakeSourceConfig
133
142
  report: DataLakeSourceReport
@@ -223,36 +232,12 @@ class ABSSource(StatefulIngestionSourceBase):
223
232
  fields = sorted(fields, key=lambda f: f.fieldPath)
224
233
 
225
234
  if self.source_config.add_partition_columns_to_schema:
226
- self.add_partition_columns_to_schema(
235
+ add_partition_columns_to_schema(
227
236
  fields=fields, path_spec=path_spec, full_path=table_data.full_path
228
237
  )
229
238
 
230
239
  return fields
231
240
 
232
- def add_partition_columns_to_schema(
233
- self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
234
- ) -> None:
235
- vars = path_spec.get_named_vars(full_path)
236
- if vars is not None and "partition" in vars:
237
- for partition in vars["partition"].values():
238
- partition_arr = partition.split("=")
239
- if len(partition_arr) != 2:
240
- logger.debug(
241
- f"Could not derive partition key from partition field {partition}"
242
- )
243
- continue
244
- partition_key = partition_arr[0]
245
- fields.append(
246
- SchemaField(
247
- fieldPath=f"{partition_key}",
248
- nativeDataType="string",
249
- type=SchemaFieldDataType(StringTypeClass()),
250
- isPartitioningKey=True,
251
- nullable=True,
252
- recursive=False,
253
- )
254
- )
255
-
256
241
  def _create_table_operation_aspect(self, table_data: TableData) -> OperationClass:
257
242
  reported_time = int(time.time() * 1000)
258
243
 
@@ -533,7 +518,7 @@ class ABSSource(StatefulIngestionSourceBase):
533
518
  )
534
519
  path_spec.sample_files = False
535
520
  for obj in container_client.list_blobs(
536
- prefix=f"{prefix}", results_per_page=PAGE_SIZE
521
+ name_starts_with=f"{prefix}", results_per_page=PAGE_SIZE
537
522
  ):
538
523
  abs_path = self.create_abs_path(obj.name)
539
524
  logger.debug(f"Path: {abs_path}")
@@ -18,6 +18,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, Sour
18
18
  from datahub.ingestion.api.source_helpers import auto_workunit_reporter
19
19
  from datahub.ingestion.api.workunit import MetadataWorkUnit
20
20
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
21
+ from datahub.ingestion.graph.config import ClientMode
21
22
  from datahub.metadata.schema_classes import (
22
23
  DomainsClass,
23
24
  GlossaryTermAssociationClass,
@@ -48,7 +49,7 @@ def apply_association_to_container(
48
49
  """
49
50
  urns: List[str] = [container_urn]
50
51
  if not graph:
51
- graph = get_default_graph()
52
+ graph = get_default_graph(ClientMode.INGESTION)
52
53
  logger.info(f"Using {graph}")
53
54
  urns.extend(
54
55
  graph.get_urns_by_filter(
@@ -95,7 +96,7 @@ def apply_association_to_container(
95
96
  class DomainApplyConfig(ConfigModel):
96
97
  assets: List[str] = Field(
97
98
  default_factory=list,
98
- description="List of assets to apply domain hierarchichaly. Currently only containers and datasets are supported",
99
+ description="List of assets to apply domain hierarchically. Currently only containers and datasets are supported",
99
100
  )
100
101
  domain_urn: str = Field(default="")
101
102
 
@@ -103,7 +104,7 @@ class DomainApplyConfig(ConfigModel):
103
104
  class TagApplyConfig(ConfigModel):
104
105
  assets: List[str] = Field(
105
106
  default_factory=list,
106
- description="List of assets to apply tag hierarchichaly. Currently only containers and datasets are supported",
107
+ description="List of assets to apply tag hierarchically. Currently only containers and datasets are supported",
107
108
  )
108
109
  tag_urn: str = Field(default="")
109
110
 
@@ -111,7 +112,7 @@ class TagApplyConfig(ConfigModel):
111
112
  class TermApplyConfig(ConfigModel):
112
113
  assets: List[str] = Field(
113
114
  default_factory=list,
114
- description="List of assets to apply term hierarchichaly. Currently only containers and datasets are supported",
115
+ description="List of assets to apply term hierarchically. Currently only containers and datasets are supported",
115
116
  )
116
117
  term_urn: str = Field(default="")
117
118
 
@@ -119,7 +120,7 @@ class TermApplyConfig(ConfigModel):
119
120
  class OwnerApplyConfig(ConfigModel):
120
121
  assets: List[str] = Field(
121
122
  default_factory=list,
122
- description="List of assets to apply owner hierarchichaly. Currently only containers and datasets are supported",
123
+ description="List of assets to apply owner hierarchically. Currently only containers and datasets are supported",
123
124
  )
124
125
  owner_urn: str = Field(default="")
125
126
 
@@ -1,14 +1,15 @@
1
1
  import logging
2
- import os
3
2
  from datetime import datetime, timedelta, timezone
4
3
  from enum import Enum
5
4
  from http import HTTPStatus
6
5
  from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
6
+ from urllib.parse import parse_qs, urlparse
7
7
 
8
8
  import boto3
9
9
  import requests
10
10
  from boto3.session import Session
11
11
  from botocore.config import DEFAULT_TIMEOUT, Config
12
+ from botocore.exceptions import ClientError, NoCredentialsError
12
13
  from botocore.utils import fix_s3_host
13
14
  from pydantic.fields import Field
14
15
 
@@ -17,6 +18,16 @@ from datahub.configuration.common import (
17
18
  ConfigModel,
18
19
  PermissiveConfigModel,
19
20
  )
21
+ from datahub.configuration.env_vars import (
22
+ get_aws_app_runner_service_id,
23
+ get_aws_execution_env,
24
+ get_aws_lambda_function_name,
25
+ get_aws_role_arn,
26
+ get_aws_web_identity_token_file,
27
+ get_ecs_container_metadata_uri,
28
+ get_ecs_container_metadata_uri_v4,
29
+ get_elastic_beanstalk_environment_name,
30
+ )
20
31
  from datahub.configuration.source_common import EnvConfigMixin
21
32
 
22
33
  logger = logging.getLogger(__name__)
@@ -24,6 +35,7 @@ logger = logging.getLogger(__name__)
24
35
  if TYPE_CHECKING:
25
36
  from mypy_boto3_dynamodb import DynamoDBClient
26
37
  from mypy_boto3_glue import GlueClient
38
+ from mypy_boto3_lakeformation import LakeFormationClient
27
39
  from mypy_boto3_s3 import S3Client, S3ServiceResource
28
40
  from mypy_boto3_sagemaker import SageMakerClient
29
41
  from mypy_boto3_sts import STSClient
@@ -99,27 +111,25 @@ def detect_aws_environment() -> AwsEnvironment:
99
111
  Order matters as some environments may have multiple indicators.
100
112
  """
101
113
  # Check Lambda first as it's most specific
102
- if os.getenv("AWS_LAMBDA_FUNCTION_NAME"):
103
- if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"):
114
+ if get_aws_lambda_function_name():
115
+ if (get_aws_execution_env() or "").startswith("CloudFormation"):
104
116
  return AwsEnvironment.CLOUD_FORMATION
105
117
  return AwsEnvironment.LAMBDA
106
118
 
107
119
  # Check EKS (IRSA)
108
- if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"):
120
+ if get_aws_web_identity_token_file() and get_aws_role_arn():
109
121
  return AwsEnvironment.EKS
110
122
 
111
123
  # Check App Runner
112
- if os.getenv("AWS_APP_RUNNER_SERVICE_ID"):
124
+ if get_aws_app_runner_service_id():
113
125
  return AwsEnvironment.APP_RUNNER
114
126
 
115
127
  # Check ECS
116
- if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
117
- "ECS_CONTAINER_METADATA_URI"
118
- ):
128
+ if get_ecs_container_metadata_uri_v4() or get_ecs_container_metadata_uri():
119
129
  return AwsEnvironment.ECS
120
130
 
121
131
  # Check Elastic Beanstalk
122
- if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"):
132
+ if get_elastic_beanstalk_environment_name():
123
133
  return AwsEnvironment.BEANSTALK
124
134
 
125
135
  if is_running_on_ec2():
@@ -154,7 +164,7 @@ def get_instance_role_arn() -> Optional[str]:
154
164
  def get_lambda_role_arn() -> Optional[str]:
155
165
  """Get the Lambda function's role ARN"""
156
166
  try:
157
- function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME")
167
+ function_name = get_aws_lambda_function_name()
158
168
  if not function_name:
159
169
  return None
160
170
 
@@ -180,7 +190,7 @@ def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
180
190
  return role_arn, AwsServicePrincipal.LAMBDA.value
181
191
 
182
192
  elif env == AwsEnvironment.EKS:
183
- role_arn = os.getenv("AWS_ROLE_ARN")
193
+ role_arn = get_aws_role_arn()
184
194
  return role_arn, AwsServicePrincipal.EKS.value
185
195
 
186
196
  elif env == AwsEnvironment.APP_RUNNER:
@@ -193,8 +203,8 @@ def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
193
203
 
194
204
  elif env == AwsEnvironment.ECS:
195
205
  try:
196
- metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
197
- "ECS_CONTAINER_METADATA_URI"
206
+ metadata_uri = (
207
+ get_ecs_container_metadata_uri_v4() or get_ecs_container_metadata_uri()
198
208
  )
199
209
  if metadata_uri:
200
210
  response = requests.get(f"{metadata_uri}/task", timeout=1)
@@ -454,6 +464,168 @@ class AwsConnectionConfig(ConfigModel):
454
464
  def get_sagemaker_client(self) -> "SageMakerClient":
455
465
  return self.get_session().client("sagemaker", config=self._aws_config())
456
466
 
467
+ def get_lakeformation_client(self) -> "LakeFormationClient":
468
+ return self.get_session().client("lakeformation", config=self._aws_config())
469
+
470
+ def get_rds_client(self):
471
+ """Get an RDS client for generating IAM auth tokens."""
472
+ return self.get_session().client("rds", config=self._aws_config())
473
+
474
+
475
+ def generate_rds_iam_token(
476
+ endpoint: str,
477
+ username: str,
478
+ port: int,
479
+ aws_config: AwsConnectionConfig,
480
+ ) -> str:
481
+ """
482
+ Generate an AWS RDS IAM authentication token.
483
+
484
+ boto3's generate_db_auth_token() returns a presigned URL in the format:
485
+ "hostname:port/?Action=connect&DBUser=username&X-Amz-Date=...&X-Amz-Expires=..."
486
+
487
+ This token should be used as-is by pymysql/psycopg2 drivers.
488
+
489
+ Args:
490
+ endpoint: RDS endpoint hostname
491
+ username: Database username for IAM authentication
492
+ port: Database port (5432 for PostgreSQL, 3306 for MySQL)
493
+ aws_config: AwsConnectionConfig for session management and credentials
494
+
495
+ Returns:
496
+ Authentication token (presigned URL format)
497
+
498
+ Raises:
499
+ ValueError: If AWS credentials are not found or token generation fails
500
+
501
+ """
502
+ try:
503
+ client = aws_config.get_rds_client()
504
+ token = client.generate_db_auth_token(
505
+ DBHostname=endpoint, Port=port, DBUsername=username
506
+ )
507
+ logger.debug(f"Generated RDS IAM token for {username}@{endpoint}:{port}")
508
+ return token
509
+ except NoCredentialsError as e:
510
+ raise ValueError("AWS credentials not found") from e
511
+ except ClientError as e:
512
+ raise ValueError(f"Failed to generate RDS IAM token: {e}") from e
513
+
514
+
515
+ class RDSIAMTokenManager:
516
+ """
517
+ Manages RDS IAM token lifecycle with automatic refresh.
518
+
519
+ RDS IAM tokens include expiration information in the URL parameters.
520
+ This manager parses the token expiry and refreshes before expiration
521
+ to ensure uninterrupted database access.
522
+ """
523
+
524
+ def __init__(
525
+ self,
526
+ endpoint: str,
527
+ username: str,
528
+ port: int,
529
+ aws_config: AwsConnectionConfig,
530
+ refresh_threshold_minutes: int = 5,
531
+ ):
532
+ """
533
+ Initialize the token manager.
534
+
535
+ Args:
536
+ endpoint: RDS endpoint hostname
537
+ username: Database username for IAM authentication
538
+ port: Database port
539
+ aws_config: AwsConnectionConfig for session management and credentials
540
+ refresh_threshold_minutes: Refresh token when this many minutes remain before expiry
541
+ """
542
+ self.endpoint = endpoint
543
+ self.username = username
544
+ self.port = port
545
+ self.aws_config = aws_config
546
+ self.refresh_threshold = timedelta(minutes=refresh_threshold_minutes)
547
+
548
+ self._current_token: Optional[str] = None
549
+ self._token_expires_at: Optional[datetime] = None
550
+
551
+ def get_token(self) -> str:
552
+ """
553
+ Get current token, refreshing if necessary.
554
+
555
+ Returns:
556
+ Valid authentication token
557
+
558
+ Raises:
559
+ RuntimeError: If token generation or refresh fails
560
+ """
561
+ if self._needs_refresh():
562
+ self._refresh_token()
563
+
564
+ assert self._current_token is not None
565
+ return self._current_token
566
+
567
+ def _needs_refresh(self) -> bool:
568
+ """Check if token needs to be refreshed."""
569
+ if self._current_token is None or self._token_expires_at is None:
570
+ return True
571
+
572
+ time_until_expiry = self._token_expires_at - datetime.now(timezone.utc)
573
+ return time_until_expiry <= self.refresh_threshold
574
+
575
+ def _parse_token_expiry(self, token: str) -> datetime:
576
+ """
577
+ Parse token expiry from X-Amz-Date and X-Amz-Expires URL parameters.
578
+
579
+ Args:
580
+ token: RDS IAM authentication token (presigned URL)
581
+
582
+ Returns:
583
+ Expiration datetime in UTC
584
+
585
+ Raises:
586
+ ValueError: If token URL format is invalid or missing required parameters
587
+ """
588
+ try:
589
+ parsed_url = urlparse(token)
590
+ query_params = parse_qs(parsed_url.query)
591
+
592
+ # Extract X-Amz-Date (ISO 8601 format: YYYYMMDDTHHMMSSZ)
593
+ amz_date_list = query_params.get("X-Amz-Date")
594
+ if not amz_date_list:
595
+ raise ValueError("Missing X-Amz-Date parameter in RDS IAM token")
596
+ amz_date_str = amz_date_list[0]
597
+
598
+ # Extract X-Amz-Expires (duration in seconds)
599
+ amz_expires_list = query_params.get("X-Amz-Expires")
600
+ if not amz_expires_list:
601
+ raise ValueError("Missing X-Amz-Expires parameter in RDS IAM token")
602
+ amz_expires_seconds = int(amz_expires_list[0])
603
+
604
+ # Parse X-Amz-Date to datetime
605
+ token_issued_at = datetime.strptime(amz_date_str, "%Y%m%dT%H%M%SZ").replace(
606
+ tzinfo=timezone.utc
607
+ )
608
+
609
+ # Calculate expiration
610
+ return token_issued_at + timedelta(seconds=amz_expires_seconds)
611
+
612
+ except (ValueError, KeyError, IndexError) as e:
613
+ raise ValueError(
614
+ f"Failed to parse RDS IAM token expiry: {e}. Token format may be invalid."
615
+ ) from e
616
+
617
+ def _refresh_token(self) -> None:
618
+ """Generate and store a new token with parsed expiry."""
619
+ logger.info("Refreshing RDS IAM authentication token")
620
+ self._current_token = generate_rds_iam_token(
621
+ endpoint=self.endpoint,
622
+ username=self.username,
623
+ port=self.port,
624
+ aws_config=self.aws_config,
625
+ )
626
+ self._token_expires_at = self._parse_token_expiry(self._current_token)
627
+ logger.debug(f"Token will expire at {self._token_expires_at}")
628
+
457
629
 
458
630
  class AwsSourceConfig(EnvConfigMixin, AwsConnectionConfig):
459
631
  """