acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,331 @@
1
+ # ABOUTME: Central registry for all environment variables used in metadata-ingestion.
2
+ # ABOUTME: All environment variable reads should go through this module for discoverability and maintainability.
3
+
4
+ import os
5
+ from typing import Optional
6
+
7
+ # ============================================================================
8
+ # Core DataHub Configuration
9
+ # ============================================================================
10
+
11
+
12
def get_gms_url() -> Optional[str]:
    """Return the complete GMS URL, which takes precedence over host/port.

    Returns:
        The value of ``DATAHUB_GMS_URL``, or ``None`` when it is unset.
    """
    return os.environ.get("DATAHUB_GMS_URL")
15
+
16
+
17
def get_gms_host() -> Optional[str]:
    """Return the deprecated GMS host setting, a fallback for the full URL.

    Returns:
        The value of ``DATAHUB_GMS_HOST``, or ``None`` when it is unset.
    """
    return os.environ.get("DATAHUB_GMS_HOST")
20
+
21
+
22
def get_gms_port() -> Optional[str]:
    """Return the GMS port number as the raw, unparsed string.

    Returns:
        The value of ``DATAHUB_GMS_PORT``, or ``None`` when it is unset.
    """
    return os.environ.get("DATAHUB_GMS_PORT")
25
+
26
+
27
def get_gms_protocol() -> str:
    """Return the protocol (http/https) used for the GMS connection.

    Returns:
        The value of ``DATAHUB_GMS_PROTOCOL``; ``"http"`` when it is unset.
    """
    protocol = os.environ.get("DATAHUB_GMS_PROTOCOL", "http")
    return protocol
30
+
31
+
32
def get_gms_token() -> Optional[str]:
    """Return the authentication token for GMS.

    Returns:
        The value of ``DATAHUB_GMS_TOKEN``, or ``None`` when it is unset.
    """
    return os.environ.get("DATAHUB_GMS_TOKEN")
35
+
36
+
37
def get_system_client_id() -> Optional[str]:
    """Return the system client ID used for OAuth/auth.

    Returns:
        The value of ``DATAHUB_SYSTEM_CLIENT_ID``, or ``None`` when unset.
    """
    return os.environ.get("DATAHUB_SYSTEM_CLIENT_ID")
40
+
41
+
42
def get_system_client_secret() -> Optional[str]:
    """Return the system client secret used for OAuth/auth.

    Returns:
        The value of ``DATAHUB_SYSTEM_CLIENT_SECRET``, or ``None`` when unset.
    """
    return os.environ.get("DATAHUB_SYSTEM_CLIENT_SECRET")
45
+
46
+
47
def get_skip_config() -> bool:
    """Report whether loading the config file should be skipped.

    Returns:
        ``True`` only when ``DATAHUB_SKIP_CONFIG`` equals ``"true"``
        (case-insensitive); ``False`` for any other value or when unset.
    """
    raw_flag = os.environ.get("DATAHUB_SKIP_CONFIG", "")
    return raw_flag.lower() == "true"
50
+
51
+
52
def get_gms_base_path() -> str:
    """Return the base path for GMS API endpoints.

    Returns:
        The value of ``DATAHUB_GMS_BASE_PATH``; an empty string when unset.
    """
    base_path = os.environ.get("DATAHUB_GMS_BASE_PATH", "")
    return base_path
55
+
56
+
57
+ # ============================================================================
58
+ # REST Emitter Configuration
59
+ # ============================================================================
60
+
61
+
62
def get_rest_emitter_default_retry_max_times() -> str:
    """Return the max retry attempts for failed requests, as a raw string.

    Returns:
        The value of ``DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES``;
        ``"4"`` when it is unset. The value is not converted to an integer.
    """
    retries = os.environ.get("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
    return retries
65
+
66
+
67
def get_rest_emitter_batch_max_payload_bytes() -> int:
    """Return the maximum payload size, in bytes, for batch operations.

    Returns:
        ``DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_BYTES`` parsed as an int;
        15 MiB (``15 * 1024 * 1024``) when it is unset.

    Raises:
        ValueError: If the variable is set to a non-integer string.
    """
    fallback = str(15 * 1024 * 1024)
    raw_limit = os.environ.get("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_BYTES", fallback)
    return int(raw_limit)
72
+
73
+
74
def get_rest_emitter_batch_max_payload_length() -> int:
    """Return the maximum number of MCPs per batch.

    Returns:
        ``DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH`` parsed as an int;
        200 when it is unset.

    Raises:
        ValueError: If the variable is set to a non-integer string.
    """
    raw_length = os.environ.get("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", "200")
    return int(raw_length)
77
+
78
+
79
def get_emit_mode() -> Optional[str]:
    """Return the emission mode (SYNC_PRIMARY, SYNC_WAIT, ASYNC, ASYNC_WAIT).

    Returns:
        The value of ``DATAHUB_EMIT_MODE``, or ``None`` when it is unset.
        The value is returned as-is, without validation.
    """
    return os.environ.get("DATAHUB_EMIT_MODE")
82
+
83
+
84
def get_rest_emitter_default_endpoint() -> Optional[str]:
    """Return the REST endpoint type (RESTLI or OPENAPI).

    Returns:
        The value of ``DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT``, or ``None``
        when it is unset. The value is returned as-is, without validation.
    """
    return os.environ.get("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
87
+
88
+
89
def get_emitter_trace() -> bool:
    """Report whether detailed emitter tracing is enabled.

    Returns:
        ``True`` only when ``DATAHUB_EMITTER_TRACE`` equals ``"true"``
        (case-insensitive); ``False`` for any other value or when unset.
    """
    raw_flag = os.environ.get("DATAHUB_EMITTER_TRACE", "")
    return raw_flag.lower() == "true"
92
+
93
+
94
+ # ============================================================================
95
+ # REST Sink Configuration
96
+ # ============================================================================
97
+
98
+
99
def get_rest_sink_default_max_threads() -> int:
    """Return the max thread pool size for async operations.

    Returns:
        ``DATAHUB_REST_SINK_DEFAULT_MAX_THREADS`` parsed as an int; 15 when
        it is unset.

    Raises:
        ValueError: If the variable is set to a non-integer string.
    """
    raw_threads = os.environ.get("DATAHUB_REST_SINK_DEFAULT_MAX_THREADS", "15")
    return int(raw_threads)
102
+
103
+
104
def get_rest_sink_default_mode() -> Optional[str]:
    """Return the sink mode (SYNC, ASYNC, ASYNC_BATCH).

    Returns:
        The value of ``DATAHUB_REST_SINK_DEFAULT_MODE``, or ``None`` when it
        is unset. The value is returned as-is, without validation.
    """
    return os.environ.get("DATAHUB_REST_SINK_DEFAULT_MODE")
107
+
108
+
109
+ # ============================================================================
110
+ # Telemetry & Monitoring
111
+ # ============================================================================
112
+
113
+
114
def get_telemetry_timeout() -> str:
    """Return the telemetry timeout in seconds, as a raw string.

    Returns:
        The value of ``DATAHUB_TELEMETRY_TIMEOUT``; ``"10"`` when it is
        unset. The value is not converted to a number.
    """
    timeout = os.environ.get("DATAHUB_TELEMETRY_TIMEOUT", "10")
    return timeout
117
+
118
+
119
def get_sentry_dsn() -> Optional[str]:
    """Return the Sentry error tracking DSN.

    Returns:
        The value of ``SENTRY_DSN``, or ``None`` when it is unset.
    """
    return os.environ.get("SENTRY_DSN")
122
+
123
+
124
def get_sentry_environment() -> str:
    """Return the Sentry environment name (dev/prod).

    Returns:
        The value of ``SENTRY_ENVIRONMENT``; ``"dev"`` when it is unset.
    """
    environment = os.environ.get("SENTRY_ENVIRONMENT", "dev")
    return environment
127
+
128
+
129
+ # ============================================================================
130
+ # Logging & Debug Configuration
131
+ # ============================================================================
132
+
133
+
134
def get_suppress_logging_manager() -> Optional[str]:
    """Return the flag that suppresses DataHub logging manager initialization.

    Returns:
        The raw value of ``DATAHUB_SUPPRESS_LOGGING_MANAGER``, or ``None``
        when it is unset. Interpreting the value is left to the caller.
    """
    return os.environ.get("DATAHUB_SUPPRESS_LOGGING_MANAGER")
137
+
138
+
139
def get_no_color() -> bool:
    """Report whether colored logging output should be disabled.

    Follows the NO_COLOR convention (https://no-color.org): color is
    disabled whenever the variable is present and not an empty string,
    regardless of its value. Comparing against the literal ``"true"`` would
    ignore common settings such as ``NO_COLOR=1``.

    Returns:
        ``True`` when ``NO_COLOR`` is set to any non-empty value; ``False``
        when it is unset or empty.
    """
    return bool(os.getenv("NO_COLOR"))
142
+
143
+
144
def get_test_mode() -> Optional[str]:
    """Return the marker indicating that code is running in a test context.

    Returns:
        The raw value of ``DATAHUB_TEST_MODE``, or ``None`` when it is unset.
    """
    return os.environ.get("DATAHUB_TEST_MODE")
147
+
148
+
149
def get_debug() -> bool:
    """Report whether debug mode is enabled.

    Returns:
        ``True`` only when ``DATAHUB_DEBUG`` equals ``"true"``
        (case-insensitive); ``False`` for any other value or when unset.
    """
    raw_flag = os.environ.get("DATAHUB_DEBUG", "")
    return raw_flag.lower() == "true"
152
+
153
+
154
+ # ============================================================================
155
+ # Data Processing Configuration
156
+ # ============================================================================
157
+
158
+
159
def get_sql_agg_query_log() -> str:
    """Return the SQL aggregator query logging level.

    Returns:
        The value of ``DATAHUB_SQL_AGG_QUERY_LOG``; ``"DISABLED"`` when it
        is unset.
    """
    log_level = os.environ.get("DATAHUB_SQL_AGG_QUERY_LOG", "DISABLED")
    return log_level
162
+
163
+
164
def get_dataset_urn_to_lower() -> str:
    """Return the raw flag controlling lowercase conversion of dataset URNs.

    Returns:
        The value of ``DATAHUB_DATASET_URN_TO_LOWER`` as a string;
        ``"false"`` when it is unset. No boolean conversion is done here.
    """
    flag_text = os.environ.get("DATAHUB_DATASET_URN_TO_LOWER", "false")
    return flag_text
167
+
168
+
169
+ # ============================================================================
170
+ # Integration-Specific Configuration
171
+ # ============================================================================
172
+
173
+
174
def get_kafka_schema_registry_url() -> Optional[str]:
    """Return the Kafka schema registry URL.

    Returns:
        The value of ``KAFKA_SCHEMAREGISTRY_URL``, or ``None`` when unset.
    """
    return os.environ.get("KAFKA_SCHEMAREGISTRY_URL")
177
+
178
+
179
def get_spark_version() -> Optional[str]:
    """Return the Spark version (used for the S3 source).

    Returns:
        The value of ``SPARK_VERSION``, or ``None`` when it is unset.
    """
    return os.environ.get("SPARK_VERSION")
182
+
183
+
184
def get_bigquery_schema_parallelism() -> int:
    """Return the parallelism level for BigQuery schema extraction.

    Returns:
        ``DATAHUB_BIGQUERY_SCHEMA_PARALLELISM`` parsed as an int; 20 when it
        is unset.

    Raises:
        ValueError: If the variable is set to a non-integer string.
    """
    raw_parallelism = os.environ.get("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", "20")
    return int(raw_parallelism)
187
+
188
+
189
def get_snowflake_schema_parallelism() -> int:
    """Return the parallelism level for Snowflake schema extraction.

    Returns:
        ``DATAHUB_SNOWFLAKE_SCHEMA_PARALLELISM`` parsed as an int; 20 when
        it is unset.

    Raises:
        ValueError: If the variable is set to a non-integer string.
    """
    raw_parallelism = os.environ.get("DATAHUB_SNOWFLAKE_SCHEMA_PARALLELISM", "20")
    return int(raw_parallelism)
192
+
193
+
194
def get_powerbi_m_query_parse_timeout() -> int:
    """Return the timeout for PowerBI M query parsing.

    Returns:
        ``DATAHUB_POWERBI_M_QUERY_PARSE_TIMEOUT`` parsed as an int; 60 when
        it is unset.

    Raises:
        ValueError: If the variable is set to a non-integer string.
    """
    raw_timeout = os.environ.get("DATAHUB_POWERBI_M_QUERY_PARSE_TIMEOUT", "60")
    return int(raw_timeout)
197
+
198
+
199
def get_trace_powerbi_mquery_parser() -> bool:
    """Report whether PowerBI M query parser tracing is enabled.

    Returns:
        ``True`` only when ``DATAHUB_TRACE_POWERBI_MQUERY_PARSER`` equals
        ``"true"`` (case-insensitive); ``False`` otherwise or when unset.
    """
    raw_flag = os.environ.get("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", "")
    return raw_flag.lower() == "true"
202
+
203
+
204
def get_lookml_git_test_ssh_key() -> Optional[str]:
    """Return the SSH key used for LookML Git tests.

    Returns:
        The value of ``DATAHUB_LOOKML_GIT_TEST_SSH_KEY``, or ``None`` when
        it is unset.
    """
    return os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
207
+
208
+
209
+ # ============================================================================
210
+ # AWS/Cloud Configuration
211
+ # ============================================================================
212
+
213
+
214
def get_aws_lambda_function_name() -> Optional[str]:
    """Return the AWS Lambda function name; its presence indicates Lambda.

    Returns:
        The value of ``AWS_LAMBDA_FUNCTION_NAME``, or ``None`` when unset.
    """
    return os.environ.get("AWS_LAMBDA_FUNCTION_NAME")
217
+
218
+
219
def get_aws_execution_env() -> Optional[str]:
    """Return the AWS execution environment identifier.

    Returns:
        The value of ``AWS_EXECUTION_ENV``, or ``None`` when it is unset.
    """
    return os.environ.get("AWS_EXECUTION_ENV")
222
+
223
+
224
def get_aws_web_identity_token_file() -> Optional[str]:
    """Return the OIDC token file path.

    Returns:
        The value of ``AWS_WEB_IDENTITY_TOKEN_FILE``, or ``None`` when unset.
    """
    return os.environ.get("AWS_WEB_IDENTITY_TOKEN_FILE")
227
+
228
+
229
def get_aws_role_arn() -> Optional[str]:
    """Return the AWS role ARN used for OIDC.

    Returns:
        The value of ``AWS_ROLE_ARN``, or ``None`` when it is unset.
    """
    return os.environ.get("AWS_ROLE_ARN")
232
+
233
+
234
def get_aws_app_runner_service_id() -> Optional[str]:
    """Return the AWS App Runner service ID.

    Returns:
        The value of ``AWS_APP_RUNNER_SERVICE_ID``, or ``None`` when unset.
    """
    return os.environ.get("AWS_APP_RUNNER_SERVICE_ID")
237
+
238
+
239
def get_ecs_container_metadata_uri_v4() -> Optional[str]:
    """Return the ECS metadata endpoint (v4).

    Returns:
        The value of ``ECS_CONTAINER_METADATA_URI_V4``, or ``None`` when
        it is unset.
    """
    return os.environ.get("ECS_CONTAINER_METADATA_URI_V4")
242
+
243
+
244
def get_ecs_container_metadata_uri() -> Optional[str]:
    """Return the ECS metadata endpoint (v3).

    Returns:
        The value of ``ECS_CONTAINER_METADATA_URI``, or ``None`` when unset.
    """
    return os.environ.get("ECS_CONTAINER_METADATA_URI")
247
+
248
+
249
def get_elastic_beanstalk_environment_name() -> Optional[str]:
    """Return the Elastic Beanstalk environment name.

    Returns:
        The value of ``ELASTIC_BEANSTALK_ENVIRONMENT_NAME``, or ``None``
        when it is unset.
    """
    return os.environ.get("ELASTIC_BEANSTALK_ENVIRONMENT_NAME")
252
+
253
+
254
+ # ============================================================================
255
+ # Docker & Local Development
256
+ # ============================================================================
257
+
258
+
259
def get_compose_project_name() -> str:
    """Return the Docker Compose project name.

    Returns:
        The value of ``DATAHUB_COMPOSE_PROJECT_NAME``; ``"datahub"`` when
        it is unset.
    """
    project_name = os.environ.get("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
    return project_name
262
+
263
+
264
def get_docker_compose_base() -> Optional[str]:
    """Return the base path for Docker Compose files.

    Returns:
        The value of ``DOCKER_COMPOSE_BASE``, or ``None`` when it is unset.
    """
    return os.environ.get("DOCKER_COMPOSE_BASE")
267
+
268
+
269
def get_datahub_version() -> Optional[str]:
    """Return the DataHub version (set during docker init).

    Returns:
        The value of ``DATAHUB_VERSION``, or ``None`` when it is unset.
    """
    return os.environ.get("DATAHUB_VERSION")
272
+
273
+
274
def get_mapped_mysql_port() -> Optional[str]:
    """Return the MySQL port mapping (set during docker init).

    Returns:
        The raw value of ``DATAHUB_MAPPED_MYSQL_PORT``, or ``None`` when
        it is unset.
    """
    return os.environ.get("DATAHUB_MAPPED_MYSQL_PORT")
277
+
278
+
279
def get_mapped_kafka_broker_port() -> Optional[str]:
    """Return the Kafka broker port mapping (set during docker init).

    Returns:
        The raw value of ``DATAHUB_MAPPED_KAFKA_BROKER_PORT``, or ``None``
        when it is unset.
    """
    return os.environ.get("DATAHUB_MAPPED_KAFKA_BROKER_PORT")
282
+
283
+
284
def get_mapped_elastic_port() -> Optional[str]:
    """Return the Elasticsearch port mapping (set during docker init).

    Returns:
        The raw value of ``DATAHUB_MAPPED_ELASTIC_PORT``, or ``None`` when
        it is unset.
    """
    return os.environ.get("DATAHUB_MAPPED_ELASTIC_PORT")
287
+
288
+
289
def get_metadata_service_auth_enabled() -> str:
    """Return the raw flag that enables or disables auth in Docker.

    Returns:
        The value of ``METADATA_SERVICE_AUTH_ENABLED`` as a string;
        ``"false"`` when it is unset. No boolean conversion is done here.
    """
    flag_text = os.environ.get("METADATA_SERVICE_AUTH_ENABLED", "false")
    return flag_text
292
+
293
+
294
def get_ui_ingestion_default_cli_version() -> Optional[str]:
    """Return the CLI version for UI ingestion (set during init).

    Returns:
        The value of ``UI_INGESTION_DEFAULT_CLI_VERSION``, or ``None`` when
        it is unset.
    """
    return os.environ.get("UI_INGESTION_DEFAULT_CLI_VERSION")
297
+
298
+
299
+ # ============================================================================
300
+ # Utility & Helper Configuration
301
+ # ============================================================================
302
+
303
+
304
def get_datahub_component() -> str:
    """Return the component name used for user agent tracking.

    Returns:
        The value of ``DATAHUB_COMPONENT``; ``"datahub"`` when it is unset.
    """
    component = os.environ.get("DATAHUB_COMPONENT", "datahub")
    return component
307
+
308
+
309
def get_force_local_quickstart_mapping() -> str:
    """Return the setting that forces a local quickstart mapping file.

    Returns:
        The value of ``FORCE_LOCAL_QUICKSTART_MAPPING``; an empty string
        when it is unset.
    """
    mapping = os.environ.get("FORCE_LOCAL_QUICKSTART_MAPPING", "")
    return mapping
312
+
313
+
314
def get_dataproduct_external_url() -> Optional[str]:
    """Return the external URL for data products.

    Returns:
        The value of ``DATAHUB_DATAPRODUCT_EXTERNAL_URL``, or ``None`` when
        it is unset.
    """
    return os.environ.get("DATAHUB_DATAPRODUCT_EXTERNAL_URL")
317
+
318
+
319
def get_override_sqlite_version_req() -> str:
    """Return the override for the SQLite version requirement.

    Returns:
        The value of ``OVERRIDE_SQLITE_VERSION_REQ``; an empty string when
        it is unset.
    """
    override = os.environ.get("OVERRIDE_SQLITE_VERSION_REQ", "")
    return override
322
+
323
+
324
def get_update_entity_registry() -> str:
    """Return the raw flag for updating the entity registry during tests.

    Returns:
        The value of ``UPDATE_ENTITY_REGISTRY`` as a string; ``"false"``
        when it is unset. No boolean conversion is done here.
    """
    flag_text = os.environ.get("UPDATE_ENTITY_REGISTRY", "false")
    return flag_text
327
+
328
+
329
def get_ci() -> Optional[str]:
    """Return the marker indicating that code is running in a CI environment.

    Returns:
        The raw value of ``CI``, or ``None`` when it is unset.
    """
    return os.environ.get("CI")
@@ -1,15 +1,18 @@
1
- from typing import TypeVar, Union
1
+ from typing import TYPE_CHECKING, Type, TypeVar, Union
2
2
 
3
3
  import pydantic
4
4
 
5
5
  from datahub.ingestion.api.registry import import_path
6
6
 
7
- T = TypeVar("T")
7
+ if TYPE_CHECKING:
8
+ from pydantic.deprecated.class_validators import V1Validator
8
9
 
10
+ _T = TypeVar("_T")
9
11
 
10
- def _pydantic_resolver(v: Union[T, str]) -> T:
12
+
13
+ def _pydantic_resolver(cls: Type, v: Union[str, _T]) -> _T:
11
14
  return import_path(v) if isinstance(v, str) else v
12
15
 
13
16
 
14
- def pydantic_resolve_key(field: str) -> classmethod:
17
+ def pydantic_resolve_key(field: str) -> "V1Validator":
15
18
  return pydantic.validator(field, pre=True, allow_reuse=True)(_pydantic_resolver)
@@ -1,16 +1,36 @@
1
1
  from pydantic import Field, validator
2
2
 
3
3
  from datahub.configuration.common import ConfigModel, ConfigurationError
4
+ from datahub.configuration.env_vars import (
5
+ get_gms_base_path,
6
+ get_kafka_schema_registry_url,
7
+ )
4
8
  from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
5
9
  from datahub.configuration.validate_host_port import validate_host_port
6
10
 
7
11
 
12
def _get_schema_registry_url() -> str:
    """Get the default schema registry URL with proper base path handling.

    An explicit, non-empty KAFKA_SCHEMAREGISTRY_URL always wins. Otherwise
    the URL is built on ``http://localhost:8080`` with the GMS base path
    (DATAHUB_GMS_BASE_PATH) inserted before ``/schema-registry/api/``.

    Returns:
        The schema registry URL to use as the config default.
    """
    explicit_url = get_kafka_schema_registry_url()
    if explicit_url:
        return explicit_url

    # Normalize the base path to "" or "/segment[/segment...]" so that a
    # trailing slash ("/datahub/") does not yield "//schema-registry" and a
    # missing leading slash ("datahub") does not get glued onto the port.
    trimmed = get_gms_base_path().strip("/")
    base_path = f"/{trimmed}" if trimmed else ""

    return f"http://localhost:8080{base_path}/schema-registry/api/"
23
+
24
+
8
25
  class _KafkaConnectionConfig(ConfigModel):
9
26
  # bootstrap servers
10
27
  bootstrap: str = "localhost:9092"
11
28
 
12
29
  # schema registry location
13
- schema_registry_url: str = "http://localhost:8080/schema-registry/api/"
30
+ schema_registry_url: str = Field(
31
+ default_factory=_get_schema_registry_url,
32
+ description="Schema registry URL. Can be overridden with KAFKA_SCHEMAREGISTRY_URL environment variable, or will use DATAHUB_GMS_BASE_PATH if not set.",
33
+ )
14
34
 
15
35
  schema_registry_config: dict = Field(
16
36
  default_factory=dict,
@@ -1,20 +1,13 @@
1
1
  import pydantic.version
2
2
  from packaging.version import Version
3
3
 
4
- PYDANTIC_VERSION_2: bool
5
- if Version(pydantic.version.VERSION) >= Version("2.0"):
6
- PYDANTIC_VERSION_2 = True
7
- else:
8
- PYDANTIC_VERSION_2 = False
9
-
4
+ _pydantic_version = Version(pydantic.version.VERSION)
10
5
 
11
- # This can be used to silence deprecation warnings while we migrate.
12
- if PYDANTIC_VERSION_2:
13
- from pydantic import PydanticDeprecatedSince20 # type: ignore
14
- else:
6
+ PYDANTIC_VERSION_2 = _pydantic_version >= Version("2.0")
15
7
 
16
- class PydanticDeprecatedSince20(Warning): # type: ignore
17
- pass
8
+ # The pydantic.Discriminator type was added in v2.5.0.
9
+ # https://docs.pydantic.dev/latest/changelog/#v250-2023-11-13
10
+ PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR = _pydantic_version >= Version("2.5.0")
18
11
 
19
12
 
20
13
  if PYDANTIC_VERSION_2:
@@ -50,7 +43,7 @@ class v1_ConfigModel(v1_BaseModel):
50
43
 
51
44
  __all__ = [
52
45
  "PYDANTIC_VERSION_2",
53
- "PydanticDeprecatedSince20",
46
+ "PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR",
54
47
  "GenericModel",
55
48
  "v1_ConfigModel",
56
49
  "v1_Field",
@@ -1,6 +1,6 @@
1
1
  from typing import Dict, Optional
2
2
 
3
- from pydantic import validator
3
+ import pydantic
4
4
  from pydantic.fields import Field
5
5
 
6
6
  from datahub.configuration.common import ConfigModel
@@ -16,7 +16,7 @@ class PlatformInstanceConfigMixin(ConfigModel):
16
16
  default=None,
17
17
  description="The instance of the platform that all assets produced by this recipe belong to. "
18
18
  "This should be unique within the platform. "
19
- "See https://datahubproject.io/docs/platform-instances/ for more details.",
19
+ "See https://docs.datahub.com/docs/platform-instances/ for more details.",
20
20
  )
21
21
 
22
22
 
@@ -30,7 +30,8 @@ class EnvConfigMixin(ConfigModel):
30
30
  description="The environment that all assets produced by this connector belong to",
31
31
  )
32
32
 
33
- @validator("env")
33
+ @pydantic.field_validator("env", mode="after")
34
+ @classmethod
34
35
  def env_must_be_one_of(cls, v: str) -> str:
35
36
  if v.upper() not in ALL_ENV_TYPES:
36
37
  raise ValueError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
@@ -1,11 +1,14 @@
1
1
  import warnings
2
- from typing import Any, Optional, Type
2
+ from typing import TYPE_CHECKING, Any, Optional, Type
3
3
 
4
4
  import pydantic
5
5
 
6
6
  from datahub.configuration.common import ConfigurationWarning
7
7
  from datahub.utilities.global_warning_util import add_global_warning
8
8
 
9
+ if TYPE_CHECKING:
10
+ from pydantic.deprecated.class_validators import V1RootValidator
11
+
9
12
  _unset = object()
10
13
 
11
14
 
@@ -13,7 +16,7 @@ def pydantic_field_deprecated(
13
16
  field: str,
14
17
  warn_if_value_is_not: Any = _unset,
15
18
  message: Optional[str] = None,
16
- ) -> classmethod:
19
+ ) -> "V1RootValidator":
17
20
  if message:
18
21
  output = message
19
22
  else:
@@ -1,15 +1,18 @@
1
1
  import warnings
2
- from typing import Type
2
+ from typing import TYPE_CHECKING, Type
3
3
 
4
4
  import pydantic
5
5
 
6
6
  from datahub.configuration.common import ConfigurationWarning
7
7
 
8
+ if TYPE_CHECKING:
9
+ from pydantic.deprecated.class_validators import V1RootValidator
10
+
8
11
 
9
12
  def pydantic_removed_field(
10
13
  field: str,
11
14
  print_warning: bool = True,
12
- ) -> classmethod:
15
+ ) -> "V1RootValidator":
13
16
  def _validate_field_removal(cls: Type, values: dict) -> dict:
14
17
  if field in values:
15
18
  if print_warning:
@@ -21,6 +24,9 @@ def pydantic_removed_field(
21
24
  values.pop(field)
22
25
  return values
23
26
 
27
+ # Mark the function as handling a removed field for doc generation
28
+ _validate_field_removal._doc_removed_field = field # type: ignore[attr-defined]
29
+
24
30
  # Hack: Pydantic maintains unique list of validators by referring its __name__.
25
31
  # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
26
32
  # This hack ensures that multiple field removals do not overwrite each other.
@@ -1,11 +1,14 @@
1
1
  import warnings
2
- from typing import Callable, Type, TypeVar
2
+ from typing import TYPE_CHECKING, Callable, Type, TypeVar
3
3
 
4
4
  import pydantic
5
5
 
6
6
  from datahub.configuration.common import ConfigurationWarning
7
7
  from datahub.utilities.global_warning_util import add_global_warning
8
8
 
9
+ if TYPE_CHECKING:
10
+ from pydantic.deprecated.class_validators import V1RootValidator
11
+
9
12
  _T = TypeVar("_T")
10
13
 
11
14
 
@@ -18,7 +21,7 @@ def pydantic_renamed_field(
18
21
  new_name: str,
19
22
  transform: Callable = _default_rename_transform,
20
23
  print_warning: bool = True,
21
- ) -> classmethod:
24
+ ) -> "V1RootValidator":
22
25
  def _validate_field_rename(cls: Type, values: dict) -> dict:
23
26
  if old_name in values:
24
27
  if new_name in values:
@@ -49,6 +52,4 @@ def pydantic_renamed_field(
49
52
  # validator with pre=True gets all the values that were passed in.
50
53
  # Given that a renamed field doesn't show up in the fields list, we can't use
51
54
  # the field-level validator, even with a different field name.
52
- return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)(
53
- _validate_field_rename
54
- )
55
+ return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
@@ -1,9 +1,12 @@
1
- from typing import Optional, Type, Union
1
+ from typing import TYPE_CHECKING, Optional, Type, Union
2
2
 
3
3
  import pydantic
4
4
 
5
+ if TYPE_CHECKING:
6
+ from pydantic.deprecated.class_validators import V1Validator
5
7
 
6
- def pydantic_multiline_string(field: str) -> classmethod:
8
+
9
+ def pydantic_multiline_string(field: str) -> "V1Validator":
7
10
  """If the field is present and contains an escaped newline, replace it with a real newline.
8
11
 
9
12
  This makes the assumption that the field value is never supposed to have a
@@ -3,7 +3,6 @@
3
3
  import hashlib
4
4
  import json
5
5
  import logging
6
- import os
7
6
  import re
8
7
  import time
9
8
  from datetime import datetime, timezone
@@ -26,6 +25,7 @@ import typing_inspect
26
25
  from avrogen.dict_wrapper import DictWrapper
27
26
  from typing_extensions import assert_never
28
27
 
28
+ from datahub.configuration.env_vars import get_dataset_urn_to_lower
29
29
  from datahub.emitter.enum_helpers import get_enum_options
30
30
  from datahub.metadata.schema_classes import (
31
31
  AssertionKeyClass,
@@ -59,6 +59,7 @@ from datahub.metadata.urns import (
59
59
  DataJobUrn,
60
60
  DataPlatformUrn,
61
61
  DatasetUrn,
62
+ OwnershipTypeUrn,
62
63
  TagUrn,
63
64
  )
64
65
  from datahub.utilities.urn_encoder import UrnEncoder
@@ -71,9 +72,7 @@ ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass))
71
72
 
72
73
  DEFAULT_FLOW_CLUSTER = "prod"
73
74
  UNKNOWN_USER = "urn:li:corpuser:unknown"
74
- DATASET_URN_TO_LOWER: bool = (
75
- os.getenv("DATAHUB_DATASET_URN_TO_LOWER", "false") == "true"
76
- )
75
+ DATASET_URN_TO_LOWER: bool = get_dataset_urn_to_lower() == "true"
77
76
 
78
77
  if TYPE_CHECKING:
79
78
  from datahub.emitter.mcp_builder import DatahubKey
@@ -125,9 +124,7 @@ def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
125
124
 
126
125
 
127
126
  def make_data_platform_urn(platform: str) -> str:
128
- if platform.startswith("urn:li:dataPlatform:"):
129
- return platform
130
- return DataPlatformUrn.create_from_id(platform).urn()
127
+ return DataPlatformUrn(platform).urn()
131
128
 
132
129
 
133
130
  def make_dataset_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
@@ -377,6 +374,12 @@ def make_domain_urn(domain: str) -> str:
377
374
  return f"urn:li:domain:{domain}"
378
375
 
379
376
 
377
def make_data_product_urn(data_product_id: str) -> str:
    """Build a data product URN string from an id or an existing URN.

    Args:
        data_product_id: A bare data product id, or a string that already
            starts with ``urn:li:dataProduct:``.

    Returns:
        The input unchanged when it already carries the data product URN
        prefix; otherwise the prefix followed by the id.
    """
    urn_prefix = "urn:li:dataProduct:"
    already_urn = data_product_id.startswith(urn_prefix)
    return data_product_id if already_urn else f"{urn_prefix}{data_product_id}"
381
+
382
+
380
383
  def make_ml_primary_key_urn(feature_table_name: str, primary_key_name: str) -> str:
381
384
  return f"urn:li:mlPrimaryKey:({feature_table_name},{primary_key_name})"
382
385
 
@@ -408,7 +411,8 @@ def make_ml_model_group_urn(platform: str, group_name: str, env: str) -> str:
408
411
 
409
412
  def validate_ownership_type(ownership_type: str) -> Tuple[str, Optional[str]]:
410
413
  if ownership_type.startswith("urn:li:"):
411
- return OwnershipTypeClass.CUSTOM, ownership_type
414
+ ownership_type_urn = OwnershipTypeUrn.from_string(ownership_type)
415
+ return OwnershipTypeClass.CUSTOM, ownership_type_urn.urn()
412
416
  ownership_type = ownership_type.upper()
413
417
  if ownership_type in get_enum_options(OwnershipTypeClass):
414
418
  return ownership_type, None