acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,36 @@
1
1
  from pydantic import Field, validator
2
2
 
3
3
  from datahub.configuration.common import ConfigModel, ConfigurationError
4
+ from datahub.configuration.env_vars import (
5
+ get_gms_base_path,
6
+ get_kafka_schema_registry_url,
7
+ )
4
8
  from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
5
9
  from datahub.configuration.validate_host_port import validate_host_port
6
10
 
7
11
 
12
+ def _get_schema_registry_url() -> str:
13
+ """Get schema registry URL with proper base path handling."""
14
+ explicit_url = get_kafka_schema_registry_url()
15
+ if explicit_url:
16
+ return explicit_url
17
+
18
+ base_path = get_gms_base_path()
19
+ if base_path in ("/", ""):
20
+ base_path = ""
21
+
22
+ return f"http://localhost:8080{base_path}/schema-registry/api/"
23
+
24
+
8
25
  class _KafkaConnectionConfig(ConfigModel):
9
26
  # bootstrap servers
10
27
  bootstrap: str = "localhost:9092"
11
28
 
12
29
  # schema registry location
13
- schema_registry_url: str = "http://localhost:8080/schema-registry/api/"
30
+ schema_registry_url: str = Field(
31
+ default_factory=_get_schema_registry_url,
32
+ description="Schema registry URL. Can be overridden with KAFKA_SCHEMAREGISTRY_URL environment variable, or will use DATAHUB_GMS_BASE_PATH if not set.",
33
+ )
14
34
 
15
35
  schema_registry_config: dict = Field(
16
36
  default_factory=dict,
@@ -1,20 +1,13 @@
1
1
  import pydantic.version
2
2
  from packaging.version import Version
3
3
 
4
- PYDANTIC_VERSION_2: bool
5
- if Version(pydantic.version.VERSION) >= Version("2.0"):
6
- PYDANTIC_VERSION_2 = True
7
- else:
8
- PYDANTIC_VERSION_2 = False
9
-
4
+ _pydantic_version = Version(pydantic.version.VERSION)
10
5
 
11
- # This can be used to silence deprecation warnings while we migrate.
12
- if PYDANTIC_VERSION_2:
13
- from pydantic import PydanticDeprecatedSince20 # type: ignore
14
- else:
6
+ PYDANTIC_VERSION_2 = _pydantic_version >= Version("2.0")
15
7
 
16
- class PydanticDeprecatedSince20(Warning): # type: ignore
17
- pass
8
+ # The pydantic.Discriminator type was added in v2.5.0.
9
+ # https://docs.pydantic.dev/latest/changelog/#v250-2023-11-13
10
+ PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR = _pydantic_version >= Version("2.5.0")
18
11
 
19
12
 
20
13
  if PYDANTIC_VERSION_2:
@@ -50,7 +43,7 @@ class v1_ConfigModel(v1_BaseModel):
50
43
 
51
44
  __all__ = [
52
45
  "PYDANTIC_VERSION_2",
53
- "PydanticDeprecatedSince20",
46
+ "PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR",
54
47
  "GenericModel",
55
48
  "v1_ConfigModel",
56
49
  "v1_Field",
@@ -1,6 +1,6 @@
1
1
  from typing import Dict, Optional
2
2
 
3
- from pydantic import validator
3
+ import pydantic
4
4
  from pydantic.fields import Field
5
5
 
6
6
  from datahub.configuration.common import ConfigModel
@@ -30,7 +30,8 @@ class EnvConfigMixin(ConfigModel):
30
30
  description="The environment that all assets produced by this connector belong to",
31
31
  )
32
32
 
33
- @validator("env")
33
+ @pydantic.field_validator("env", mode="after")
34
+ @classmethod
34
35
  def env_must_be_one_of(cls, v: str) -> str:
35
36
  if v.upper() not in ALL_ENV_TYPES:
36
37
  raise ValueError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
@@ -1,11 +1,14 @@
1
1
  import warnings
2
- from typing import Any, Optional, Type
2
+ from typing import TYPE_CHECKING, Any, Optional, Type
3
3
 
4
4
  import pydantic
5
5
 
6
6
  from datahub.configuration.common import ConfigurationWarning
7
7
  from datahub.utilities.global_warning_util import add_global_warning
8
8
 
9
+ if TYPE_CHECKING:
10
+ from pydantic.deprecated.class_validators import V1RootValidator
11
+
9
12
  _unset = object()
10
13
 
11
14
 
@@ -13,7 +16,7 @@ def pydantic_field_deprecated(
13
16
  field: str,
14
17
  warn_if_value_is_not: Any = _unset,
15
18
  message: Optional[str] = None,
16
- ) -> classmethod:
19
+ ) -> "V1RootValidator":
17
20
  if message:
18
21
  output = message
19
22
  else:
@@ -1,15 +1,18 @@
1
1
  import warnings
2
- from typing import Type
2
+ from typing import TYPE_CHECKING, Type
3
3
 
4
4
  import pydantic
5
5
 
6
6
  from datahub.configuration.common import ConfigurationWarning
7
7
 
8
+ if TYPE_CHECKING:
9
+ from pydantic.deprecated.class_validators import V1RootValidator
10
+
8
11
 
9
12
  def pydantic_removed_field(
10
13
  field: str,
11
14
  print_warning: bool = True,
12
- ) -> classmethod:
15
+ ) -> "V1RootValidator":
13
16
  def _validate_field_removal(cls: Type, values: dict) -> dict:
14
17
  if field in values:
15
18
  if print_warning:
@@ -21,6 +24,9 @@ def pydantic_removed_field(
21
24
  values.pop(field)
22
25
  return values
23
26
 
27
+ # Mark the function as handling a removed field for doc generation
28
+ _validate_field_removal._doc_removed_field = field # type: ignore[attr-defined]
29
+
24
30
  # Hack: Pydantic maintains unique list of validators by referring its __name__.
25
31
  # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
26
32
  # This hack ensures that multiple field removals do not overwrite each other.
@@ -1,11 +1,14 @@
1
1
  import warnings
2
- from typing import Callable, Type, TypeVar
2
+ from typing import TYPE_CHECKING, Callable, Type, TypeVar
3
3
 
4
4
  import pydantic
5
5
 
6
6
  from datahub.configuration.common import ConfigurationWarning
7
7
  from datahub.utilities.global_warning_util import add_global_warning
8
8
 
9
+ if TYPE_CHECKING:
10
+ from pydantic.deprecated.class_validators import V1RootValidator
11
+
9
12
  _T = TypeVar("_T")
10
13
 
11
14
 
@@ -18,7 +21,7 @@ def pydantic_renamed_field(
18
21
  new_name: str,
19
22
  transform: Callable = _default_rename_transform,
20
23
  print_warning: bool = True,
21
- ) -> classmethod:
24
+ ) -> "V1RootValidator":
22
25
  def _validate_field_rename(cls: Type, values: dict) -> dict:
23
26
  if old_name in values:
24
27
  if new_name in values:
@@ -49,6 +52,4 @@ def pydantic_renamed_field(
49
52
  # validator with pre=True gets all the values that were passed in.
50
53
  # Given that a renamed field doesn't show up in the fields list, we can't use
51
54
  # the field-level validator, even with a different field name.
52
- return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)(
53
- _validate_field_rename
54
- )
55
+ return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
@@ -1,9 +1,12 @@
1
- from typing import Optional, Type, Union
1
+ from typing import TYPE_CHECKING, Optional, Type, Union
2
2
 
3
3
  import pydantic
4
4
 
5
+ if TYPE_CHECKING:
6
+ from pydantic.deprecated.class_validators import V1Validator
5
7
 
6
- def pydantic_multiline_string(field: str) -> classmethod:
8
+
9
+ def pydantic_multiline_string(field: str) -> "V1Validator":
7
10
  """If the field is present and contains an escaped newline, replace it with a real newline.
8
11
 
9
12
  This makes the assumption that the field value is never supposed to have a
@@ -3,7 +3,6 @@
3
3
  import hashlib
4
4
  import json
5
5
  import logging
6
- import os
7
6
  import re
8
7
  import time
9
8
  from datetime import datetime, timezone
@@ -26,6 +25,7 @@ import typing_inspect
26
25
  from avrogen.dict_wrapper import DictWrapper
27
26
  from typing_extensions import assert_never
28
27
 
28
+ from datahub.configuration.env_vars import get_dataset_urn_to_lower
29
29
  from datahub.emitter.enum_helpers import get_enum_options
30
30
  from datahub.metadata.schema_classes import (
31
31
  AssertionKeyClass,
@@ -72,9 +72,7 @@ ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass))
72
72
 
73
73
  DEFAULT_FLOW_CLUSTER = "prod"
74
74
  UNKNOWN_USER = "urn:li:corpuser:unknown"
75
- DATASET_URN_TO_LOWER: bool = (
76
- os.getenv("DATAHUB_DATASET_URN_TO_LOWER", "false") == "true"
77
- )
75
+ DATASET_URN_TO_LOWER: bool = get_dataset_urn_to_lower() == "true"
78
76
 
79
77
  if TYPE_CHECKING:
80
78
  from datahub.emitter.mcp_builder import DatahubKey
@@ -376,6 +374,12 @@ def make_domain_urn(domain: str) -> str:
376
374
  return f"urn:li:domain:{domain}"
377
375
 
378
376
 
377
+ def make_data_product_urn(data_product_id: str) -> str:
378
+ if data_product_id.startswith("urn:li:dataProduct:"):
379
+ return data_product_id
380
+ return f"urn:li:dataProduct:{data_product_id}"
381
+
382
+
379
383
  def make_ml_primary_key_urn(feature_table_name: str, primary_key_name: str) -> str:
380
384
  return f"urn:li:mlPrimaryKey:({feature_table_name},{primary_key_name})"
381
385
 
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import functools
4
4
  import json
5
5
  import logging
6
- import os
6
+ import re
7
7
  import time
8
8
  from collections import defaultdict
9
9
  from dataclasses import dataclass
@@ -32,7 +32,6 @@ from typing_extensions import deprecated
32
32
  from datahub._version import nice_version_name
33
33
  from datahub.cli import config_utils
34
34
  from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
35
- from datahub.cli.env_utils import get_boolean_env_variable
36
35
  from datahub.configuration.common import (
37
36
  ConfigEnum,
38
37
  ConfigModel,
@@ -41,6 +40,14 @@ from datahub.configuration.common import (
41
40
  TraceTimeoutError,
42
41
  TraceValidationError,
43
42
  )
43
+ from datahub.configuration.env_vars import (
44
+ get_emit_mode,
45
+ get_emitter_trace,
46
+ get_rest_emitter_batch_max_payload_bytes,
47
+ get_rest_emitter_batch_max_payload_length,
48
+ get_rest_emitter_default_endpoint,
49
+ get_rest_emitter_default_retry_max_times,
50
+ )
44
51
  from datahub.emitter.generic_emitter import Emitter
45
52
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
46
53
  from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
@@ -60,6 +67,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
60
67
  MetadataChangeProposal,
61
68
  )
62
69
  from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
70
+ from datahub.metadata.schema_classes import (
71
+ KEY_ASPECT_NAMES,
72
+ ChangeTypeClass,
73
+ )
63
74
  from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature
64
75
 
65
76
  if TYPE_CHECKING:
@@ -77,11 +88,9 @@ _DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on
77
88
  504,
78
89
  ]
79
90
  _DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"]
80
- _DEFAULT_RETRY_MAX_TIMES = int(
81
- os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
82
- )
91
+ _DEFAULT_RETRY_MAX_TIMES = int(get_rest_emitter_default_retry_max_times())
83
92
 
84
- _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
93
+ _DATAHUB_EMITTER_TRACE = get_emitter_trace()
85
94
 
86
95
  _DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
87
96
 
@@ -90,18 +99,32 @@ TRACE_INITIAL_BACKOFF = 1.0 # Start with 1 second
90
99
  TRACE_MAX_BACKOFF = 300.0 # Cap at 5 minutes
91
100
  TRACE_BACKOFF_FACTOR = 2.0 # Double the wait time each attempt
92
101
 
93
- # The limit is 16mb. We will use a max of 15mb to have some space
102
+ # The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
94
103
  # for overhead like request headers.
95
104
  # This applies to pretty much all calls to GMS.
96
- INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
105
+ INGEST_MAX_PAYLOAD_BYTES = get_rest_emitter_batch_max_payload_bytes()
97
106
 
98
107
  # This limit is somewhat arbitrary. All GMS endpoints will timeout
99
108
  # and return a 500 if processing takes too long. To avoid sending
100
109
  # too much to the backend and hitting a timeout, we try to limit
101
110
  # the number of MCPs we send in a batch.
102
- BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
103
- os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
104
- )
111
+ BATCH_INGEST_MAX_PAYLOAD_LENGTH = get_rest_emitter_batch_max_payload_length()
112
+
113
+
114
+ def preserve_unicode_escapes(obj: Any) -> Any:
115
+ """Recursively convert unicode characters back to escape sequences"""
116
+ if isinstance(obj, dict):
117
+ return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
118
+ elif isinstance(obj, list):
119
+ return [preserve_unicode_escapes(item) for item in obj]
120
+ elif isinstance(obj, str):
121
+ # Convert non-ASCII characters back to \u escapes
122
+ def escape_unicode(match: Any) -> Any:
123
+ return f"\\u{ord(match.group(0)):04x}"
124
+
125
+ return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
126
+ else:
127
+ return obj
105
128
 
106
129
 
107
130
  class EmitMode(ConfigEnum):
@@ -124,7 +147,7 @@ class EmitMode(ConfigEnum):
124
147
 
125
148
  _DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
126
149
  EmitMode,
127
- os.getenv("DATAHUB_EMIT_MODE", EmitMode.SYNC_PRIMARY),
150
+ get_emit_mode() or EmitMode.SYNC_PRIMARY,
128
151
  )
129
152
 
130
153
 
@@ -135,7 +158,7 @@ class RestSinkEndpoint(ConfigEnum):
135
158
 
136
159
  DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
137
160
  RestSinkEndpoint,
138
- os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
161
+ get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
139
162
  )
140
163
 
141
164
 
@@ -314,6 +337,7 @@ class DataHubRestEmitter(Closeable, Emitter):
314
337
  openapi_ingestion: Optional[bool] = None,
315
338
  client_mode: Optional[ClientMode] = None,
316
339
  datahub_component: Optional[str] = None,
340
+ server_config_refresh_interval: Optional[int] = None,
317
341
  ):
318
342
  if not gms_server:
319
343
  raise ConfigurationError("gms server is required")
@@ -329,6 +353,8 @@ class DataHubRestEmitter(Closeable, Emitter):
329
353
  self._openapi_ingestion = (
330
354
  openapi_ingestion # Re-evaluated after test connection
331
355
  )
356
+ self._server_config_refresh_interval = server_config_refresh_interval
357
+ self._config_fetch_time: Optional[float] = None
332
358
 
333
359
  headers = {
334
360
  "X-RestLi-Protocol-Version": "2.0.0",
@@ -398,7 +424,17 @@ class DataHubRestEmitter(Closeable, Emitter):
398
424
  Raises:
399
425
  ConfigurationError: If there's an error fetching or validating the configuration
400
426
  """
401
- if not hasattr(self, "_server_config") or not self._server_config:
427
+
428
+ if (
429
+ not hasattr(self, "_server_config")
430
+ or self._server_config is None
431
+ or (
432
+ self._server_config_refresh_interval is not None
433
+ and self._config_fetch_time is not None
434
+ and (time.time() - self._config_fetch_time)
435
+ > self._server_config_refresh_interval
436
+ )
437
+ ):
402
438
  if self._session is None or self._gms_server is None:
403
439
  raise ConfigurationError(
404
440
  "Session and URL are required to load configuration"
@@ -419,6 +455,7 @@ class DataHubRestEmitter(Closeable, Emitter):
419
455
  )
420
456
 
421
457
  self._server_config = RestServiceConfig(raw_config=raw_config)
458
+ self._config_fetch_time = time.time()
422
459
  self._post_fetch_server_config()
423
460
 
424
461
  else:
@@ -441,7 +478,7 @@ class DataHubRestEmitter(Closeable, Emitter):
441
478
  if self._openapi_ingestion is None:
442
479
  # No constructor parameter
443
480
  if (
444
- not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
481
+ not get_rest_emitter_default_endpoint()
445
482
  and self._session_config.client_mode == ClientMode.SDK
446
483
  and self._server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
447
484
  ):
@@ -453,6 +490,8 @@ class DataHubRestEmitter(Closeable, Emitter):
453
490
  DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
454
491
  )
455
492
 
493
+ def test_connection(self) -> None:
494
+ self.fetch_server_config()
456
495
  logger.debug(
457
496
  f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
458
497
  )
@@ -460,12 +499,21 @@ class DataHubRestEmitter(Closeable, Emitter):
460
499
  f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
461
500
  )
462
501
 
463
- def test_connection(self) -> None:
464
- self.fetch_server_config()
465
-
466
502
  def get_server_config(self) -> dict:
467
503
  return self.server_config.raw_config
468
504
 
505
+ def invalidate_config_cache(self) -> None:
506
+ """Manually invalidate the configuration cache."""
507
+ if (
508
+ hasattr(self, "_server_config")
509
+ and self._server_config is not None
510
+ and self._server_config_refresh_interval is not None
511
+ ):
512
+ # Set fetch time to beyond TTL in the past to force refresh on next access
513
+ self._config_fetch_time = (
514
+ time.time() - self._server_config_refresh_interval - 1
515
+ )
516
+
469
517
  def to_graph(self) -> "DataHubGraph":
470
518
  from datahub.ingestion.graph.client import DataHubGraph
471
519
 
@@ -538,6 +586,11 @@ class DataHubRestEmitter(Closeable, Emitter):
538
586
  "systemMetadata": system_metadata_obj,
539
587
  }
540
588
  payload = json.dumps(snapshot)
589
+ if len(payload) > INGEST_MAX_PAYLOAD_BYTES:
590
+ logger.warning(
591
+ f"MCE object has size {len(payload)} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
592
+ "so this metadata will likely fail to be emitted."
593
+ )
541
594
 
542
595
  self._emit_generic(url, payload)
543
596
 
@@ -584,15 +637,27 @@ class DataHubRestEmitter(Closeable, Emitter):
584
637
  trace_data = extract_trace_data(response) if response else None
585
638
 
586
639
  else:
587
- url = f"{self._gms_server}/aspects?action=ingestProposal"
640
+ if mcp.changeType == ChangeTypeClass.DELETE:
641
+ if mcp.aspectName not in KEY_ASPECT_NAMES:
642
+ raise OperationalError(
643
+ f"Delete not supported for non key aspect: {mcp.aspectName} for urn: "
644
+ f"{mcp.entityUrn}"
645
+ )
588
646
 
589
- mcp_obj = pre_json_transform(mcp.to_obj())
590
- payload_dict = {
591
- "proposal": mcp_obj,
592
- "async": "true"
593
- if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
594
- else "false",
595
- }
647
+ url = f"{self._gms_server}/entities?action=delete"
648
+ payload_dict = {
649
+ "urn": mcp.entityUrn,
650
+ }
651
+ else:
652
+ url = f"{self._gms_server}/aspects?action=ingestProposal"
653
+
654
+ mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
655
+ payload_dict = {
656
+ "proposal": mcp_obj,
657
+ "async": "true"
658
+ if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
659
+ else "false",
660
+ }
596
661
 
597
662
  payload = json.dumps(payload_dict)
598
663
 
@@ -704,16 +769,24 @@ class DataHubRestEmitter(Closeable, Emitter):
704
769
  url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
705
770
 
706
771
  mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
772
+ if len(mcp_objs) == 0:
773
+ return 0
707
774
 
708
775
  # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
709
776
  # If we will exceed the limit, we need to break it up into chunks.
710
- mcp_obj_chunks: List[List[str]] = []
711
- current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
777
+ mcp_obj_chunks: List[List[str]] = [[]]
778
+ current_chunk_size = 0
712
779
  for mcp_obj in mcp_objs:
780
+ mcp_identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
713
781
  mcp_obj_size = len(json.dumps(mcp_obj))
714
782
  if _DATAHUB_EMITTER_TRACE:
715
783
  logger.debug(
716
- f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
784
+ f"Iterating through object ({mcp_identifier}) with size {mcp_obj_size}"
785
+ )
786
+ if mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES:
787
+ logger.warning(
788
+ f"MCP object {mcp_identifier} has size {mcp_obj_size} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
789
+ "so this metadata will likely fail to be emitted."
717
790
  )
718
791
 
719
792
  if (
@@ -726,7 +799,7 @@ class DataHubRestEmitter(Closeable, Emitter):
726
799
  current_chunk_size = 0
727
800
  mcp_obj_chunks[-1].append(mcp_obj)
728
801
  current_chunk_size += mcp_obj_size
729
- if len(mcp_obj_chunks) > 0:
802
+ if len(mcp_obj_chunks) > 1 or _DATAHUB_EMITTER_TRACE:
730
803
  logger.debug(
731
804
  f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
732
805
  )
datahub/entrypoints.py CHANGED
@@ -10,6 +10,7 @@ import click
10
10
  import datahub._version as datahub_version
11
11
  from datahub.cli.check_cli import check
12
12
  from datahub.cli.cli_utils import (
13
+ enable_auto_decorators,
13
14
  fixup_gms_url,
14
15
  generate_access_token,
15
16
  make_shim_command,
@@ -21,6 +22,7 @@ from datahub.cli.docker_cli import docker
21
22
  from datahub.cli.env_utils import get_boolean_env_variable
22
23
  from datahub.cli.exists_cli import exists
23
24
  from datahub.cli.get_cli import get
25
+ from datahub.cli.graphql_cli import graphql
24
26
  from datahub.cli.ingest_cli import ingest
25
27
  from datahub.cli.migrate import migrate
26
28
  from datahub.cli.put_cli import put
@@ -38,7 +40,6 @@ from datahub.cli.timeline_cli import timeline
38
40
  from datahub.configuration.common import should_show_stack_trace
39
41
  from datahub.ingestion.graph.client import get_default_graph
40
42
  from datahub.ingestion.graph.config import ClientMode
41
- from datahub.telemetry import telemetry
42
43
  from datahub.utilities._custom_package_loader import model_version_name
43
44
  from datahub.utilities.logging_manager import configure_logging
44
45
  from datahub.utilities.server_config_util import get_gms_config
@@ -111,7 +112,6 @@ def datahub(
111
112
  default=False,
112
113
  help="If passed will show server config. Assumes datahub init has happened.",
113
114
  )
114
- @telemetry.with_telemetry()
115
115
  def version(include_server: bool = False) -> None:
116
116
  """Print version number and exit."""
117
117
 
@@ -131,7 +131,6 @@ def version(include_server: bool = False) -> None:
131
131
  default=False,
132
132
  help="If passed then uses password to initialise token.",
133
133
  )
134
- @telemetry.with_telemetry()
135
134
  def init(use_password: bool = False) -> None:
136
135
  """Configure which datahub instance to connect to"""
137
136
 
@@ -171,6 +170,7 @@ datahub.add_command(ingest)
171
170
  datahub.add_command(delete)
172
171
  datahub.add_command(exists)
173
172
  datahub.add_command(get)
173
+ datahub.add_command(graphql)
174
174
  datahub.add_command(put)
175
175
  datahub.add_command(state)
176
176
  datahub.add_command(telemetry_cli)
@@ -218,6 +218,9 @@ except ImportError as e:
218
218
  make_shim_command("actions", "run `pip install acryl-datahub-actions`")
219
219
  )
220
220
 
221
+ # Adding telemetry and upgrade decorators to all commands
222
+ enable_auto_decorators(datahub)
223
+
221
224
 
222
225
  def main(**kwargs):
223
226
  # We use threads in a variety of places within our CLI. The multiprocessing