acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub has been flagged as potentially problematic; see the package's registry page for details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import functools
4
4
  import json
5
5
  import logging
6
- import os
6
+ import re
7
7
  import time
8
8
  from collections import defaultdict
9
9
  from dataclasses import dataclass
@@ -20,18 +20,18 @@ from typing import (
20
20
  Sequence,
21
21
  Tuple,
22
22
  Union,
23
+ overload,
23
24
  )
24
25
 
25
26
  import pydantic
26
27
  import requests
27
- from deprecated import deprecated
28
28
  from requests.adapters import HTTPAdapter, Retry
29
29
  from requests.exceptions import HTTPError, RequestException
30
+ from typing_extensions import deprecated
30
31
 
31
32
  from datahub._version import nice_version_name
32
33
  from datahub.cli import config_utils
33
34
  from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
34
- from datahub.cli.env_utils import get_boolean_env_variable
35
35
  from datahub.configuration.common import (
36
36
  ConfigEnum,
37
37
  ConfigModel,
@@ -40,10 +40,17 @@ from datahub.configuration.common import (
40
40
  TraceTimeoutError,
41
41
  TraceValidationError,
42
42
  )
43
- from datahub.emitter.aspect import JSON_CONTENT_TYPE
43
+ from datahub.configuration.env_vars import (
44
+ get_emit_mode,
45
+ get_emitter_trace,
46
+ get_rest_emitter_batch_max_payload_bytes,
47
+ get_rest_emitter_batch_max_payload_length,
48
+ get_rest_emitter_default_endpoint,
49
+ get_rest_emitter_default_retry_max_times,
50
+ )
44
51
  from datahub.emitter.generic_emitter import Emitter
45
52
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
46
- from datahub.emitter.request_helper import make_curl_command
53
+ from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
47
54
  from datahub.emitter.response_helper import (
48
55
  TraceData,
49
56
  extract_trace_data,
@@ -51,11 +58,20 @@ from datahub.emitter.response_helper import (
51
58
  )
52
59
  from datahub.emitter.serialization_helper import pre_json_transform
53
60
  from datahub.ingestion.api.closeable import Closeable
61
+ from datahub.ingestion.graph.config import (
62
+ DATAHUB_COMPONENT_ENV,
63
+ ClientMode,
64
+ )
54
65
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
55
66
  MetadataChangeEvent,
56
67
  MetadataChangeProposal,
57
68
  )
58
69
  from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation
70
+ from datahub.metadata.schema_classes import (
71
+ KEY_ASPECT_NAMES,
72
+ ChangeTypeClass,
73
+ )
74
+ from datahub.utilities.server_config_util import RestServiceConfig, ServiceFeature
59
75
 
60
76
  if TYPE_CHECKING:
61
77
  from datahub.ingestion.graph.client import DataHubGraph
@@ -72,51 +88,77 @@ _DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on
72
88
  504,
73
89
  ]
74
90
  _DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"]
75
- _DEFAULT_RETRY_MAX_TIMES = int(
76
- os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
77
- )
91
+ _DEFAULT_RETRY_MAX_TIMES = int(get_rest_emitter_default_retry_max_times())
78
92
 
79
- _DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
93
+ _DATAHUB_EMITTER_TRACE = get_emitter_trace()
94
+
95
+ _DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK
80
96
 
81
97
  TRACE_PENDING_STATUS = "PENDING"
82
98
  TRACE_INITIAL_BACKOFF = 1.0 # Start with 1 second
83
99
  TRACE_MAX_BACKOFF = 300.0 # Cap at 5 minutes
84
100
  TRACE_BACKOFF_FACTOR = 2.0 # Double the wait time each attempt
85
101
 
86
- # The limit is 16mb. We will use a max of 15mb to have some space
102
+ # The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
87
103
  # for overhead like request headers.
88
104
  # This applies to pretty much all calls to GMS.
89
- INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
105
+ INGEST_MAX_PAYLOAD_BYTES = get_rest_emitter_batch_max_payload_bytes()
90
106
 
91
107
  # This limit is somewhat arbitrary. All GMS endpoints will timeout
92
108
  # and return a 500 if processing takes too long. To avoid sending
93
109
  # too much to the backend and hitting a timeout, we try to limit
94
110
  # the number of MCPs we send in a batch.
95
- BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
96
- os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
111
+ BATCH_INGEST_MAX_PAYLOAD_LENGTH = get_rest_emitter_batch_max_payload_length()
112
+
113
+
114
+ def preserve_unicode_escapes(obj: Any) -> Any:
115
+ """Recursively convert unicode characters back to escape sequences"""
116
+ if isinstance(obj, dict):
117
+ return {k: preserve_unicode_escapes(v) for k, v in obj.items()}
118
+ elif isinstance(obj, list):
119
+ return [preserve_unicode_escapes(item) for item in obj]
120
+ elif isinstance(obj, str):
121
+ # Convert non-ASCII characters back to \u escapes
122
+ def escape_unicode(match: Any) -> Any:
123
+ return f"\\u{ord(match.group(0)):04x}"
124
+
125
+ return re.sub(r"[^\x00-\x7F]", escape_unicode, obj)
126
+ else:
127
+ return obj
128
+
129
+
130
+ class EmitMode(ConfigEnum):
131
+ # Fully synchronous processing that updates both primary storage (SQL) and search storage (Elasticsearch) before returning.
132
+ # Provides the strongest consistency guarantee but with the highest cost. Best for critical operations where immediate
133
+ # searchability and consistent reads are required.
134
+ SYNC_WAIT = auto()
135
+ # Synchronously updates the primary storage (SQL) but asynchronously updates search storage (Elasticsearch). Provides
136
+ # a balance between consistency and performance. Suitable for updates that need to be immediately reflected in direct
137
+ # entity retrievals but where search index consistency can be slightly delayed.
138
+ SYNC_PRIMARY = auto()
139
+ # Queues the metadata change for asynchronous processing and returns immediately. The client continues execution without
140
+ # waiting for the change to be fully processed. Best for high-throughput scenarios where eventual consistency is acceptable.
141
+ ASYNC = auto()
142
+ # Queues the metadata change asynchronously but blocks until confirmation that the write has been fully persisted.
143
+ # More efficient than fully synchronous operations due to backend parallelization and batching while still providing
144
+ # strong consistency guarantees. Useful when you need confirmation of successful persistence without sacrificing performance.
145
+ ASYNC_WAIT = auto()
146
+
147
+
148
+ _DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
149
+ EmitMode,
150
+ get_emit_mode() or EmitMode.SYNC_PRIMARY,
97
151
  )
98
152
 
99
153
 
100
- class RestTraceMode(ConfigEnum):
101
- ENABLED = auto()
102
- DISABLED = auto()
103
-
104
-
105
154
  class RestSinkEndpoint(ConfigEnum):
106
155
  RESTLI = auto()
107
156
  OPENAPI = auto()
108
157
 
109
158
 
110
- DEFAULT_REST_SINK_ENDPOINT = pydantic.parse_obj_as(
159
+ DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
111
160
  RestSinkEndpoint,
112
- os.getenv("DATAHUB_REST_SINK_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
113
- )
114
-
115
-
116
- # Supported with v1.0
117
- DEFAULT_REST_TRACE_MODE = pydantic.parse_obj_as(
118
- RestTraceMode,
119
- os.getenv("DATAHUB_REST_TRACE_MODE", RestTraceMode.DISABLED),
161
+ get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
120
162
  )
121
163
 
122
164
 
@@ -132,12 +174,24 @@ class RequestsSessionConfig(ConfigModel):
132
174
  ca_certificate_path: Optional[str] = None
133
175
  client_certificate_path: Optional[str] = None
134
176
  disable_ssl_verification: bool = False
177
+ client_mode: Optional[ClientMode] = _DEFAULT_CLIENT_MODE
178
+ datahub_component: Optional[str] = None
135
179
 
136
180
  def build_session(self) -> requests.Session:
137
181
  session = requests.Session()
138
182
 
139
- if self.extra_headers:
140
- session.headers.update(self.extra_headers)
183
+ user_agent = self._get_user_agent_string(session)
184
+
185
+ base_headers = {
186
+ "User-Agent": user_agent,
187
+ "X-DataHub-Client-Mode": self.client_mode.name
188
+ if self.client_mode
189
+ else _DEFAULT_CLIENT_MODE.name,
190
+ "X-DataHub-Py-Cli-Version": nice_version_name(),
191
+ }
192
+
193
+ headers = {**base_headers, **self.extra_headers}
194
+ session.headers.update(headers)
141
195
 
142
196
  if self.client_certificate_path:
143
197
  session.cert = self.client_certificate_path
@@ -185,6 +239,59 @@ class RequestsSessionConfig(ConfigModel):
185
239
 
186
240
  return session
187
241
 
242
+ @classmethod
243
+ def get_client_mode_from_session(
244
+ cls, session: requests.Session
245
+ ) -> Optional[ClientMode]:
246
+ """
247
+ Extract the ClientMode enum from a requests Session by checking the headers.
248
+
249
+ Args:
250
+ session: The requests.Session object to check
251
+
252
+ Returns:
253
+ The corresponding ClientMode enum value if found, None otherwise
254
+ """
255
+ # Check if the session has the X-DataHub-Client-Mode header
256
+ mode_str = session.headers.get("X-DataHub-Client-Mode")
257
+
258
+ if not mode_str:
259
+ return None
260
+
261
+ # Try to convert the string value to enum
262
+ try:
263
+ # First ensure we're working with a str value
264
+ if isinstance(mode_str, bytes):
265
+ mode_str = mode_str.decode("utf-8")
266
+
267
+ # Then find the matching enum value
268
+ for mode in ClientMode:
269
+ if mode.name == mode_str:
270
+ return mode
271
+
272
+ # If we got here, no matching enum was found
273
+ return None
274
+ except Exception:
275
+ # Handle any other errors
276
+ return None
277
+
278
+ def _get_user_agent_string(self, session: requests.Session) -> str:
279
+ """Generate appropriate user agent string based on client mode"""
280
+ version = nice_version_name()
281
+ client_mode = self.client_mode if self.client_mode else _DEFAULT_CLIENT_MODE
282
+
283
+ if "User-Agent" in session.headers:
284
+ user_agent = session.headers["User-Agent"]
285
+ if isinstance(user_agent, bytes):
286
+ requests_user_agent = " " + user_agent.decode("utf-8")
287
+ else:
288
+ requests_user_agent = " " + user_agent
289
+ else:
290
+ requests_user_agent = ""
291
+
292
+ # 1.0 refers to the user agent string version
293
+ return f"DataHub-Client/1.0 ({client_mode.name.lower()}; {self.datahub_component if self.datahub_component else DATAHUB_COMPONENT_ENV}; {version}){requests_user_agent}"
294
+
188
295
 
189
296
  @dataclass
190
297
  class _Chunk:
@@ -210,8 +317,8 @@ class DataHubRestEmitter(Closeable, Emitter):
210
317
  _gms_server: str
211
318
  _token: Optional[str]
212
319
  _session: requests.Session
213
- _openapi_ingestion: bool
214
- _default_trace_mode: bool
320
+ _openapi_ingestion: Optional[bool]
321
+ _server_config: RestServiceConfig
215
322
 
216
323
  def __init__(
217
324
  self,
@@ -227,8 +334,10 @@ class DataHubRestEmitter(Closeable, Emitter):
227
334
  ca_certificate_path: Optional[str] = None,
228
335
  client_certificate_path: Optional[str] = None,
229
336
  disable_ssl_verification: bool = False,
230
- openapi_ingestion: bool = False,
231
- default_trace_mode: bool = False,
337
+ openapi_ingestion: Optional[bool] = None,
338
+ client_mode: Optional[ClientMode] = None,
339
+ datahub_component: Optional[str] = None,
340
+ server_config_refresh_interval: Optional[int] = None,
232
341
  ):
233
342
  if not gms_server:
234
343
  raise ConfigurationError("gms server is required")
@@ -240,21 +349,15 @@ class DataHubRestEmitter(Closeable, Emitter):
240
349
 
241
350
  self._gms_server = fixup_gms_url(gms_server)
242
351
  self._token = token
243
- self.server_config: Dict[str, Any] = {}
244
- self._openapi_ingestion = openapi_ingestion
245
- self._default_trace_mode = default_trace_mode
246
352
  self._session = requests.Session()
247
-
248
- logger.debug(
249
- f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
353
+ self._openapi_ingestion = (
354
+ openapi_ingestion # Re-evaluated after test connection
250
355
  )
251
-
252
- if self._default_trace_mode:
253
- logger.debug("Using API Tracing for ingestion.")
356
+ self._server_config_refresh_interval = server_config_refresh_interval
357
+ self._config_fetch_time: Optional[float] = None
254
358
 
255
359
  headers = {
256
360
  "X-RestLi-Protocol-Version": "2.0.0",
257
- "X-DataHub-Py-Cli-Version": nice_version_name(),
258
361
  "Content-Type": "application/json",
259
362
  }
260
363
  if token:
@@ -300,39 +403,116 @@ class DataHubRestEmitter(Closeable, Emitter):
300
403
  ca_certificate_path=ca_certificate_path,
301
404
  client_certificate_path=client_certificate_path,
302
405
  disable_ssl_verification=disable_ssl_verification,
406
+ client_mode=client_mode,
407
+ datahub_component=datahub_component,
303
408
  )
304
409
 
305
410
  self._session = self._session_config.build_session()
306
411
 
307
- def test_connection(self) -> None:
308
- url = f"{self._gms_server}/config"
309
- response = self._session.get(url)
310
- if response.status_code == 200:
311
- config: dict = response.json()
312
- if config.get("noCode") == "true":
313
- self.server_config = config
314
- return
412
+ @property
413
+ def server_config(self) -> RestServiceConfig:
414
+ return self.fetch_server_config()
315
415
 
316
- else:
416
+ # TODO: This should move to DataHubGraph once it no longer inherits from DataHubRestEmitter
417
+ def fetch_server_config(self) -> RestServiceConfig:
418
+ """
419
+ Fetch configuration from the server if not already loaded.
420
+
421
+ Returns:
422
+ The configuration dictionary
423
+
424
+ Raises:
425
+ ConfigurationError: If there's an error fetching or validating the configuration
426
+ """
427
+
428
+ if (
429
+ not hasattr(self, "_server_config")
430
+ or self._server_config is None
431
+ or (
432
+ self._server_config_refresh_interval is not None
433
+ and self._config_fetch_time is not None
434
+ and (time.time() - self._config_fetch_time)
435
+ > self._server_config_refresh_interval
436
+ )
437
+ ):
438
+ if self._session is None or self._gms_server is None:
317
439
  raise ConfigurationError(
318
- "You seem to have connected to the frontend service instead of the GMS endpoint. "
319
- "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
320
- "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
440
+ "Session and URL are required to load configuration"
321
441
  )
322
- else:
323
- logger.debug(
324
- f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
325
- )
326
- if response.status_code == 401:
327
- message = f"Unable to connect to {url} - got an authentication error: {response.text}."
442
+
443
+ url = f"{self._gms_server}/config"
444
+ response = self._session.get(url)
445
+
446
+ if response.status_code == 200:
447
+ raw_config = response.json()
448
+
449
+ # Validate that we're connected to the correct service
450
+ if not raw_config.get("noCode") == "true":
451
+ raise ConfigurationError(
452
+ "You seem to have connected to the frontend service instead of the GMS endpoint. "
453
+ "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
454
+ "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
455
+ )
456
+
457
+ self._server_config = RestServiceConfig(raw_config=raw_config)
458
+ self._config_fetch_time = time.time()
459
+ self._post_fetch_server_config()
460
+
461
+ else:
462
+ logger.debug(
463
+ f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
464
+ )
465
+
466
+ if response.status_code == 401:
467
+ message = f"Unable to connect to {url} - got an authentication error: {response.text}."
468
+ else:
469
+ message = f"Unable to connect to {url} with status_code: {response.status_code}."
470
+
471
+ message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
472
+ raise ConfigurationError(message)
473
+
474
+ return self._server_config
475
+
476
+ def _post_fetch_server_config(self) -> None:
477
+ # Determine OpenAPI mode
478
+ if self._openapi_ingestion is None:
479
+ # No constructor parameter
480
+ if (
481
+ not get_rest_emitter_default_endpoint()
482
+ and self._session_config.client_mode == ClientMode.SDK
483
+ and self._server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
484
+ ):
485
+ # Enable if SDK client and no environment variable specified
486
+ self._openapi_ingestion = True
328
487
  else:
329
- message = f"Unable to connect to {url} with status_code: {response.status_code}."
330
- message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)."
331
- raise ConfigurationError(message)
488
+ # The system env is specifying the value
489
+ self._openapi_ingestion = (
490
+ DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI
491
+ )
492
+
493
+ def test_connection(self) -> None:
494
+ self.fetch_server_config()
495
+ logger.debug(
496
+ f"Using {'OpenAPI' if self._openapi_ingestion else 'Restli'} for ingestion."
497
+ )
498
+ logger.debug(
499
+ f"{EmitMode.ASYNC_WAIT} {'IS' if self._should_trace(emit_mode=EmitMode.ASYNC_WAIT, warn=False) else 'IS NOT'} supported."
500
+ )
332
501
 
333
502
  def get_server_config(self) -> dict:
334
- self.test_connection()
335
- return self.server_config
503
+ return self.server_config.raw_config
504
+
505
+ def invalidate_config_cache(self) -> None:
506
+ """Manually invalidate the configuration cache."""
507
+ if (
508
+ hasattr(self, "_server_config")
509
+ and self._server_config is not None
510
+ and self._server_config_refresh_interval is not None
511
+ ):
512
+ # Set fetch time to beyond TTL in the past to force refresh on next access
513
+ self._config_fetch_time = (
514
+ time.time() - self._server_config_refresh_interval - 1
515
+ )
336
516
 
337
517
  def to_graph(self) -> "DataHubGraph":
338
518
  from datahub.ingestion.graph.client import DataHubGraph
@@ -342,39 +522,24 @@ class DataHubRestEmitter(Closeable, Emitter):
342
522
  def _to_openapi_request(
343
523
  self,
344
524
  mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
345
- async_flag: Optional[bool] = None,
346
- async_default: bool = False,
347
- ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
348
- if mcp.aspect and mcp.aspectName:
349
- resolved_async_flag = (
350
- async_flag if async_flag is not None else async_default
351
- )
352
- url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
525
+ emit_mode: EmitMode,
526
+ ) -> Optional[OpenApiRequest]:
527
+ """
528
+ Convert a MetadataChangeProposal to an OpenAPI request format.
353
529
 
354
- if isinstance(mcp, MetadataChangeProposalWrapper):
355
- aspect_value = pre_json_transform(
356
- mcp.to_obj(simplified_structure=True)
357
- )["aspect"]["json"]
358
- else:
359
- obj = mcp.aspect.to_obj()
360
- if obj.get("value") and obj.get("contentType") == JSON_CONTENT_TYPE:
361
- obj = json.loads(obj["value"])
362
- aspect_value = pre_json_transform(obj)
363
- return (
364
- url,
365
- [
366
- {
367
- "urn": mcp.entityUrn,
368
- mcp.aspectName: {
369
- "value": aspect_value,
370
- "systemMetadata": mcp.systemMetadata.to_obj()
371
- if mcp.systemMetadata
372
- else None,
373
- },
374
- }
375
- ],
376
- )
377
- return None
530
+ Args:
531
+ mcp: The metadata change proposal
532
+ emit_mode: Client emit mode
533
+
534
+ Returns:
535
+ An OpenApiRequest object or None if the MCP doesn't have required fields
536
+ """
537
+ return OpenApiRequest.from_mcp(
538
+ mcp=mcp,
539
+ gms_server=self._gms_server,
540
+ async_flag=emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT),
541
+ search_sync_flag=emit_mode == EmitMode.SYNC_WAIT,
542
+ )
378
543
 
379
544
  def emit(
380
545
  self,
@@ -385,7 +550,7 @@ class DataHubRestEmitter(Closeable, Emitter):
385
550
  UsageAggregation,
386
551
  ],
387
552
  callback: Optional[Callable[[Exception, str], None]] = None,
388
- async_flag: Optional[bool] = None,
553
+ emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
389
554
  ) -> None:
390
555
  try:
391
556
  if isinstance(item, UsageAggregation):
@@ -393,7 +558,7 @@ class DataHubRestEmitter(Closeable, Emitter):
393
558
  elif isinstance(
394
559
  item, (MetadataChangeProposal, MetadataChangeProposalWrapper)
395
560
  ):
396
- self.emit_mcp(item, async_flag=async_flag)
561
+ self.emit_mcp(item, emit_mode=emit_mode)
397
562
  else:
398
563
  self.emit_mce(item)
399
564
  except Exception as e:
@@ -421,42 +586,84 @@ class DataHubRestEmitter(Closeable, Emitter):
421
586
  "systemMetadata": system_metadata_obj,
422
587
  }
423
588
  payload = json.dumps(snapshot)
589
+ if len(payload) > INGEST_MAX_PAYLOAD_BYTES:
590
+ logger.warning(
591
+ f"MCE object has size {len(payload)} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
592
+ "so this metadata will likely fail to be emitted."
593
+ )
424
594
 
425
595
  self._emit_generic(url, payload)
426
596
 
597
+ @overload
598
+ @deprecated("Use emit_mode instead of async_flag")
599
+ def emit_mcp(
600
+ self,
601
+ mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
602
+ *,
603
+ async_flag: Optional[bool] = None,
604
+ ) -> None: ...
605
+
606
+ @overload
607
+ def emit_mcp(
608
+ self,
609
+ mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
610
+ *,
611
+ emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
612
+ wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
613
+ ) -> None: ...
614
+
427
615
  def emit_mcp(
428
616
  self,
429
617
  mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
430
618
  async_flag: Optional[bool] = None,
431
- trace_flag: Optional[bool] = None,
432
- trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
619
+ emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
620
+ wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
433
621
  ) -> None:
622
+ if async_flag is True:
623
+ emit_mode = EmitMode.ASYNC
624
+
434
625
  ensure_has_system_metadata(mcp)
435
626
 
436
627
  trace_data = None
437
628
 
438
629
  if self._openapi_ingestion:
439
- request = self._to_openapi_request(mcp, async_flag, async_default=False)
630
+ request = self._to_openapi_request(mcp, emit_mode)
440
631
  if request:
441
- response = self._emit_generic(request[0], payload=request[1])
632
+ response = self._emit_generic(
633
+ request.url, payload=request.payload, method=request.method
634
+ )
442
635
 
443
- if self._should_trace(async_flag, trace_flag):
636
+ if self._should_trace(emit_mode):
444
637
  trace_data = extract_trace_data(response) if response else None
445
638
 
446
639
  else:
447
- url = f"{self._gms_server}/aspects?action=ingestProposal"
640
+ if mcp.changeType == ChangeTypeClass.DELETE:
641
+ if mcp.aspectName not in KEY_ASPECT_NAMES:
642
+ raise OperationalError(
643
+ f"Delete not supported for non key aspect: {mcp.aspectName} for urn: "
644
+ f"{mcp.entityUrn}"
645
+ )
448
646
 
449
- mcp_obj = pre_json_transform(mcp.to_obj())
450
- payload_dict = {"proposal": mcp_obj}
647
+ url = f"{self._gms_server}/entities?action=delete"
648
+ payload_dict = {
649
+ "urn": mcp.entityUrn,
650
+ }
651
+ else:
652
+ url = f"{self._gms_server}/aspects?action=ingestProposal"
451
653
 
452
- if async_flag is not None:
453
- payload_dict["async"] = "true" if async_flag else "false"
654
+ mcp_obj = preserve_unicode_escapes(pre_json_transform(mcp.to_obj()))
655
+ payload_dict = {
656
+ "proposal": mcp_obj,
657
+ "async": "true"
658
+ if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
659
+ else "false",
660
+ }
454
661
 
455
662
  payload = json.dumps(payload_dict)
456
663
 
457
664
  response = self._emit_generic(url, payload)
458
665
 
459
- if self._should_trace(async_flag, trace_flag):
666
+ if self._should_trace(emit_mode):
460
667
  trace_data = (
461
668
  extract_trace_data_from_mcps(response, [mcp]) if response else None
462
669
  )
@@ -464,15 +671,14 @@ class DataHubRestEmitter(Closeable, Emitter):
464
671
  if trace_data:
465
672
  self._await_status(
466
673
  [trace_data],
467
- trace_timeout,
674
+ wait_timeout,
468
675
  )
469
676
 
470
677
  def emit_mcps(
471
678
  self,
472
679
  mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
473
- async_flag: Optional[bool] = None,
474
- trace_flag: Optional[bool] = None,
475
- trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
680
+ emit_mode: EmitMode = _DEFAULT_EMIT_MODE,
681
+ wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
476
682
  ) -> int:
477
683
  if _DATAHUB_EMITTER_TRACE:
478
684
  logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
@@ -481,43 +687,46 @@ class DataHubRestEmitter(Closeable, Emitter):
481
687
  ensure_has_system_metadata(mcp)
482
688
 
483
689
  if self._openapi_ingestion:
484
- return self._emit_openapi_mcps(mcps, async_flag, trace_flag, trace_timeout)
690
+ return self._emit_openapi_mcps(mcps, emit_mode, wait_timeout)
485
691
  else:
486
- return self._emit_restli_mcps(mcps, async_flag)
692
+ return self._emit_restli_mcps(mcps, emit_mode)
487
693
 
488
694
  def _emit_openapi_mcps(
489
695
  self,
490
696
  mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
491
- async_flag: Optional[bool] = None,
492
- trace_flag: Optional[bool] = None,
493
- trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
697
+ emit_mode: EmitMode,
698
+ wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
494
699
  ) -> int:
495
700
  """
496
- 1. Grouping MCPs by their entity URL
701
+ 1. Grouping MCPs by their HTTP method and entity URL and HTTP method
497
702
  2. Breaking down large batches into smaller chunks based on both:
498
703
  * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
499
704
  * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
500
705
 
501
706
  The Chunk class encapsulates both the items and their byte size tracking
502
- Serializing the items only once with json.dumps(request[1]) and reusing that
707
+ Serializing the items only once with json.dumps(request.payload) and reusing that
503
708
  The chunking logic handles edge cases (always accepting at least one item per chunk)
504
709
  The joining logic is efficient with a simple string concatenation
505
710
 
506
711
  :param mcps: metadata change proposals to transmit
507
- :param async_flag: the mode
712
+ :param emit_mode: the mode to emit the MCPs
713
+ :param wait_timeout: timeout for blocking queue
508
714
  :return: number of requests
509
715
  """
510
- # group by entity url
511
- batches: Dict[str, List[_Chunk]] = defaultdict(
716
+ # Group by entity URL and HTTP method
717
+ batches: Dict[Tuple[str, str], List[_Chunk]] = defaultdict(
512
718
  lambda: [_Chunk(items=[])]
513
719
  ) # Initialize with one empty Chunk
514
720
 
515
721
  for mcp in mcps:
516
- request = self._to_openapi_request(mcp, async_flag, async_default=True)
722
+ request = self._to_openapi_request(mcp, emit_mode)
517
723
  if request:
518
- current_chunk = batches[request[0]][-1] # Get the last chunk
519
- # Only serialize once
520
- serialized_item = json.dumps(request[1][0])
724
+ # Create a composite key with both method and URL
725
+ key = (request.method, request.url)
726
+ current_chunk = batches[key][-1] # Get the last chunk
727
+
728
+ # Only serialize once - we're serializing a single payload item
729
+ serialized_item = json.dumps(request.payload[0])
521
730
  item_bytes = len(serialized_item.encode())
522
731
 
523
732
  # If adding this item would exceed max_bytes, create a new chunk
@@ -527,18 +736,20 @@ class DataHubRestEmitter(Closeable, Emitter):
527
736
  or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
528
737
  ):
529
738
  new_chunk = _Chunk(items=[])
530
- batches[request[0]].append(new_chunk)
739
+ batches[key].append(new_chunk)
531
740
  current_chunk = new_chunk
532
741
 
533
742
  current_chunk.add_item(serialized_item)
534
743
 
535
744
  responses = []
536
- for url, chunks in batches.items():
745
+ for (method, url), chunks in batches.items():
537
746
  for chunk in chunks:
538
- response = self._emit_generic(url, payload=_Chunk.join(chunk))
747
+ response = self._emit_generic(
748
+ url, payload=_Chunk.join(chunk), method=method
749
+ )
539
750
  responses.append(response)
540
751
 
541
- if self._should_trace(async_flag, trace_flag, async_default=True):
752
+ if self._should_trace(emit_mode):
542
753
  trace_data = []
543
754
  for response in responses:
544
755
  data = extract_trace_data(response) if response else None
@@ -546,28 +757,36 @@ class DataHubRestEmitter(Closeable, Emitter):
546
757
  trace_data.append(data)
547
758
 
548
759
  if trace_data:
549
- self._await_status(trace_data, trace_timeout)
760
+ self._await_status(trace_data, wait_timeout)
550
761
 
551
762
  return len(responses)
552
763
 
553
764
  def _emit_restli_mcps(
554
765
  self,
555
766
  mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
556
- async_flag: Optional[bool] = None,
767
+ emit_mode: EmitMode,
557
768
  ) -> int:
558
769
  url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
559
770
 
560
771
  mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
772
+ if len(mcp_objs) == 0:
773
+ return 0
561
774
 
562
775
  # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
563
776
  # If we will exceed the limit, we need to break it up into chunks.
564
- mcp_obj_chunks: List[List[str]] = []
565
- current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
777
+ mcp_obj_chunks: List[List[str]] = [[]]
778
+ current_chunk_size = 0
566
779
  for mcp_obj in mcp_objs:
780
+ mcp_identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
567
781
  mcp_obj_size = len(json.dumps(mcp_obj))
568
782
  if _DATAHUB_EMITTER_TRACE:
569
783
  logger.debug(
570
- f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
784
+ f"Iterating through object ({mcp_identifier}) with size {mcp_obj_size}"
785
+ )
786
+ if mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES:
787
+ logger.warning(
788
+ f"MCP object {mcp_identifier} has size {mcp_obj_size} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
789
+ "so this metadata will likely fail to be emitted."
571
790
  )
572
791
 
573
792
  if (
@@ -580,7 +799,7 @@ class DataHubRestEmitter(Closeable, Emitter):
580
799
  current_chunk_size = 0
581
800
  mcp_obj_chunks[-1].append(mcp_obj)
582
801
  current_chunk_size += mcp_obj_size
583
- if len(mcp_obj_chunks) > 0:
802
+ if len(mcp_obj_chunks) > 1 or _DATAHUB_EMITTER_TRACE:
584
803
  logger.debug(
585
804
  f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
586
805
  )
@@ -588,16 +807,19 @@ class DataHubRestEmitter(Closeable, Emitter):
588
807
  for mcp_obj_chunk in mcp_obj_chunks:
589
808
  # TODO: We're calling json.dumps on each MCP object twice, once to estimate
590
809
  # the size when chunking, and again for the actual request.
591
- payload_dict: dict = {"proposals": mcp_obj_chunk}
592
- if async_flag is not None:
593
- payload_dict["async"] = "true" if async_flag else "false"
810
+ payload_dict: dict = {
811
+ "proposals": mcp_obj_chunk,
812
+ "async": "true"
813
+ if emit_mode in (EmitMode.ASYNC, EmitMode.ASYNC_WAIT)
814
+ else "false",
815
+ }
594
816
 
595
817
  payload = json.dumps(payload_dict)
596
818
  self._emit_generic(url, payload)
597
819
 
598
820
  return len(mcp_obj_chunks)
599
821
 
600
- @deprecated
822
+ @deprecated("Use emit with a datasetUsageStatistics aspect instead")
601
823
  def emit_usage(self, usageStats: UsageAggregation) -> None:
602
824
  url = f"{self._gms_server}/usageStats?action=batchIngest"
603
825
 
@@ -608,11 +830,13 @@ class DataHubRestEmitter(Closeable, Emitter):
608
830
  payload = json.dumps(snapshot)
609
831
  self._emit_generic(url, payload)
610
832
 
611
- def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
833
+ def _emit_generic(
834
+ self, url: str, payload: Union[str, Any], method: str = "POST"
835
+ ) -> requests.Response:
612
836
  if not isinstance(payload, str):
613
837
  payload = json.dumps(payload)
614
838
 
615
- curl_command = make_curl_command(self._session, "POST", url, payload)
839
+ curl_command = make_curl_command(self._session, method, url, payload)
616
840
  payload_size = len(payload)
617
841
  if payload_size > INGEST_MAX_PAYLOAD_BYTES:
618
842
  # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
@@ -625,7 +849,8 @@ class DataHubRestEmitter(Closeable, Emitter):
625
849
  curl_command,
626
850
  )
627
851
  try:
628
- response = self._session.post(url, data=payload)
852
+ method_func = getattr(self._session, method.lower())
853
+ response = method_func(url, data=payload) if payload else method_func(url)
629
854
  response.raise_for_status()
630
855
  return response
631
856
  except HTTPError as e:
@@ -661,7 +886,7 @@ class DataHubRestEmitter(Closeable, Emitter):
661
886
  def _await_status(
662
887
  self,
663
888
  trace_data: List[TraceData],
664
- trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
889
+ wait_timeout: Optional[timedelta] = timedelta(seconds=3600),
665
890
  ) -> None:
666
891
  """Verify the status of asynchronous write operations.
667
892
  Args:
@@ -671,8 +896,8 @@ class DataHubRestEmitter(Closeable, Emitter):
671
896
  TraceTimeoutError: If verification fails or times out
672
897
  TraceValidationError: Expected write was not completed successfully
673
898
  """
674
- if trace_timeout is None:
675
- raise ValueError("trace_timeout cannot be None")
899
+ if wait_timeout is None:
900
+ raise ValueError("wait_timeout cannot be None")
676
901
 
677
902
  try:
678
903
  if not trace_data:
@@ -685,9 +910,9 @@ class DataHubRestEmitter(Closeable, Emitter):
685
910
  current_backoff = TRACE_INITIAL_BACKOFF
686
911
 
687
912
  while trace.data:
688
- if datetime.now() - start_time > trace_timeout:
913
+ if datetime.now() - start_time > wait_timeout:
689
914
  raise TraceTimeoutError(
690
- f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
915
+ f"Timeout waiting for async write completion after {wait_timeout.total_seconds()} seconds"
691
916
  )
692
917
 
693
918
  base_url = f"{self._gms_server}/openapi/v1/trace/write"
@@ -700,7 +925,7 @@ class DataHubRestEmitter(Closeable, Emitter):
700
925
  for aspect_name, aspect_status in aspects.items():
701
926
  if not aspect_status["success"]:
702
927
  error_msg = (
703
- f"Unable to validate async write to DataHub GMS: "
928
+ f"Unable to validate async write {trace.trace_id} ({trace.extract_timestamp()}) to DataHub GMS: "
704
929
  f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
705
930
  f"Status: {aspect_status}"
706
931
  )
@@ -739,17 +964,28 @@ class DataHubRestEmitter(Closeable, Emitter):
739
964
  logger.error(f"Error during status verification: {str(e)}")
740
965
  raise
741
966
 
742
- def _should_trace(
743
- self,
744
- async_flag: Optional[bool] = None,
745
- trace_flag: Optional[bool] = None,
746
- async_default: bool = False,
747
- ) -> bool:
748
- resolved_trace_flag = (
749
- trace_flag if trace_flag is not None else self._default_trace_mode
750
- )
751
- resolved_async_flag = async_flag if async_flag is not None else async_default
752
- return resolved_trace_flag and resolved_async_flag
967
+ def _should_trace(self, emit_mode: EmitMode, warn: bool = True) -> bool:
968
+ if emit_mode == EmitMode.ASYNC_WAIT:
969
+ if not bool(self._openapi_ingestion):
970
+ if warn:
971
+ logger.warning(
972
+ f"{emit_mode} requested but is only available when using OpenAPI."
973
+ )
974
+ return False
975
+ elif getattr(
976
+ self, "server_config", None
977
+ ) is None or not self.server_config.supports_feature(
978
+ ServiceFeature.API_TRACING
979
+ ):
980
+ if warn:
981
+ logger.warning(
982
+ f"{emit_mode} requested but is only available with a newer GMS version."
983
+ )
984
+ return False
985
+ else:
986
+ return True
987
+ else:
988
+ return False
753
989
 
754
990
  def __repr__(self) -> str:
755
991
  token_str = (