acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. More details are available on the original package page (the link is not preserved in this text).

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -2,18 +2,34 @@ import dataclasses
2
2
  import json
3
3
  import logging
4
4
  import pprint
5
- from dataclasses import dataclass
5
+ from collections import defaultdict
6
+ from dataclasses import dataclass, field
6
7
  from datetime import datetime, timedelta
7
8
  from enum import Enum
8
- from typing import Any, Optional, runtime_checkable
9
+ from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable
9
10
 
10
11
  import humanfriendly
11
12
  import pydantic
12
13
  from pydantic import BaseModel
14
+ from tabulate import tabulate
13
15
  from typing_extensions import Literal, Protocol
14
16
 
17
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
18
+ from datahub.emitter.mcp_builder import mcps_from_mce
19
+ from datahub.ingestion.api.closeable import Closeable
15
20
  from datahub.ingestion.api.report_helpers import format_datetime_relative
21
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
22
+ from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
23
+ from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
24
+ from datahub.metadata.schema_classes import (
25
+ MetadataChangeProposalClass,
26
+ StatusClass,
27
+ SubTypesClass,
28
+ UpstreamLineageClass,
29
+ )
30
+ from datahub.utilities.file_backed_collections import FileBackedDict
16
31
  from datahub.utilities.lossy_collections import LossyList
32
+ from datahub.utilities.urns.urn import guess_platform_name
17
33
 
18
34
  logger = logging.getLogger(__name__)
19
35
  LogLevel = Literal["ERROR", "WARNING", "INFO", "DEBUG"]
@@ -26,6 +42,15 @@ class SupportsAsObj(Protocol):
26
42
 
27
43
  @dataclass
28
44
  class Report(SupportsAsObj):
45
+ def __post_init__(self) -> None:
46
+ self.platform: Optional[str] = None
47
+
48
+ def set_platform(self, platform: str) -> None:
49
+ self.platform = platform
50
+
51
+ def get_platform(self) -> Optional[str]:
52
+ return self.platform
53
+
29
54
  @staticmethod
30
55
  def to_str(some_val: Any) -> str:
31
56
  if isinstance(some_val, Enum):
@@ -82,7 +107,58 @@ class Report(SupportsAsObj):
82
107
  }
83
108
 
84
109
  def as_string(self) -> str:
85
- return pprint.pformat(self.as_obj(), width=150, sort_dicts=False)
110
+ self_obj = self.as_obj()
111
+ _aspects_by_subtypes = self_obj.pop("aspects_by_subtypes", None)
112
+
113
+ # Format the main report data
114
+ result = pprint.pformat(self_obj, width=150, sort_dicts=False)
115
+
116
+ # Add aspects_by_subtypes table if it exists
117
+ if _aspects_by_subtypes:
118
+ result += "\n\nAspects by Subtypes:\n"
119
+ result += self._format_aspects_by_subtypes_table(_aspects_by_subtypes)
120
+
121
+ return result
122
+
123
+ def _format_aspects_by_subtypes_table(
124
+ self, aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]]
125
+ ) -> str:
126
+ """Format aspects_by_subtypes data as a table with aspects as rows and entity/subtype as columns."""
127
+ if not aspects_by_subtypes:
128
+ return "No aspects by subtypes data available."
129
+
130
+ all_aspects: set[str] = {
131
+ aspect
132
+ for subtypes in aspects_by_subtypes.values()
133
+ for aspects in subtypes.values()
134
+ for aspect in aspects
135
+ }
136
+
137
+ aspect_rows = sorted(all_aspects)
138
+
139
+ entity_subtype_columns = []
140
+ for entity_type, subtypes in aspects_by_subtypes.items():
141
+ for subtype in subtypes:
142
+ entity_subtype_columns.append(f"{entity_type} ({subtype})")
143
+
144
+ entity_subtype_columns.sort()
145
+
146
+ headers = ["Aspect"] + entity_subtype_columns
147
+
148
+ table_data = [
149
+ [aspect]
150
+ + [
151
+ aspects.get(aspect, 0)
152
+ for subtypes in aspects_by_subtypes.values()
153
+ for aspects in subtypes.values()
154
+ ]
155
+ for aspect in aspect_rows
156
+ ]
157
+
158
+ if table_data:
159
+ return tabulate(table_data, headers=headers, tablefmt="grid")
160
+ else:
161
+ return "No aspects by subtypes data available."
86
162
 
87
163
  def as_json(self) -> str:
88
164
  return json.dumps(self.as_obj())
@@ -90,6 +166,15 @@ class Report(SupportsAsObj):
90
166
  # TODO add helper method for warning / failure status + counts?
91
167
 
92
168
 
169
+ @dataclass
170
+ class SourceReportSubtypes:
171
+ urn: str
172
+ entity_type: str
173
+ subType: str = field(default="unknown")
174
+ aspects: Dict[str, int] = field(default_factory=dict)
175
+ soft_deleted: bool = field(default=False)
176
+
177
+
93
178
  class ReportAttribute(BaseModel):
94
179
  severity: LogLevel = "DEBUG"
95
180
  help: Optional[str] = None
@@ -108,6 +193,299 @@ class ReportAttribute(BaseModel):
108
193
  logger.log(level=self.logger_sev, msg=msg, stacklevel=3)
109
194
 
110
195
 
196
@dataclass
class ExamplesReport(Report, Closeable):
    """Report that tallies emitted aspects per entity type/subtype and samples URNs.

    Per-URN aspect counts are accumulated in a ``FileBackedDict`` (table
    ``urn_aspects``) as work units are stored. ``compute_stats`` aggregates
    that table into the public ``aspects*`` and ``samples`` fields.
    """

    # entity type -> aspect name -> total number of times the aspect was seen.
    aspects: Dict[str, Dict[str, int]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(int))
    )
    # This counts existence of aspects for each entity/subtype
    # This is used for the UI to calculate %age of entities with the aspect
    aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
        default_factory=lambda: defaultdict(
            lambda: defaultdict(lambda: defaultdict(int))
        )
    )
    # This counts all aspects for each entity/subtype
    aspects_by_subtypes_full_count: Dict[str, Dict[str, Dict[str, int]]] = field(
        default_factory=lambda: defaultdict(
            lambda: defaultdict(lambda: defaultdict(int))
        )
    )
    # sample key -> subtype -> sampled URNs.
    samples: Dict[str, Dict[str, List[str]]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(list))
    )
    # Cumulative wall-clock time spent inside compute_stats().
    compute_stats_time_seconds: float = 0.0
    _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None

    # We are adding this to make querying easier for fine-grained lineage
    _fine_grained_lineage_special_case_name = "fineGrainedLineages"
    # Maximum number of URNs returned by each sampling query.
    _samples_to_add: int = 20
    _lineage_aspects_seen: Set[str] = field(default_factory=set)

    def __post_init__(self) -> None:
        """Create the file-backed store, exposing record fields as SQL columns."""
        super().__post_init__()
        self._file_based_dict = FileBackedDict(
            tablename="urn_aspects",
            extra_columns={
                "urn": lambda val: val.urn,
                "entityType": lambda val: val.entity_type,
                "subTypes": lambda val: val.subType,
                "aspects": lambda val: json.dumps(val.aspects),
                "soft_deleted": lambda val: val.soft_deleted,
            },
        )

    def close(self) -> None:
        """Compute final statistics, then release the file-backed store.

        The store is closed even when ``compute_stats`` raises, so a failure
        while aggregating does not leak it; the exception still propagates.
        """
        try:
            self.compute_stats()
        finally:
            if self._file_based_dict is not None:
                self._file_based_dict.close()
                self._file_based_dict = None

    def _build_aspects_where_clause(self, aspects: List[str]) -> str:
        """Build WHERE clause for matching any of the given aspects.

        Returns an empty string when ``aspects`` is empty.
        """
        return " OR ".join(f"aspects LIKE '%{aspect}%'" for aspect in aspects)

    def _build_lineage_where_clause(self, aspects: List[str]) -> str:
        """Build WHERE clause matching any of the given lineage aspects.

        The ``aspects`` column holds a JSON object, so each name is matched
        together with its surrounding double quotes.
        """
        return " OR ".join(f"aspects LIKE '%\"{aspect}\"%'" for aspect in aspects)

    def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
        """Collect up to ``_samples_to_add`` URNs per subtype for rows matching ``where_clause``.

        ``where_clause`` is interpolated into the SQL text, so it must be a
        non-empty, well-formed condition; callers guard against empty input.
        """
        assert self._file_based_dict is not None

        subtype_query = f"""
            SELECT DISTINCT subTypes
            FROM urn_aspects
            WHERE {where_clause}
        """
        subtypes = {
            row["subTypes"] or "unknown"
            for row in self._file_based_dict.sql_query(subtype_query)
        }

        sample_query = f"""
            SELECT urn
            FROM urn_aspects
            WHERE {where_clause} AND subTypes = ?
            limit {self._samples_to_add}
        """
        for sub_type in subtypes:
            for row in self._file_based_dict.sql_query(sample_query, (sub_type,)):
                self.samples[sample_key][sub_type].append(row["urn"])

    def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
        """Helper method to collect samples for entities that have any of the given aspects."""
        if not aspects:
            return

        where_clause = self._build_aspects_where_clause(aspects)
        self._collect_samples_by_subtype(where_clause, sample_key)

    def _collect_samples_by_lineage_aspects(
        self, aspects: List[str], sample_key: str
    ) -> None:
        """Helper method to collect samples for entities that have any of the given lineage aspects.

        Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
        """
        if not aspects:
            return

        where_clause = self._build_lineage_where_clause(aspects)
        self._collect_samples_by_subtype(where_clause, sample_key)

    def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
        """
        Collect samples for entities that have lineage, profiling, and usage aspects.
        These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
        """
        if not self._lineage_aspects_seen:
            return
        assert self._file_based_dict is not None

        # Sorted only to make the generated SQL text deterministic; the order
        # of OR-ed conditions does not change which rows match.
        lineage_where_clause = self._build_lineage_where_clause(
            sorted(self._lineage_aspects_seen)
        )
        profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
        usage_where_clause = self._build_aspects_where_clause(
            [
                "datasetUsageStatistics",
                "chartUsageStatistics",
                "dashboardUsageStatistics",
            ]
        )

        query = f"""
            SELECT urn, subTypes
            FROM urn_aspects
            WHERE ({lineage_where_clause})
            AND ({profiling_where_clause})
            AND ({usage_where_clause})
            limit {self._samples_to_add}
        """

        for row in self._file_based_dict.sql_query(query):
            sub_type = row["subTypes"] or "unknown"
            self.samples[sample_key][sub_type].append(row["urn"])

    def _has_fine_grained_lineage(
        self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
    ) -> bool:
        """Return True when the aspect is an upstream lineage with fine-grained lineages set."""
        return isinstance(mcp.aspect, UpstreamLineageClass) and bool(
            mcp.aspect.fineGrainedLineages
        )

    def _update_file_based_dict(
        self,
        urn: str,
        entityType: str,
        aspectName: str,
        mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
    ) -> None:
        """Record one occurrence of ``aspectName`` for ``urn`` in the file-backed store.

        URNs whose guessed platform differs from ``self.get_platform()`` are
        ignored. Fine-grained lineage is additionally counted under a
        dedicated pseudo-aspect name so it can be queried directly.
        """
        platform_name = guess_platform_name(urn)
        if platform_name != self.get_platform():
            return
        if is_lineage_aspect(entityType, aspectName):
            self._lineage_aspects_seen.add(aspectName)

        counted_aspects = [aspectName]
        if self._has_fine_grained_lineage(mcp):
            counted_aspects.append(self._fine_grained_lineage_special_case_name)

        sub_type = "unknown"
        # An empty typeNames list carries no subtype; keep the placeholder
        # instead of failing on typeNames[0].
        if isinstance(mcp.aspect, SubTypesClass) and mcp.aspect.typeNames:
            sub_type = mcp.aspect.typeNames[0]

        assert self._file_based_dict is not None
        if urn in self._file_based_dict:
            entry = self._file_based_dict[urn]
            if sub_type != "unknown":
                entry.subType = sub_type
            for name in counted_aspects:
                entry.aspects[name] = entry.aspects.get(name, 0) + 1
            # In-place mutation is not detected by the store; flag it explicitly.
            self._file_based_dict.mark_dirty(urn)
        else:
            self._file_based_dict[urn] = SourceReportSubtypes(
                urn=urn,
                entity_type=entityType,
                subType=sub_type,
                aspects={name: 1 for name in counted_aspects},
            )

        # isinstance() already rules out a missing aspect.
        if isinstance(mcp.aspect, StatusClass):
            self._file_based_dict[urn].soft_deleted = mcp.aspect.removed
            self._file_based_dict.mark_dirty(urn)

    def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
        """Record every named aspect carried by the work unit against its URN."""
        urn = wu.get_urn()

        if isinstance(wu.metadata, MetadataChangeEvent):
            mcps = list(mcps_from_mce(wu.metadata))
        else:
            mcps = [wu.metadata]

        for mcp in mcps:
            if mcp.aspectName is None:
                continue
            self._update_file_based_dict(urn, mcp.entityType, mcp.aspectName, mcp)

    def compute_stats(self) -> None:
        """Rebuild the aggregate aspect counts and URN samples from the file-backed store.

        Does nothing once the store has been closed. Rows flagged as
        soft-deleted are excluded from the aspect counts.
        """
        if self._file_based_dict is None:
            return
        start_time = datetime.now()

        query = """
            SELECT entityType, subTypes, aspects, count(*) as count
            FROM urn_aspects
            WHERE soft_deleted = 0
            GROUP BY entityType, subTypes, aspects
        """

        # Total occurrences of each aspect per entity type / subtype.
        full_counts: Dict[str, Dict[str, Dict[str, int]]] = defaultdict(
            lambda: defaultdict(lambda: defaultdict(int))
        )
        # Number of entities that have each aspect at least once.
        exist_counts: Dict[str, Dict[str, Dict[str, int]]] = defaultdict(
            lambda: defaultdict(lambda: defaultdict(int))
        )

        for row in self._file_based_dict.sql_query(query):
            entity_type = row["entityType"]
            # Same placeholder the sampling queries use for a missing subtype.
            sub_type = row["subTypes"] or "unknown"
            row_count = row["count"]
            # The column stores a JSON object, so the fallback must decode to
            # a dict; "[]" would decode to a list and break .items().
            aspect_occurrences = json.loads(row["aspects"] or "{}")
            for aspect, occurrences in aspect_occurrences.items():
                full_counts[entity_type][sub_type][aspect] += occurrences * row_count
                exist_counts[entity_type][sub_type][aspect] += row_count

        self.aspects.clear()
        self.aspects_by_subtypes.clear()
        self.aspects_by_subtypes_full_count.clear()
        for entity_type, subtype_counts in full_counts.items():
            for sub_type, aspect_counts in subtype_counts.items():
                for aspect, total in aspect_counts.items():
                    self.aspects[entity_type][aspect] += total
                self.aspects_by_subtypes_full_count[entity_type][sub_type] = dict(
                    aspect_counts
                )

        for entity_type, subtype_counts in exist_counts.items():
            for sub_type, aspect_counts in subtype_counts.items():
                self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)

        self.samples.clear()
        self._collect_samples_by_aspects(["datasetProfile"], "profiling")
        self._collect_samples_by_aspects(
            [
                "datasetUsageStatistics",
                "chartUsageStatistics",
                "dashboardUsageStatistics",
            ],
            "usage",
        )
        self._collect_samples_by_lineage_aspects(
            list(self._lineage_aspects_seen), "lineage"
        )
        self._collect_samples_with_all_conditions("all_3")
        end_time = datetime.now()
        self.compute_stats_time_seconds += (end_time - start_time).total_seconds()
487
+
488
+
111
489
  class EntityFilterReport(ReportAttribute):
112
490
  type: str
113
491
 
@@ -1,7 +1,8 @@
1
1
  import datetime
2
+ import logging
2
3
  from abc import ABCMeta, abstractmethod
3
4
  from dataclasses import dataclass, field
4
- from typing import Any, Generic, Optional, Type, TypeVar, cast
5
+ from typing import Any, Callable, Generic, List, Optional, Type, TypeVar, cast
5
6
 
6
7
  from typing_extensions import Self
7
8
 
@@ -12,6 +13,8 @@ from datahub.ingestion.api.report import Report
12
13
  from datahub.utilities.lossy_collections import LossyList
13
14
  from datahub.utilities.type_annotations import get_class_from_annotation
14
15
 
16
+ logger = logging.getLogger(__name__)
17
+
15
18
 
16
19
  @dataclass
17
20
  class SinkReport(Report):
@@ -89,6 +92,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
89
92
  ctx: PipelineContext
90
93
  config: SinkConfig
91
94
  report: SinkReportType
95
+ _pre_shutdown_callbacks: List[Callable[[], None]]
92
96
 
93
97
  @classmethod
94
98
  def get_config_class(cls) -> Type[SinkConfig]:
@@ -106,6 +110,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
106
110
  self.ctx = ctx
107
111
  self.config = config
108
112
  self.report = self.get_report_class()()
113
+ self._pre_shutdown_callbacks = []
109
114
 
110
115
  self.__post_init__()
111
116
 
@@ -144,8 +149,28 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
144
149
def get_report(self) -> SinkReportType:
    """Return the report object held by this sink."""
    sink_report = self.report
    return sink_report
146
151
 
152
def register_pre_shutdown_callback(self, callback: Callable[[], None]) -> None:
    """Queue ``callback`` to be invoked when the sink is closed.

    Callbacks are kept in registration order. Use this for work that has to
    happen while the sink is still usable, such as sending a final report.
    """
    pending_callbacks = self._pre_shutdown_callbacks
    pending_callbacks.append(callback)
159
+
147
160
def close(self) -> None:
    """Run the registered pre-shutdown callbacks, in registration order.

    A callback that raises is logged as a warning (with traceback) and does
    not prevent the remaining callbacks from running. Subclasses that
    override this method should call ``super().close()`` so the callbacks
    still execute.
    """
    for pre_shutdown_hook in self._pre_shutdown_callbacks:
        try:
            pre_shutdown_hook()
        except Exception as e:
            # Best effort: one failing hook must not block the others.
            logger.warning(f"Pre-shutdown callback failed: {e}", exc_info=True)
149
174
 
150
175
  def configured(self) -> str:
151
176
  """Override this method to output a human-readable and scrubbed version of the configured sink"""