acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,10 +1,10 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import Iterable, List, Optional, Type, Union

 import yaml
-from pydantic import StrictStr, validator
+from pydantic import Field, StrictStr, validator
 from ruamel.yaml import YAML

 from datahub.configuration.common import ConfigModel
@@ -48,7 +48,7 @@ VALID_ENTITY_TYPE_URNS = [
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."


-def _validate_entity_type_urn(v: str) -> str:
+def _validate_entity_type_urn(cls: Type, v: str) -> str:
     urn = Urn.make_entity_type_urn(v)
     if urn not in VALID_ENTITY_TYPE_URNS:
         raise ValueError(
@@ -68,7 +68,7 @@ class TypeQualifierAllowedTypes(ConfigModel):

 class StructuredProperties(ConfigModel):
     id: Optional[str] = None
-    urn: Optional[str] = None
+    urn: Optional[str] = Field(None, validate_default=True)
     qualified_name: Optional[str] = None
     type: str
     value_entity_types: Optional[List[str]] = None
datahub/api/graphql/operation.py CHANGED
@@ -1,7 +1,7 @@
 import logging
 from typing import Any, Dict, List, Optional

-from gql import gql
+from gql import GraphQLRequest

 from datahub.api.graphql.base import BaseApi

@@ -79,10 +79,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
         if custom_properties is not None:
             variable_values["customProperties"] = custom_properties

-        result = self.client.execute(
-            gql(Operation.REPORT_OPERATION_MUTATION), variable_values
+        request = GraphQLRequest(
+            Operation.REPORT_OPERATION_MUTATION, variable_values=variable_values
         )

+        result = self.client.execute(request)
+
         return result["reportOperation"]

     def query_operations(
@@ -109,12 +111,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
         :param partition: The partition to check the operation.
         """

-        result = self.client.execute(
-            gql(Operation.QUERY_OPERATIONS),
+        request = GraphQLRequest(
+            Operation.QUERY_OPERATIONS,
             variable_values={
                 "urn": urn,
                 "startTimeMillis": start_time_millis,
-                "end_time_millis": end_time_millis,
+                "endTimeMillis": end_time_millis,
                 "limit": limit,
                 "filter": self.gen_filter(
                     {
@@ -125,6 +127,8 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
                 ),
             },
         )
+
+        result = self.client.execute(request)
         if "dataset" in result and "operations" in result["dataset"]:
             operations = []
             if source_type is not None:
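The change above switches the GraphQL helper from passing a parsed gql() document plus a separate variable_values argument to bundling both into a single request object. A minimal, hedged sketch of the same pattern against a placeholder endpoint, assuming a gql release that exposes GraphQLRequest (as the import in this diff does); the URL, query, and URN below are illustrative only:

# Sketch: query + variables bundled into one GraphQLRequest (placeholder endpoint).
from gql import Client, GraphQLRequest
from gql.transport.requests import RequestsHTTPTransport

transport = RequestsHTTPTransport(url="https://example.com/api/graphql")
client = Client(transport=transport, fetch_schema_from_transport=False)

QUERY = """
query getDataset($urn: String!) {
  dataset(urn: $urn) { urn }
}
"""

request = GraphQLRequest(
    QUERY,
    variable_values={"urn": "urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)"},
)
result = client.execute(request)
print(result)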
datahub/cli/check_cli.py CHANGED
@@ -9,6 +9,7 @@ from datetime import datetime
 from typing import Any, Dict, List, Optional, Union

 import click
+from tabulate import tabulate

 from datahub._version import __package_name__
 from datahub.cli.json_file import check_mce_file
@@ -21,7 +22,7 @@ from datahub.ingestion.run.pipeline import Pipeline
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
 from datahub.ingestion.transformer.transform_registry import transform_registry
-from datahub.telemetry import telemetry
+from datahub.upgrade import upgrade
 from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedDict,
@@ -47,7 +48,6 @@ def check() -> None:
 @click.option(
     "--unpack-mces", default=False, is_flag=True, help="Converts MCEs into MCPs"
 )
-@telemetry.with_telemetry()
 def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None:
     """Check the schema of a metadata (MCE or MCP) JSON file."""

@@ -105,7 +105,6 @@ def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None:
     default=(),
     help="[Advanced] Paths in the deepdiff object to ignore",
 )
-@telemetry.with_telemetry()
 def metadata_diff(
     actual_file: str, expected_file: str, verbose: bool, ignore_path: List[str]
 ) -> None:
@@ -142,7 +141,6 @@ def metadata_diff(
     type=str,
     default=None,
 )
-@telemetry.with_telemetry()
 def plugins(source: Optional[str], verbose: bool) -> None:
     """List the enabled ingestion plugins."""

@@ -234,7 +232,7 @@ def sql_format(sql: str, platform: str) -> None:
     default=True,
     help="Run in offline mode and disable schema-aware parsing.",
 )
-@telemetry.with_telemetry()
+@upgrade.check_upgrade
 def sql_lineage(
     sql: Optional[str],
     sql_file: Optional[str],
@@ -297,7 +295,6 @@ def sql_lineage(
     type=str,
     help="the input to validate",
 )
-@telemetry.with_telemetry()
 def test_allow_deny(config: str, input: str, pattern_key: str) -> None:
     """Test input string against AllowDeny pattern in a DataHub recipe.

@@ -346,7 +343,6 @@ def test_allow_deny(config: str, input: str, pattern_key: str) -> None:
     type=str,
     help="The input to validate",
 )
-@telemetry.with_telemetry()
 def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
     """Test input path string against PathSpec patterns in a DataHub recipe.

@@ -471,6 +467,7 @@ WHERE


 @check.command()
+@upgrade.check_upgrade
 def server_config() -> None:
     """Print the server config."""
     graph = get_default_graph(ClientMode.CLI)
@@ -478,3 +475,87 @@ def server_config() -> None:
     server_config = graph.get_server_config()

     click.echo(pprint.pformat(server_config))
+
+
+@check.command()
+@click.option(
+    "--urn", required=False, help="The urn or urn pattern (supports % for wildcard)"
+)
+@click.option("--aspect", default=None, help="Filter to a specific aspect name.")
+@click.option(
+    "--start", type=int, default=None, help="Row number of sql store to restore from."
+)
+@click.option("--batch-size", type=int, default=None, help="How many rows to restore.")
+@click.option(
+    "--file",
+    required=False,
+    type=click.Path(exists=True, dir_okay=True, readable=True),
+    help="File absolute path containing URNs (one per line) to restore indices",
+)
+@upgrade.check_upgrade
+def restore_indices(
+    urn: Optional[str],
+    aspect: Optional[str],
+    start: Optional[int],
+    batch_size: Optional[int],
+    file: Optional[str],
+) -> None:
+    """Resync metadata changes into the search and graph indices."""
+    if urn is None and file is None:
+        raise click.UsageError("Either --urn or --file must be provided")
+    graph = get_default_graph(ClientMode.CLI)
+
+    graph.restore_indices(
+        urn_pattern=urn,
+        aspect=aspect,
+        start=start,
+        batch_size=batch_size,
+        file=file,
+    )
+
+
+@check.command()
+@upgrade.check_upgrade
+def get_kafka_consumer_offsets() -> None:
+    """Get Kafka consumer offsets from the DataHub API."""
+    graph = get_default_graph(ClientMode.CLI)
+    result = graph.get_kafka_consumer_offsets()
+
+    table_data = []
+    headers = [
+        "Topic",
+        "Consumer Group",
+        "Schema",
+        "Partition",
+        "Offset",
+        "Lag",
+        "Avg Lag",
+        "Max Lag",
+        "Total Lag",
+    ]
+
+    for topic, consumers in result.items():
+        for consumer_group, schemas in consumers.items():
+            for schema, data in schemas.items():
+                metrics = data.get("metrics", {})
+                partitions = data.get("partitions", {})
+
+                for partition, partition_data in partitions.items():
+                    table_data.append(
+                        [
+                            topic,
+                            consumer_group,
+                            schema,
+                            partition,
+                            partition_data.get("offset", "N/A"),
+                            partition_data.get("lag", "N/A"),
+                            metrics.get("avgLag", "N/A"),
+                            metrics.get("maxLag", "N/A"),
+                            metrics.get("totalLag", "N/A"),
+                        ]
+                    )
+
+    if table_data:
+        click.echo(tabulate(table_data, headers=headers, tablefmt="grid"))
+    else:
+        click.echo("No Kafka consumer offset data found.")
datahub/cli/cli_utils.py CHANGED
@@ -3,6 +3,7 @@ import logging
 import time
 import typing
 from datetime import datetime
+from functools import wraps
 from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union

 import click
@@ -424,3 +425,65 @@ def ensure_has_system_metadata(
     props = metadata.properties
     props["clientId"] = datahub_version.__package_name__
     props["clientVersion"] = datahub_version.__version__
+
+
+def enable_auto_decorators(main_group: click.Group) -> None:
+    """
+    Enable automatic decorators for all click commands.
+    This wraps existing command callback functions to add upgrade and telemetry decorators.
+    """
+
+    def has_decorator(func: Any, module_pattern: str, function_pattern: str) -> bool:
+        """Check if function already has a specific decorator"""
+        if hasattr(func, "__wrapped__"):
+            current_func = func
+            while hasattr(current_func, "__wrapped__"):
+                # Check if this wrapper matches the module and function patterns
+                if (
+                    hasattr(current_func, "__module__")
+                    and module_pattern in current_func.__module__
+                    and hasattr(current_func, "__name__")
+                    and function_pattern in current_func.__name__
+                ):
+                    return True
+                current_func = current_func.__wrapped__
+        return False
+
+    def has_telemetry_decorator(func):
+        return has_decorator(func, "telemetry", "with_telemetry")
+
+    def wrap_command_callback(command_obj):
+        """Wrap a command's callback function to add decorators"""
+        if hasattr(command_obj, "callback") and command_obj.callback:
+            original_callback = command_obj.callback
+
+            # Import here to avoid circular imports
+            from datahub.telemetry import telemetry
+
+            decorated_callback = original_callback
+
+            if not has_telemetry_decorator(decorated_callback):
+                log.debug(
+                    f"Applying telemetry decorator to {original_callback.__module__}.{original_callback.__name__}"
+                )
+                decorated_callback = telemetry.with_telemetry()(decorated_callback)
+
+            # Preserve the original function's metadata
+            decorated_callback = wraps(original_callback)(decorated_callback)
+
+            command_obj.callback = decorated_callback
+
+    def wrap_group_commands(group_obj):
+        """Recursively wrap all commands in a group"""
+        if hasattr(group_obj, "commands"):
+            for _, command_obj in group_obj.commands.items():
+                if isinstance(command_obj, click.Group):
+                    # Recursively wrap sub-groups
+                    wrap_group_commands(command_obj)
+                else:
+                    # Wrap individual commands
+                    wrap_command_callback(command_obj)
+
+    wrap_group_commands(main_group)
+
+    log.debug("Auto-decorators enabled successfully")
datahub/cli/config_utils.py CHANGED
@@ -11,7 +11,16 @@ import click
 import yaml
 from pydantic import BaseModel, ValidationError

-from datahub.cli.env_utils import get_boolean_env_variable
+from datahub.configuration.env_vars import (
+    get_gms_host,
+    get_gms_port,
+    get_gms_protocol,
+    get_gms_token,
+    get_gms_url,
+    get_skip_config,
+    get_system_client_id,
+    get_system_client_secret,
+)
 from datahub.ingestion.graph.config import DatahubClientConfig

 logger = logging.getLogger(__name__)
@@ -36,15 +45,15 @@ class MissingConfigError(Exception):


 def get_system_auth() -> Optional[str]:
-    system_client_id = os.environ.get(ENV_DATAHUB_SYSTEM_CLIENT_ID)
-    system_client_secret = os.environ.get(ENV_DATAHUB_SYSTEM_CLIENT_SECRET)
+    system_client_id = get_system_client_id()
+    system_client_secret = get_system_client_secret()
     if system_client_id is not None and system_client_secret is not None:
         return f"Basic {system_client_id}:{system_client_secret}"
     return None


 def _should_skip_config() -> bool:
-    return get_boolean_env_variable(ENV_SKIP_CONFIG, False)
+    return get_skip_config()


 def persist_raw_datahub_config(config: dict) -> None:
@@ -67,11 +76,11 @@ class DatahubConfig(BaseModel):


 def _get_config_from_env() -> Tuple[Optional[str], Optional[str]]:
-    host = os.environ.get(ENV_METADATA_HOST)
-    port = os.environ.get(ENV_METADATA_PORT)
-    token = os.environ.get(ENV_METADATA_TOKEN)
-    protocol = os.environ.get(ENV_METADATA_PROTOCOL, "http")
-    url = os.environ.get(ENV_METADATA_HOST_URL)
+    host = get_gms_host()
+    port = get_gms_port()
+    token = get_gms_token()
+    protocol = get_gms_protocol()
+    url = get_gms_url()
     if port is not None:
         url = f"{protocol}://{host}:{port}"
     return url, token
@@ -108,7 +117,6 @@ def load_client_config() -> DatahubClientConfig:
         datahub_config: DatahubClientConfig = DatahubConfig.parse_obj(
             client_config_dict
         ).gms
-
        return datahub_config
     except ValidationError as e:
         click.echo(f"Error loading your {CONDENSED_DATAHUB_CONFIG_PATH}")
datahub/cli/container_cli.py CHANGED
@@ -3,6 +3,7 @@ import logging
 import click

 from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container
+from datahub.upgrade import upgrade

 logger = logging.getLogger(__name__)

@@ -16,6 +17,7 @@ def container() -> None:
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--tag-urn", required=True, type=str)
+@upgrade.check_upgrade
 def tag(container_urn: str, tag_urn: str) -> None:
     """Add patch to add a tag to all datasets in a container"""
     apply_association_to_container(container_urn, tag_urn, "tag")
@@ -24,6 +26,7 @@ def tag(container_urn: str, tag_urn: str) -> None:
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--term-urn", required=True, type=str)
+@upgrade.check_upgrade
 def term(container_urn: str, term_urn: str) -> None:
     """Add patch to add a term to all datasets in a container"""
     apply_association_to_container(container_urn, term_urn, "term")
@@ -32,6 +35,7 @@ def term(container_urn: str, term_urn: str) -> None:
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--owner-urn", required=True, type=str)
+@upgrade.check_upgrade
 def owner(container_urn: str, owner_urn: str) -> None:
     """Add patch to add a owner to all datasets in a container"""
     apply_association_to_container(container_urn, owner_urn, "owner")
@@ -40,6 +44,7 @@ def owner(container_urn: str, owner_urn: str) -> None:
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--domain-urn", required=True, type=str)
+@upgrade.check_upgrade
 def domain(container_urn: str, domain_urn: str) -> None:
     """Add patch to add a domain to all datasets in a container"""
     apply_association_to_container(container_urn, domain_urn, "domain")
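As a usage sketch, the container group's commands shown above can be driven in-process with click's test runner. The URNs below are illustrative, and actually applying the association requires a reachable DataHub server configured for the CLI:

# Sketch: tagging all datasets in a container via the container CLI group.
from click.testing import CliRunner

from datahub.cli.container_cli import container

runner = CliRunner()
result = runner.invoke(
    container,
    ["tag", "--container-urn", "urn:li:container:example", "--tag-urn", "urn:li:tag:PII"],
)
print(result.output)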
datahub/cli/delete_cli.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 import random
+import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
@@ -17,7 +18,6 @@ from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.graph.filters import RemovedStatusFilter
-from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.urns.urn import guess_entity_type
@@ -115,7 +115,7 @@ class DeletionResult:
     help="specifies soft/hard deletion",
 )
 @click.option("-n", "--dry-run", required=False, is_flag=True)
-@telemetry.with_telemetry()
+@upgrade.check_upgrade
 def by_registry(
     registry_id: str,
     soft: bool,
@@ -170,7 +170,7 @@ def by_registry(
 @click.option(
     "-f", "--force", required=False, is_flag=True, help="force the delete if set"
 )
-@telemetry.with_telemetry()
+@upgrade.check_upgrade
 def references(urn: str, dry_run: bool, force: bool) -> None:
     """
     Delete all references to an entity (but not the entity itself).
@@ -231,8 +231,9 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
     default=3000,
     type=int,
     help="Batch size when querying for entities to un-soft delete."
-    "Maximum 10000. Large batch sizes may cause timeouts.",
+    "Maximum 5000. Large batch sizes may cause timeouts.",
 )
+@upgrade.check_upgrade
 def undo_by_filter(
     urn: Optional[str], platform: Optional[str], batch_size: int
 ) -> None:
@@ -317,6 +318,19 @@ def undo_by_filter(
     is_flag=True,
     help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
 )
+@click.option(
+    "--streaming-batch",
+    required=False,
+    is_flag=True,
+    help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
+)
+@click.option(
+    "--streaming-batch-size",
+    required=False,
+    default=12000,
+    type=int,
+    help="Batch size for streaming batch deletion for recursive operations.",
+)
 @click.option(
     "--start-time",
     required=False,
@@ -336,7 +350,7 @@ def undo_by_filter(
     default=3000,
     type=int,
     help="Batch size when querying for entities to delete."
-    "Maximum 10000. Large batch sizes may cause timeouts.",
+    "Maximum 5000. Large batch sizes may cause timeouts.",
 )
 @click.option(
     "-n",
@@ -356,7 +370,6 @@ def undo_by_filter(
     "--workers", type=int, default=1, help="Num of workers to use for deletion."
 )
 @upgrade.check_upgrade
-@telemetry.with_telemetry()
 def by_filter(
     urn: Optional[str],
     urn_file: Optional[str],
@@ -368,6 +381,8 @@ def by_filter(
     entity_type: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
+    streaming_batch_size: int,
     start_time: Optional[datetime],
     end_time: Optional[datetime],
     batch_size: int,
@@ -386,6 +401,7 @@ def by_filter(
         env=env,
         query=query,
         recursive=recursive,
+        streaming_batch=streaming_batch,
     )
     soft_delete_filter = _validate_user_soft_delete_flags(
         soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
@@ -417,26 +433,27 @@ def by_filter(
     # Determine which urns to delete.
     delete_by_urn = bool(urn) and not recursive
     if urn:
-        urns = [urn]
-
         if recursive:
-            # Add children urns to the list.
-            if guess_entity_type(urn) == "dataPlatformInstance":
-                urns.extend(
-                    graph.get_urns_by_filter(
-                        platform_instance=urn,
-                        status=soft_delete_filter,
-                        batch_size=batch_size,
-                    )
-                )
-            else:
-                urns.extend(
-                    graph.get_urns_by_filter(
-                        container=urn,
-                        status=soft_delete_filter,
-                        batch_size=batch_size,
-                    )
-                )
+            _delete_urns_streaming_recursive(
+                graph=graph,
+                parent_urn=urn,
+                aspect_name=aspect,
+                soft=soft,
+                dry_run=dry_run,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+                soft_delete_filter=soft_delete_filter,
+                batch_size=batch_size,
+                force=force,
+                streaming_batch_size=streaming_batch_size
+                if streaming_batch
+                else sys.maxsize,
+            )
+            return
+
+        else:
+            urns = [urn]
     elif urn_file:
         with open(urn_file, "r") as r:
             urns = []
@@ -452,6 +469,7 @@ def by_filter(
                 query=query,
                 status=soft_delete_filter,
                 batch_size=batch_size,
+                skip_cache=True,
             )
         )
     if len(urns) == 0:
@@ -557,6 +575,7 @@ def _validate_user_urn_and_filters(
     env: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
 ) -> None:
     # Check urn / filters options.
     if urn:
@@ -592,6 +611,12 @@ def _validate_user_urn_and_filters(
             f"This will only delete {urn}. Use --recursive to delete all contained entities."
         )

+    # Check streaming flag.
+    if streaming_batch and not recursive:
+        raise click.UsageError(
+            "The --streaming-batch flag can only be used with --recursive."
+        )
+

 def _validate_user_soft_delete_flags(
     soft: bool, aspect: Optional[str], only_soft_deleted: bool
@@ -654,8 +679,8 @@ def _validate_user_aspect_flags(
 def _validate_batch_size(batch_size: int) -> None:
     if batch_size <= 0:
         raise click.UsageError("Batch size must be a positive integer.")
-    elif batch_size > 10000:
-        raise click.UsageError("Batch size cannot exceed 10,000.")
+    elif batch_size > 5000:
+        raise click.UsageError("Batch size cannot exceed 5,000.")


 def _delete_one_urn(
@@ -738,3 +763,76 @@ def _delete_one_urn(
         num_timeseries_records=ts_rows_affected,
         num_referenced_entities=referenced_entities_affected,
     )
+
+
+def _delete_urns_streaming_recursive(
+    graph: DataHubGraph,
+    parent_urn: str,
+    aspect_name: Optional[str],
+    soft: bool,
+    dry_run: bool,
+    start_time: Optional[datetime],
+    end_time: Optional[datetime],
+    workers: int,
+    soft_delete_filter: RemovedStatusFilter,
+    batch_size: int,
+    force: bool,
+    streaming_batch_size: int,
+) -> None:
+    """Streaming recursive batch deletion that processes URNs in batches."""
+
+    entity_type = guess_entity_type(parent_urn)
+    click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")
+
+    if not force and not dry_run:
+        click.confirm(
+            f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
+            abort=True,
+        )
+
+    urns = []
+
+    if entity_type == "dataPlatformInstance":
+        child_urns_iter = graph.get_urns_by_filter(
+            platform_instance=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+    else:
+        child_urns_iter = graph.get_urns_by_filter(
+            container=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+
+    for child_urn in child_urns_iter:
+        urns.append(child_urn)
+        if len(urns) >= streaming_batch_size:
+            _delete_urns_parallel(
+                graph=graph,
+                urns=urns,
+                aspect_name=aspect_name,
+                soft=soft,
+                dry_run=dry_run,
+                delete_by_urn=False,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+            )
+            urns = []
+    urns.append(parent_urn)
+    _delete_urns_parallel(
+        graph=graph,
+        urns=urns,
+        aspect_name=aspect_name,
+        soft=soft,
+        dry_run=dry_run,
+        delete_by_urn=False,
+        start_time=start_time,
+        end_time=end_time,
+        workers=workers,