acryl-datahub: acryl_datahub-1.0.0rc18-py3-none-any.whl → acryl_datahub-1.3.0.1rc9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import logging
2
2
  import random
3
+ import sys
3
4
  from concurrent.futures import ThreadPoolExecutor, as_completed
4
5
  from dataclasses import dataclass
5
6
  from datetime import datetime
@@ -15,8 +16,8 @@ from datahub.cli import cli_utils
15
16
  from datahub.configuration.datetimes import ClickDatetime
16
17
  from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
17
18
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
19
+ from datahub.ingestion.graph.config import ClientMode
18
20
  from datahub.ingestion.graph.filters import RemovedStatusFilter
19
- from datahub.telemetry import telemetry
20
21
  from datahub.upgrade import upgrade
21
22
  from datahub.utilities.perf_timer import PerfTimer
22
23
  from datahub.utilities.urns.urn import guess_entity_type
@@ -48,7 +49,7 @@ def delete() -> None:
48
49
 
49
50
  See `datahub delete by-filter` for the list of available filters.
50
51
 
51
- See https://datahubproject.io/docs/how/delete-metadata for more detailed docs.
52
+ See https://docs.datahub.com/docs/how/delete-metadata for more detailed docs.
52
53
  """
53
54
  pass
54
55
 
@@ -114,7 +115,7 @@ class DeletionResult:
114
115
  help="specifies soft/hard deletion",
115
116
  )
116
117
  @click.option("-n", "--dry-run", required=False, is_flag=True)
117
- @telemetry.with_telemetry()
118
+ @upgrade.check_upgrade
118
119
  def by_registry(
119
120
  registry_id: str,
120
121
  soft: bool,
@@ -124,7 +125,7 @@ def by_registry(
124
125
  Delete all metadata written using the given registry id and version pair.
125
126
  """
126
127
 
127
- client = get_default_graph()
128
+ client = get_default_graph(ClientMode.CLI)
128
129
 
129
130
  if soft and not dry_run:
130
131
  raise click.UsageError(
@@ -169,13 +170,13 @@ def by_registry(
169
170
  @click.option(
170
171
  "-f", "--force", required=False, is_flag=True, help="force the delete if set"
171
172
  )
172
- @telemetry.with_telemetry()
173
+ @upgrade.check_upgrade
173
174
  def references(urn: str, dry_run: bool, force: bool) -> None:
174
175
  """
175
176
  Delete all references to an entity (but not the entity itself).
176
177
  """
177
178
 
178
- graph = get_default_graph()
179
+ graph = get_default_graph(ClientMode.CLI)
179
180
  logger.info(f"Using graph: {graph}")
180
181
 
181
182
  references_count, related_aspects = graph.delete_references_to_urn(
@@ -230,15 +231,16 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
230
231
  default=3000,
231
232
  type=int,
232
233
  help="Batch size when querying for entities to un-soft delete."
233
- "Maximum 10000. Large batch sizes may cause timeouts.",
234
+ "Maximum 5000. Large batch sizes may cause timeouts.",
234
235
  )
236
+ @upgrade.check_upgrade
235
237
  def undo_by_filter(
236
238
  urn: Optional[str], platform: Optional[str], batch_size: int
237
239
  ) -> None:
238
240
  """
239
241
  Undo soft deletion by filters
240
242
  """
241
- graph = get_default_graph()
243
+ graph = get_default_graph(ClientMode.CLI)
242
244
  logger.info(f"Using {graph}")
243
245
  if urn:
244
246
  graph.set_soft_delete_status(urn=urn, delete=False)
@@ -316,6 +318,19 @@ def undo_by_filter(
316
318
  is_flag=True,
317
319
  help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
318
320
  )
321
+ @click.option(
322
+ "--streaming-batch",
323
+ required=False,
324
+ is_flag=True,
325
+ help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
326
+ )
327
+ @click.option(
328
+ "--streaming-batch-size",
329
+ required=False,
330
+ default=12000,
331
+ type=int,
332
+ help="Batch size for streaming batch deletion for recursive operations.",
333
+ )
319
334
  @click.option(
320
335
  "--start-time",
321
336
  required=False,
@@ -335,7 +350,7 @@ def undo_by_filter(
335
350
  default=3000,
336
351
  type=int,
337
352
  help="Batch size when querying for entities to delete."
338
- "Maximum 10000. Large batch sizes may cause timeouts.",
353
+ "Maximum 5000. Large batch sizes may cause timeouts.",
339
354
  )
340
355
  @click.option(
341
356
  "-n",
@@ -355,7 +370,6 @@ def undo_by_filter(
355
370
  "--workers", type=int, default=1, help="Num of workers to use for deletion."
356
371
  )
357
372
  @upgrade.check_upgrade
358
- @telemetry.with_telemetry()
359
373
  def by_filter(
360
374
  urn: Optional[str],
361
375
  urn_file: Optional[str],
@@ -367,6 +381,8 @@ def by_filter(
367
381
  entity_type: Optional[str],
368
382
  query: Optional[str],
369
383
  recursive: bool,
384
+ streaming_batch: bool,
385
+ streaming_batch_size: int,
370
386
  start_time: Optional[datetime],
371
387
  end_time: Optional[datetime],
372
388
  batch_size: int,
@@ -385,6 +401,7 @@ def by_filter(
385
401
  env=env,
386
402
  query=query,
387
403
  recursive=recursive,
404
+ streaming_batch=streaming_batch,
388
405
  )
389
406
  soft_delete_filter = _validate_user_soft_delete_flags(
390
407
  soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
@@ -395,8 +412,8 @@ def by_filter(
395
412
 
396
413
  if not force and not soft and not dry_run:
397
414
  message = (
398
- "Hard deletion will permanently delete data from DataHub and can be slow. "
399
- "We generally recommend using soft deletes instead. "
415
+ "Hard deletion will permanently delete data and can significantly slow down your instance while being executed. "
416
+ "We strongly recommend using soft deletes instead. "
400
417
  "Do you want to continue?"
401
418
  )
402
419
  if only_soft_deleted:
@@ -410,32 +427,33 @@ def by_filter(
410
427
  abort=True,
411
428
  )
412
429
 
413
- graph = get_default_graph()
430
+ graph = get_default_graph(ClientMode.CLI)
414
431
  logger.info(f"Using {graph}")
415
432
 
416
433
  # Determine which urns to delete.
417
434
  delete_by_urn = bool(urn) and not recursive
418
435
  if urn:
419
- urns = [urn]
420
-
421
436
  if recursive:
422
- # Add children urns to the list.
423
- if guess_entity_type(urn) == "dataPlatformInstance":
424
- urns.extend(
425
- graph.get_urns_by_filter(
426
- platform_instance=urn,
427
- status=soft_delete_filter,
428
- batch_size=batch_size,
429
- )
430
- )
431
- else:
432
- urns.extend(
433
- graph.get_urns_by_filter(
434
- container=urn,
435
- status=soft_delete_filter,
436
- batch_size=batch_size,
437
- )
438
- )
437
+ _delete_urns_streaming_recursive(
438
+ graph=graph,
439
+ parent_urn=urn,
440
+ aspect_name=aspect,
441
+ soft=soft,
442
+ dry_run=dry_run,
443
+ start_time=start_time,
444
+ end_time=end_time,
445
+ workers=workers,
446
+ soft_delete_filter=soft_delete_filter,
447
+ batch_size=batch_size,
448
+ force=force,
449
+ streaming_batch_size=streaming_batch_size
450
+ if streaming_batch
451
+ else sys.maxsize,
452
+ )
453
+ return
454
+
455
+ else:
456
+ urns = [urn]
439
457
  elif urn_file:
440
458
  with open(urn_file, "r") as r:
441
459
  urns = []
@@ -451,6 +469,7 @@ def by_filter(
451
469
  query=query,
452
470
  status=soft_delete_filter,
453
471
  batch_size=batch_size,
472
+ skip_cache=True,
454
473
  )
455
474
  )
456
475
  if len(urns) == 0:
@@ -556,6 +575,7 @@ def _validate_user_urn_and_filters(
556
575
  env: Optional[str],
557
576
  query: Optional[str],
558
577
  recursive: bool,
578
+ streaming_batch: bool,
559
579
  ) -> None:
560
580
  # Check urn / filters options.
561
581
  if urn:
@@ -591,6 +611,12 @@ def _validate_user_urn_and_filters(
591
611
  f"This will only delete {urn}. Use --recursive to delete all contained entities."
592
612
  )
593
613
 
614
+ # Check streaming flag.
615
+ if streaming_batch and not recursive:
616
+ raise click.UsageError(
617
+ "The --streaming-batch flag can only be used with --recursive."
618
+ )
619
+
594
620
 
595
621
  def _validate_user_soft_delete_flags(
596
622
  soft: bool, aspect: Optional[str], only_soft_deleted: bool
@@ -653,8 +679,8 @@ def _validate_user_aspect_flags(
653
679
  def _validate_batch_size(batch_size: int) -> None:
654
680
  if batch_size <= 0:
655
681
  raise click.UsageError("Batch size must be a positive integer.")
656
- elif batch_size > 10000:
657
- raise click.UsageError("Batch size cannot exceed 10,000.")
682
+ elif batch_size > 5000:
683
+ raise click.UsageError("Batch size cannot exceed 5,000.")
658
684
 
659
685
 
660
686
  def _delete_one_urn(
@@ -737,3 +763,76 @@ def _delete_one_urn(
737
763
  num_timeseries_records=ts_rows_affected,
738
764
  num_referenced_entities=referenced_entities_affected,
739
765
  )
766
+
767
+
768
def _delete_urns_streaming_recursive(
    graph: DataHubGraph,
    parent_urn: str,
    aspect_name: Optional[str],
    soft: bool,
    dry_run: bool,
    start_time: Optional[datetime],
    end_time: Optional[datetime],
    workers: int,
    soft_delete_filter: RemovedStatusFilter,
    batch_size: int,
    force: bool,
    streaming_batch_size: int,
) -> None:
    """Recursively delete the entities contained in ``parent_urn``, then the parent.

    Child urns are read lazily from ``graph.get_urns_by_filter`` and handed to
    ``_delete_urns_parallel`` every time ``streaming_batch_size`` of them have been
    collected. The parent urn is appended to the last (possibly empty) batch, so it
    is deleted only after every child batch has been submitted.

    Args:
        graph: Client used both to list the children and to delete them.
        parent_urn: Urn whose children are deleted. Children are matched by
            ``platform_instance`` when the urn's entity type is
            ``dataPlatformInstance`` and by ``container`` otherwise.
        aspect_name, soft, dry_run, start_time, end_time, workers: Forwarded
            unchanged to ``_delete_urns_parallel``.
        soft_delete_filter: Removed-status filter applied when listing children.
        batch_size: Page size used when listing children.
        force: Skip the interactive confirmation when true.
        streaming_batch_size: Number of urns collected before a batch is deleted.
    """
    entity_type = guess_entity_type(parent_urn)
    click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")

    # Ask for confirmation unless the user forced the run or it is a dry run.
    if not (force or dry_run):
        click.confirm(
            f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
            abort=True,
        )

    def _flush(pending_urns):
        # Delete one accumulated batch of urns.
        _delete_urns_parallel(
            graph=graph,
            urns=pending_urns,
            aspect_name=aspect_name,
            soft=soft,
            dry_run=dry_run,
            delete_by_urn=False,
            start_time=start_time,
            end_time=end_time,
            workers=workers,
        )

    # The only difference between the two listing modes is the filter keyword.
    if entity_type == "dataPlatformInstance":
        parent_filter = {"platform_instance": parent_urn}
    else:
        parent_filter = {"container": parent_urn}

    child_urns_iter = graph.get_urns_by_filter(
        status=soft_delete_filter,
        batch_size=batch_size,
        # Important to skip cache so we can resume from where we left off.
        skip_cache=True,
        **parent_filter,
    )

    pending = []
    for child_urn in child_urns_iter:
        pending.append(child_urn)
        if len(pending) < streaming_batch_size:
            continue
        _flush(pending)
        pending = []

    # The parent goes into the final batch, after all of its children.
    pending.append(parent_urn)
    _flush(pending)
@@ -1,8 +1,9 @@
1
1
  import enum
2
2
  import os
3
+ import pathlib
3
4
  from contextlib import contextmanager
4
5
  from dataclasses import dataclass
5
- from typing import Any, Dict, Iterator, List, Optional
6
+ from typing import Any, Dict, Iterator, List, Optional, Set
6
7
 
7
8
  import docker
8
9
  import docker.errors
@@ -10,11 +11,13 @@ import docker.models.containers
10
11
  import yaml
11
12
 
12
13
  from datahub.configuration.common import ExceptionWithProps
14
+ from datahub.configuration.env_vars import get_compose_project_name
13
15
 
14
16
  # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
15
- MIN_MEMORY_NEEDED = 3.8 # GB
17
+ MIN_MEMORY_NEEDED = 4.3 # GB
18
+ MIN_DISK_SPACE_NEEDED = 13 # GB
16
19
 
17
- DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
20
+ DOCKER_COMPOSE_PROJECT_NAME = get_compose_project_name()
18
21
  DATAHUB_COMPOSE_PROJECT_FILTER = {
19
22
  "label": f"com.docker.compose.project={DOCKER_COMPOSE_PROJECT_NAME}"
20
23
  }
@@ -37,6 +40,10 @@ class DockerLowMemoryError(Exception):
37
40
  SHOW_STACK_TRACE = False
38
41
 
39
42
 
43
class DockerLowDiskSpaceError(Exception):
    """Raised by the quickstart preflight checks when Docker has too little free disk space."""

    # NOTE(review): presumably read by the CLI error handler to suppress the
    # traceback for this error, like the sibling Docker*Error classes - confirm.
    SHOW_STACK_TRACE = False
45
+
46
+
40
47
  class DockerComposeVersionError(Exception):
41
48
  SHOW_STACK_TRACE = False
42
49
 
@@ -102,6 +109,24 @@ def run_quickstart_preflight_checks(client: docker.DockerClient) -> None:
102
109
  "You can increase the memory allocated to Docker in the Docker settings."
103
110
  )
104
111
 
112
+ result = client.containers.run(
113
+ "alpine:latest",
114
+ "sh -c \"df -B1 -P / | awk 'NR==2{print $2, $4}'\"", # total, available
115
+ remove=True,
116
+ stdout=True,
117
+ stderr=True,
118
+ )
119
+
120
+ output = result.decode("utf-8").strip()
121
+ total_bytes, available_bytes = map(int, output.split())
122
+
123
+ available_gb = available_bytes / (1024**3)
124
+ if available_gb < MIN_DISK_SPACE_NEEDED:
125
+ raise DockerLowDiskSpaceError(
126
+ f"Total Docker disk space available {available_gb:.2f}GB is below the minimum threshold {MIN_DISK_SPACE_NEEDED}GB. "
127
+ "You can increase the disk space allocated to Docker in the Docker settings or free up disk space`"
128
+ )
129
+
105
130
 
106
131
  class ContainerStatus(enum.Enum):
107
132
  OK = "is ok"
@@ -126,10 +151,24 @@ class DockerContainerStatus:
126
151
  @dataclass
127
152
  class QuickstartStatus:
128
153
  containers: List[DockerContainerStatus]
154
+ volumes: Set[str]
155
+ # On moving to compose profiles, this CLI will no longer support running quickstart instances from earlier versions.
156
+ # While the check command can work, upgrades or
157
+ running_unsupported_version: bool
158
+
159
+ def __init__(
160
+ self,
161
+ containers: List[DockerContainerStatus],
162
+ volumes: List[str],
163
+ running_unsupported_version: bool = False,
164
+ ):
165
+ self.containers = containers
166
+ self.running_unsupported_version = running_unsupported_version
167
+ self.volumes = set(volumes)
129
168
 
130
169
  def errors(self) -> List[str]:
131
170
  if not self.containers:
132
- return ["quickstart.sh or dev.sh is not running"]
171
+ return ["datahub is not running"]
133
172
 
134
173
  return [
135
174
  f"{container.name} {container.status.value}"
@@ -176,6 +215,26 @@ class QuickstartStatus:
176
215
  },
177
216
  )
178
217
 
218
+ def get_containers(self) -> Set[str]:
219
+ if self.containers:
220
+ return {container.name for container in self.containers}
221
+ else:
222
+ return set()
223
+
224
+
225
def detect_legacy_quickstart_compose(containers: Set[str]) -> bool:
    """Return True when a service named ``zookeeper`` is among ``containers``.

    The presence of that service is the marker used to flag a legacy quickstart
    compose setup.
    """
    legacy_marker = "zookeeper"
    return legacy_marker in containers
227
+
228
+
229
def _get_services_from_compose(compose_file: str) -> Set[str]:
    """Return the service names declared in a docker-compose file.

    Args:
        compose_file: Path of the compose YAML file to read.

    Returns:
        The keys of the top-level ``services`` mapping as a real ``set``. The set
        is empty when the file is empty or when ``services`` is missing or null.
    """
    with open(compose_file) as config_file:
        # safe_load returns None for an empty document; treat that as "no config".
        compose_config = yaml.safe_load(config_file) or {}
    # A bare "services:" key parses to None, so fall back to an empty mapping.
    # Materialize a set so the result matches the annotation instead of a dict view.
    return set(compose_config.get("services") or {})
232
+
233
+
234
def _get_volumes_from_compose(compose_file: str) -> Set[str]:
    """Return the volume names declared in a docker-compose file.

    Args:
        compose_file: Path of the compose YAML file to read.

    Returns:
        The keys of the top-level ``volumes`` mapping as a real ``set``. The set
        is empty when the file is empty or when ``volumes`` is missing or null.
    """
    with open(compose_file) as config_file:
        # safe_load returns None for an empty document; treat that as "no config".
        compose_config = yaml.safe_load(config_file) or {}
    # A bare "volumes:" key parses to None, so fall back to an empty mapping.
    # Materialize a set so the result matches the annotation instead of a dict view.
    return set(compose_config.get("volumes") or {})
237
+
179
238
 
180
239
  def check_docker_quickstart() -> QuickstartStatus:
181
240
  container_statuses: List[DockerContainerStatus] = []
@@ -188,7 +247,7 @@ def check_docker_quickstart() -> QuickstartStatus:
188
247
  ignore_removed=True,
189
248
  )
190
249
  if len(containers) == 0:
191
- return QuickstartStatus([])
250
+ return QuickstartStatus([], [], running_unsupported_version=False)
192
251
 
193
252
  # load the expected containers from the docker-compose file
194
253
  config_files = (
@@ -197,16 +256,17 @@ def check_docker_quickstart() -> QuickstartStatus:
197
256
  .split(",")
198
257
  )
199
258
 
200
- # If using profiles, alternative check
259
+ # If using profiles, alternative check ##TODO: Does this really work? Check mixpanel for usage of this.
201
260
  if config_files and "/profiles/" in config_files[0]:
202
261
  return check_docker_quickstart_profiles(client)
203
262
 
204
263
  all_containers = set()
205
264
  for config_file in config_files:
206
- with open(config_file) as config_file:
207
- all_containers.update(
208
- yaml.safe_load(config_file).get("services", {}).keys()
209
- )
265
+ all_containers.update(_get_services_from_compose(config_file))
266
+
267
+ all_volumes = set()
268
+ for config_file in config_files:
269
+ all_volumes.update(_get_volumes_from_compose(config_file))
210
270
 
211
271
  existing_containers = set()
212
272
  # Check that the containers are running and healthy.
@@ -240,8 +300,12 @@ def check_docker_quickstart() -> QuickstartStatus:
240
300
  container_statuses.append(
241
301
  DockerContainerStatus(missing, ContainerStatus.MISSING)
242
302
  )
243
-
244
- return QuickstartStatus(container_statuses)
303
+ running_unsupported_version = detect_legacy_quickstart_compose(all_containers)
304
+ return QuickstartStatus(
305
+ containers=container_statuses,
306
+ volumes=list(all_volumes),
307
+ running_unsupported_version=running_unsupported_version,
308
+ )
245
309
 
246
310
 
247
311
  def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
@@ -254,7 +318,7 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
254
318
  ignore_removed=True,
255
319
  )
256
320
  if len(containers) == 0:
257
- return QuickstartStatus([])
321
+ return QuickstartStatus([], [], running_unsupported_version=False)
258
322
 
259
323
  existing_containers = set()
260
324
  # Check that the containers are running and healthy.
@@ -273,4 +337,36 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
273
337
 
274
338
  container_statuses.append(DockerContainerStatus(name, status))
275
339
 
276
- return QuickstartStatus(container_statuses)
340
+ # TODO: Can this be handled with older versions?
341
+ return QuickstartStatus(
342
+ container_statuses, volumes=[], running_unsupported_version=False
343
+ )
344
+
345
+
346
def check_upgrade_supported(
    quickstart_compose_file: List[pathlib.Path], quickstart_status: QuickstartStatus
) -> bool:
    """Decide whether the running quickstart can be upgraded with the given compose files.

    Args:
        quickstart_compose_file: Compose files describing the target deployment.
        quickstart_status: Status of the currently running quickstart.

    Returns:
        False when a legacy quickstart was detected. True when no containers are
        running. Otherwise True only if the services and volumes declared across
        the compose files equal the containers and volumes recorded in
        ``quickstart_status``.
    """
    # A detected legacy quickstart service can never be upgraded in place.
    if quickstart_status.running_unsupported_version:
        return False

    # Nothing is running, so there is no existing state to be incompatible with.
    running_containers = quickstart_status.get_containers()
    if not running_containers:
        return True

    expected_services = set()
    expected_volumes = set()
    for compose_path in quickstart_compose_file:
        path_str = str(compose_path)
        expected_services.update(_get_services_from_compose(path_str))
        expected_volumes.update(_get_volumes_from_compose(path_str))

    # If the services and volumes differ, the state held in the volumes may not be
    # compatible with the new services. The comparison is against what the compose
    # files declare, not against everything that might be present.
    return (
        expected_services == running_containers
        and expected_volumes == quickstart_status.volumes
    )