acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 import random
+import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
@@ -17,7 +18,6 @@ from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.graph.filters import RemovedStatusFilter
-from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.urns.urn import guess_entity_type
@@ -115,7 +115,7 @@ class DeletionResult:
     help="specifies soft/hard deletion",
 )
 @click.option("-n", "--dry-run", required=False, is_flag=True)
-@telemetry.with_telemetry()
+@upgrade.check_upgrade
 def by_registry(
     registry_id: str,
     soft: bool,
@@ -170,7 +170,7 @@ def by_registry(
 @click.option(
     "-f", "--force", required=False, is_flag=True, help="force the delete if set"
 )
-@telemetry.with_telemetry()
+@upgrade.check_upgrade
 def references(urn: str, dry_run: bool, force: bool) -> None:
     """
     Delete all references to an entity (but not the entity itself).
@@ -231,8 +231,9 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
     default=3000,
     type=int,
     help="Batch size when querying for entities to un-soft delete."
-    "Maximum 10000. Large batch sizes may cause timeouts.",
+    "Maximum 5000. Large batch sizes may cause timeouts.",
 )
+@upgrade.check_upgrade
 def undo_by_filter(
     urn: Optional[str], platform: Optional[str], batch_size: int
 ) -> None:
@@ -317,6 +318,19 @@ def undo_by_filter(
     is_flag=True,
     help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
 )
+@click.option(
+    "--streaming-batch",
+    required=False,
+    is_flag=True,
+    help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
+)
+@click.option(
+    "--streaming-batch-size",
+    required=False,
+    default=12000,
+    type=int,
+    help="Batch size for streaming batch deletion for recursive operations.",
+)
 @click.option(
     "--start-time",
     required=False,
@@ -336,7 +350,7 @@ def undo_by_filter(
     default=3000,
     type=int,
     help="Batch size when querying for entities to delete."
-    "Maximum 10000. Large batch sizes may cause timeouts.",
+    "Maximum 5000. Large batch sizes may cause timeouts.",
 )
 @click.option(
     "-n",
@@ -356,7 +370,6 @@
     "--workers", type=int, default=1, help="Num of workers to use for deletion."
 )
 @upgrade.check_upgrade
-@telemetry.with_telemetry()
 def by_filter(
     urn: Optional[str],
     urn_file: Optional[str],
@@ -368,6 +381,8 @@
     entity_type: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
+    streaming_batch_size: int,
     start_time: Optional[datetime],
     end_time: Optional[datetime],
     batch_size: int,
@@ -386,6 +401,7 @@
         env=env,
         query=query,
         recursive=recursive,
+        streaming_batch=streaming_batch,
     )
     soft_delete_filter = _validate_user_soft_delete_flags(
         soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
@@ -417,26 +433,27 @@
     # Determine which urns to delete.
     delete_by_urn = bool(urn) and not recursive
     if urn:
-        urns = [urn]
-
         if recursive:
-            # Add children urns to the list.
-            if guess_entity_type(urn) == "dataPlatformInstance":
-                urns.extend(
-                    graph.get_urns_by_filter(
-                        platform_instance=urn,
-                        status=soft_delete_filter,
-                        batch_size=batch_size,
-                    )
-                )
-            else:
-                urns.extend(
-                    graph.get_urns_by_filter(
-                        container=urn,
-                        status=soft_delete_filter,
-                        batch_size=batch_size,
-                    )
-                )
+            _delete_urns_streaming_recursive(
+                graph=graph,
+                parent_urn=urn,
+                aspect_name=aspect,
+                soft=soft,
+                dry_run=dry_run,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+                soft_delete_filter=soft_delete_filter,
+                batch_size=batch_size,
+                force=force,
+                streaming_batch_size=streaming_batch_size
+                if streaming_batch
+                else sys.maxsize,
+            )
+            return
+
+        else:
+            urns = [urn]
     elif urn_file:
         with open(urn_file, "r") as r:
             urns = []
@@ -557,6 +574,7 @@ def _validate_user_urn_and_filters(
     env: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
 ) -> None:
     # Check urn / filters options.
     if urn:
@@ -592,6 +610,12 @@
             f"This will only delete {urn}. Use --recursive to delete all contained entities."
         )
 
+    # Check streaming flag.
+    if streaming_batch and not recursive:
+        raise click.UsageError(
+            "The --streaming-batch flag can only be used with --recursive."
+        )
+
 
 def _validate_user_soft_delete_flags(
     soft: bool, aspect: Optional[str], only_soft_deleted: bool
@@ -654,8 +678,8 @@ def _validate_user_aspect_flags(
 def _validate_batch_size(batch_size: int) -> None:
     if batch_size <= 0:
         raise click.UsageError("Batch size must be a positive integer.")
-    elif batch_size > 10000:
-        raise click.UsageError("Batch size cannot exceed 10,000.")
+    elif batch_size > 5000:
+        raise click.UsageError("Batch size cannot exceed 5,000.")
 
 
 def _delete_one_urn(
@@ -738,3 +762,76 @@
         num_timeseries_records=ts_rows_affected,
         num_referenced_entities=referenced_entities_affected,
     )
+
+
+def _delete_urns_streaming_recursive(
+    graph: DataHubGraph,
+    parent_urn: str,
+    aspect_name: Optional[str],
+    soft: bool,
+    dry_run: bool,
+    start_time: Optional[datetime],
+    end_time: Optional[datetime],
+    workers: int,
+    soft_delete_filter: RemovedStatusFilter,
+    batch_size: int,
+    force: bool,
+    streaming_batch_size: int,
+) -> None:
+    """Streaming recursive batch deletion that processes URNs in batches."""
+
+    entity_type = guess_entity_type(parent_urn)
+    click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")
+
+    if not force and not dry_run:
+        click.confirm(
+            f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
+            abort=True,
+        )
+
+    urns = []
+
+    if entity_type == "dataPlatformInstance":
+        child_urns_iter = graph.get_urns_by_filter(
+            platform_instance=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+    else:
+        child_urns_iter = graph.get_urns_by_filter(
+            container=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+
+    for child_urn in child_urns_iter:
+        urns.append(child_urn)
+        if len(urns) >= streaming_batch_size:
+            _delete_urns_parallel(
+                graph=graph,
+                urns=urns,
+                aspect_name=aspect_name,
+                soft=soft,
+                dry_run=dry_run,
+                delete_by_urn=False,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+            )
+            urns = []
+    urns.append(parent_urn)
+    _delete_urns_parallel(
+        graph=graph,
+        urns=urns,
+        aspect_name=aspect_name,
+        soft=soft,
+        dry_run=dry_run,
+        delete_by_urn=False,
+        start_time=start_time,
+        end_time=end_time,
+        workers=workers,
+    )
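
The centerpiece of this release's delete changes is the new streaming recursive deletion. Rather than materializing every child URN of a container (or data platform instance) before deleting anything, `_delete_urns_streaming_recursive` consumes the URN iterator and flushes a batch every `--streaming-batch-size` URNs (default 12,000), deleting the parent URN last; when `--streaming-batch` is not passed, `by_filter` substitutes `sys.maxsize`, which collapses the loop back to the old collect-then-delete behavior. A minimal standalone sketch of that batching pattern, with a hypothetical `delete_batch` callback standing in for `_delete_urns_parallel`:

from typing import Callable, Iterable, List

def delete_in_batches(
    child_urns: Iterable[str],
    parent_urn: str,
    delete_batch: Callable[[List[str]], None],
    batch_size: int,
) -> None:
    # Buffer a potentially huge URN stream and flush every `batch_size` URNs,
    # mirroring the loop in _delete_urns_streaming_recursive.
    batch: List[str] = []
    for urn in child_urns:
        batch.append(urn)
        if len(batch) >= batch_size:
            delete_batch(batch)
            batch = []  # flushed URNs are never revisited, so interrupted runs can resume
    # The parent goes in the final flush, after all children are gone.
    batch.append(parent_urn)
    delete_batch(batch)

An invocation would look something like `datahub delete --urn "urn:li:container:<id>" --recursive --streaming-batch`; per the new `_validate_user_urn_and_filters` check, passing `--streaming-batch` without `--recursive` raises a `click.UsageError`.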
datahub/cli/docker_check.py CHANGED
@@ -1,8 +1,9 @@
 import enum
 import os
+import pathlib
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Set
 
 import docker
 import docker.errors
@@ -13,6 +14,7 @@ from datahub.configuration.common import ExceptionWithProps
 
 # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
 MIN_MEMORY_NEEDED = 3.8  # GB
+MIN_DISK_SPACE_NEEDED = 12  # GB
 
 DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
 DATAHUB_COMPOSE_PROJECT_FILTER = {
@@ -37,6 +39,10 @@ class DockerLowMemoryError(Exception):
     SHOW_STACK_TRACE = False
 
 
+class DockerLowDiskSpaceError(Exception):
+    SHOW_STACK_TRACE = False
+
+
 class DockerComposeVersionError(Exception):
     SHOW_STACK_TRACE = False
 
@@ -102,6 +108,24 @@ def run_quickstart_preflight_checks(client: docker.DockerClient) -> None:
             "You can increase the memory allocated to Docker in the Docker settings."
         )
 
+    result = client.containers.run(
+        "alpine:latest",
+        "sh -c \"df -B1 / | tail -1 | awk '{print $2, $4}'\"",  # total, available
+        remove=True,
+        stdout=True,
+        stderr=True,
+    )
+
+    output = result.decode("utf-8").strip()
+    total_bytes, available_bytes = map(int, output.split())
+
+    available_gb = available_bytes / (1024**3)
+    if available_gb < MIN_DISK_SPACE_NEEDED:
+        raise DockerLowDiskSpaceError(
+            f"Total Docker disk space available {available_gb:.2f}GB is below the minimum threshold {MIN_DISK_SPACE_NEEDED}GB. "
+            "You can increase the disk space allocated to Docker in the Docker settings or free up disk space."
+        )
+
 
 class ContainerStatus(enum.Enum):
     OK = "is ok"
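
This hunk adds a disk-space preflight alongside the existing memory check: it runs a throwaway `alpine:latest` container so that `df` reports the Docker daemon's root filesystem (e.g. the Docker Desktop VM, not the host), and raises the new `DockerLowDiskSpaceError` when less than `MIN_DISK_SPACE_NEEDED` (12 GB) is available. A standalone sketch of the same measurement, assuming docker-py is installed and the daemon can pull `alpine:latest`:

import docker

MIN_DISK_SPACE_NEEDED = 12  # GB, the threshold used by the preflight check

def docker_available_disk_gb(client: docker.DockerClient) -> float:
    # Run a one-shot container; with remove=True and no detach, containers.run()
    # returns the container's stdout as bytes.
    raw = client.containers.run(
        "alpine:latest",
        "sh -c \"df -B1 / | tail -1 | awk '{print $4}'\"",  # available bytes only
        remove=True,
    )
    return int(raw.decode("utf-8").strip()) / (1024**3)

if __name__ == "__main__":
    free_gb = docker_available_disk_gb(docker.from_env())
    print(f"Docker has {free_gb:.2f}GB free (minimum {MIN_DISK_SPACE_NEEDED}GB)")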
@@ -126,10 +150,24 @@ class DockerContainerStatus:
 @dataclass
 class QuickstartStatus:
     containers: List[DockerContainerStatus]
+    volumes: Set[str]
+    # On moving to compose profiles, this CLI will no longer support running quickstart instances from earlier versions.
+    # While the check command can work, upgrades or
+    running_unsupported_version: bool
+
+    def __init__(
+        self,
+        containers: List[DockerContainerStatus],
+        volumes: List[str],
+        running_unsupported_version: bool = False,
+    ):
+        self.containers = containers
+        self.running_unsupported_version = running_unsupported_version
+        self.volumes = set(volumes)
 
     def errors(self) -> List[str]:
         if not self.containers:
-            return ["quickstart.sh or dev.sh is not running"]
+            return ["datahub is not running"]
 
         return [
             f"{container.name} {container.status.value}"
@@ -176,6 +214,26 @@
             },
         )
 
+    def get_containers(self) -> Set[str]:
+        if self.containers:
+            return {container.name for container in self.containers}
+        else:
+            return set()
+
+
+def detect_legacy_quickstart_compose(containers: Set[str]) -> bool:
+    return "zookeeper" in containers
+
+
+def _get_services_from_compose(compose_file: str) -> Set[str]:
+    with open(compose_file) as config_file:
+        return yaml.safe_load(config_file).get("services", {}).keys()
+
+
+def _get_volumes_from_compose(compose_file: str) -> Set[str]:
+    with open(compose_file) as config_file:
+        return yaml.safe_load(config_file).get("volumes", {}).keys()
+
 
 def check_docker_quickstart() -> QuickstartStatus:
     container_statuses: List[DockerContainerStatus] = []
@@ -188,7 +246,7 @@ def check_docker_quickstart() -> QuickstartStatus:
         ignore_removed=True,
     )
     if len(containers) == 0:
-        return QuickstartStatus([])
+        return QuickstartStatus([], [], running_unsupported_version=False)
 
     # load the expected containers from the docker-compose file
     config_files = (
@@ -197,16 +255,17 @@
         .split(",")
     )
 
-    # If using profiles, alternative check
+    # If using profiles, alternative check  ##TODO: Does this really work? Check mixpanel for usage of this.
     if config_files and "/profiles/" in config_files[0]:
        return check_docker_quickstart_profiles(client)
 
     all_containers = set()
     for config_file in config_files:
-        with open(config_file) as config_file:
-            all_containers.update(
-                yaml.safe_load(config_file).get("services", {}).keys()
-            )
+        all_containers.update(_get_services_from_compose(config_file))
+
+    all_volumes = set()
+    for config_file in config_files:
+        all_volumes.update(_get_volumes_from_compose(config_file))
 
     existing_containers = set()
     # Check that the containers are running and healthy.
@@ -240,8 +299,12 @@
         container_statuses.append(
             DockerContainerStatus(missing, ContainerStatus.MISSING)
         )
-
-    return QuickstartStatus(container_statuses)
+    running_unsupported_version = detect_legacy_quickstart_compose(all_containers)
+    return QuickstartStatus(
+        containers=container_statuses,
+        volumes=list(all_volumes),
+        running_unsupported_version=running_unsupported_version,
+    )
 
 
 def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
@@ -254,7 +317,7 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
         ignore_removed=True,
     )
     if len(containers) == 0:
-        return QuickstartStatus([])
+        return QuickstartStatus([], [], running_unsupported_version=False)
 
     existing_containers = set()
     # Check that the containers are running and healthy.
@@ -273,4 +336,36 @@
 
     container_statuses.append(DockerContainerStatus(name, status))
 
-    return QuickstartStatus(container_statuses)
+    # TODO: Can this be handled with older versions?
+    return QuickstartStatus(
+        container_statuses, volumes=[], running_unsupported_version=False
+    )
+
+
+def check_upgrade_supported(
+    quickstart_compose_file: List[pathlib.Path], quickstart_status: QuickstartStatus
+) -> bool:
+    if (
+        quickstart_status.running_unsupported_version
+    ):  # we detected a legacy quickstart service
+        return False
+
+    if not quickstart_status.get_containers():  # no containers are running
+        return True
+
+    compose_services = set()
+    compose_volumes = set()
+
+    for compose_file in quickstart_compose_file:
+        compose_services.update(_get_services_from_compose(str(compose_file)))
+        compose_volumes.update(_get_volumes_from_compose(str(compose_file)))
+
+    # If the services and volumes are not the same, the state in the volumes may not be compatible with the new services.
+    # We are checking for containers and volumes per the compose file, not necessarily all of them being present.
+    if (
+        compose_services == quickstart_status.get_containers()
+        and compose_volumes == quickstart_status.volumes
+    ):
+        return True
+    else:
+        return False
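
Taken together, `QuickstartStatus.get_containers`, the compose-file helpers, and `check_upgrade_supported` implement a simple compatibility gate: a legacy (pre-compose-profiles) deployment is never upgraded in place, an empty deployment always may be, and otherwise the running containers and recorded volumes must exactly match the services and volumes declared by the target compose files, since mismatched volumes may hold state the new services cannot use. A pure-function restatement of that decision logic (the function and parameter names here are illustrative, not from the source):

from typing import Set

def upgrade_supported(
    compose_services: Set[str],
    compose_volumes: Set[str],
    running_containers: Set[str],
    recorded_volumes: Set[str],
    legacy_detected: bool,
) -> bool:
    if legacy_detected:  # e.g. a "zookeeper" service was found
        return False
    if not running_containers:  # nothing running: a fresh start is always fine
        return True
    # Exact match required in both directions; a superset or subset of
    # services/volumes could leave incompatible state behind.
    return (
        compose_services == running_containers
        and compose_volumes == recorded_volumes
    )

For example, `upgrade_supported({"datahub-gms"}, {"esdata"}, {"datahub-gms"}, {"esdata"}, legacy_detected=False)` returns True, while setting `legacy_detected=True` or adding a service on either side returns False.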