acryl-datahub 1.1.0.4rc3__py3-none-any.whl → 1.1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (149)
  1. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2499 -2501
  2. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +149 -131
  3. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/cli/check_cli.py +65 -11
  7. datahub/cli/cli_utils.py +63 -0
  8. datahub/cli/container_cli.py +5 -0
  9. datahub/cli/delete_cli.py +3 -4
  10. datahub/cli/docker_check.py +107 -12
  11. datahub/cli/docker_cli.py +149 -227
  12. datahub/cli/exists_cli.py +0 -2
  13. datahub/cli/get_cli.py +0 -2
  14. datahub/cli/iceberg_cli.py +5 -0
  15. datahub/cli/ingest_cli.py +3 -15
  16. datahub/cli/migrate.py +2 -0
  17. datahub/cli/put_cli.py +1 -4
  18. datahub/cli/quickstart_versioning.py +50 -7
  19. datahub/cli/specific/assertions_cli.py +0 -4
  20. datahub/cli/specific/datacontract_cli.py +0 -3
  21. datahub/cli/specific/dataproduct_cli.py +0 -11
  22. datahub/cli/specific/dataset_cli.py +1 -8
  23. datahub/cli/specific/forms_cli.py +0 -4
  24. datahub/cli/specific/group_cli.py +0 -2
  25. datahub/cli/specific/structuredproperties_cli.py +1 -4
  26. datahub/cli/specific/user_cli.py +0 -2
  27. datahub/cli/state_cli.py +0 -2
  28. datahub/cli/timeline_cli.py +0 -2
  29. datahub/emitter/rest_emitter.py +24 -8
  30. datahub/entrypoints.py +4 -3
  31. datahub/ingestion/api/decorators.py +15 -3
  32. datahub/ingestion/api/report.py +332 -3
  33. datahub/ingestion/api/sink.py +3 -0
  34. datahub/ingestion/api/source.py +47 -45
  35. datahub/ingestion/autogenerated/__init__.py +0 -0
  36. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  37. datahub/ingestion/autogenerated/lineage.json +401 -0
  38. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  39. datahub/ingestion/extractor/schema_util.py +13 -4
  40. datahub/ingestion/graph/client.py +73 -30
  41. datahub/ingestion/run/pipeline.py +54 -2
  42. datahub/ingestion/sink/datahub_rest.py +12 -0
  43. datahub/ingestion/source/abs/source.py +1 -1
  44. datahub/ingestion/source/aws/glue.py +1 -1
  45. datahub/ingestion/source/azure/azure_common.py +2 -2
  46. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  47. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  48. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  49. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  50. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  51. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  52. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  53. datahub/ingestion/source/common/subtypes.py +45 -0
  54. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  55. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  56. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  57. datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
  58. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  59. datahub/ingestion/source/dremio/dremio_api.py +38 -27
  60. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  61. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  62. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  63. datahub/ingestion/source/ge_data_profiler.py +28 -20
  64. datahub/ingestion/source/hex/api.py +26 -1
  65. datahub/ingestion/source/identity/azure_ad.py +1 -1
  66. datahub/ingestion/source/identity/okta.py +1 -14
  67. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  68. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  69. datahub/ingestion/source/mlflow.py +11 -1
  70. datahub/ingestion/source/mock_data/__init__.py +0 -0
  71. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  72. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  73. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  74. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  75. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  76. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  77. datahub/ingestion/source/preset.py +2 -2
  78. datahub/ingestion/source/redshift/usage.py +4 -3
  79. datahub/ingestion/source/s3/report.py +4 -2
  80. datahub/ingestion/source/s3/source.py +367 -115
  81. datahub/ingestion/source/salesforce.py +6 -3
  82. datahub/ingestion/source/sigma/sigma.py +6 -1
  83. datahub/ingestion/source/slack/slack.py +2 -1
  84. datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
  85. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  86. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  87. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
  88. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  89. datahub/ingestion/source/sql/athena.py +119 -12
  90. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  91. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  92. datahub/ingestion/source/sql/mssql/source.py +24 -15
  93. datahub/ingestion/source/sql/oracle.py +1 -1
  94. datahub/ingestion/source/sql/sql_common.py +11 -0
  95. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  96. datahub/ingestion/source/sql/teradata.py +997 -235
  97. datahub/ingestion/source/sql/vertica.py +10 -6
  98. datahub/ingestion/source/sql_queries.py +2 -2
  99. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  100. datahub/ingestion/source/superset.py +57 -2
  101. datahub/ingestion/source/tableau/tableau.py +57 -37
  102. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  103. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  104. datahub/ingestion/source/unity/proxy.py +4 -3
  105. datahub/ingestion/source/unity/source.py +56 -30
  106. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  107. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  108. datahub/metadata/_internal_schema_classes.py +1253 -536
  109. datahub/metadata/_urns/urn_defs.py +1797 -1685
  110. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  111. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  112. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  113. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  114. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  115. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  116. datahub/metadata/schema.avsc +16614 -16538
  117. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  118. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  119. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  120. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  121. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  122. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  123. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  124. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  125. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  126. datahub/metadata/schemas/DatasetKey.avsc +4 -1
  127. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  128. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  129. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  130. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  131. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  132. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  133. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  134. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  135. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  136. datahub/sdk/datajob.py +39 -15
  137. datahub/sdk/lineage_client.py +2 -0
  138. datahub/sdk/main_client.py +14 -2
  139. datahub/sdk/search_client.py +4 -3
  140. datahub/specific/dataproduct.py +4 -0
  141. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  142. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  143. datahub/telemetry/telemetry.py +17 -11
  144. datahub/upgrade/upgrade.py +46 -13
  145. datahub/utilities/server_config_util.py +8 -0
  146. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  147. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
  148. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
  149. {acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.1.0.4rc3.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt CHANGED
@@ -39,6 +39,7 @@ datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:
  datahub-debug = datahub.ingestion.source.debug.datahub_debug:DataHubDebugSource
  datahub-gc = datahub.ingestion.source.gc.datahub_gc:DataHubGcSource
  datahub-lineage-file = datahub.ingestion.source.metadata.lineage:LineageFileSource
+ datahub-mock-data = datahub.ingestion.source.mock_data.datahub_mock_data:DataHubMockDataSource
  dbt = datahub.ingestion.source.dbt.dbt_core:DBTCoreSource
  dbt-cloud = datahub.ingestion.source.dbt.dbt_cloud:DBTCloudSource
  delta-lake = datahub.ingestion.source.delta_lake:DeltaLakeSource
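
The new entry point registers the mock-data source so it can be referenced by name in an ingestion recipe. As a minimal sketch of how that mapping resolves (the plugin group name below is an assumption about how acryl-datahub registers sources; confirm it against entry_points.txt in the wheel):

# Sketch only: resolve the new plugin by its entry-point name.
from importlib.metadata import entry_points

eps = entry_points(group="datahub.ingestion.source.plugins")  # assumed group name
mock_ep = next(ep for ep in eps if ep.name == "datahub-mock-data")
source_cls = mock_ep.load()  # resolves to DataHubMockDataSource
print(f"{source_cls.__module__}.{source_cls.__name__}")
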
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.1.0.4rc3"
+ __version__ = "1.1.0.5"
 
 
  def is_dev_mode() -> bool:
datahub/api/entities/dataset/dataset.py CHANGED
@@ -383,7 +383,7 @@ class Dataset(StrictModel):
      urn: Optional[str] = None
      description: Optional[str] = None
      name: Optional[str] = None
-     schema_metadata: Optional[SchemaSpecification] = Field(alias="schema")
+     schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
      downstreams: Optional[List[str]] = None
      properties: Optional[Dict[str, str]] = None
      subtype: Optional[str] = None
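
This one-line change matters because a Pydantic field declared with Field(...) and no explicit default is treated as required, even when its annotation is Optional. A minimal sketch with a hypothetical model (not DataHub's) illustrating the behavior:

# Hypothetical model: shows why default=None is needed alongside alias.
from typing import Optional
from pydantic import BaseModel, Field

class Example(BaseModel):
    schema_metadata: Optional[str] = Field(default=None, alias="schema")

Example()             # valid: schema_metadata falls back to None
Example(schema="x")   # populated through the "schema" alias
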
datahub/cli/check_cli.py CHANGED
@@ -9,6 +9,7 @@ from datetime import datetime
  from typing import Any, Dict, List, Optional, Union
 
  import click
+ from tabulate import tabulate
 
  from datahub._version import __package_name__
  from datahub.cli.json_file import check_mce_file
@@ -21,7 +22,7 @@ from datahub.ingestion.run.pipeline import Pipeline
  from datahub.ingestion.sink.sink_registry import sink_registry
  from datahub.ingestion.source.source_registry import source_registry
  from datahub.ingestion.transformer.transform_registry import transform_registry
- from datahub.telemetry import telemetry
+ from datahub.upgrade import upgrade
  from datahub.utilities.file_backed_collections import (
      ConnectionWrapper,
      FileBackedDict,
@@ -47,7 +48,6 @@ def check() -> None:
  @click.option(
      "--unpack-mces", default=False, is_flag=True, help="Converts MCEs into MCPs"
  )
- @telemetry.with_telemetry()
  def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None:
      """Check the schema of a metadata (MCE or MCP) JSON file."""
 
@@ -105,7 +105,6 @@ def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None:
      default=(),
      help="[Advanced] Paths in the deepdiff object to ignore",
  )
- @telemetry.with_telemetry()
  def metadata_diff(
      actual_file: str, expected_file: str, verbose: bool, ignore_path: List[str]
  ) -> None:
@@ -142,7 +141,6 @@ def metadata_diff(
      type=str,
      default=None,
  )
- @telemetry.with_telemetry()
  def plugins(source: Optional[str], verbose: bool) -> None:
      """List the enabled ingestion plugins."""
 
@@ -234,7 +232,7 @@ def sql_format(sql: str, platform: str) -> None:
      default=True,
      help="Run in offline mode and disable schema-aware parsing.",
  )
- @telemetry.with_telemetry()
+ @upgrade.check_upgrade
  def sql_lineage(
      sql: Optional[str],
      sql_file: Optional[str],
@@ -297,7 +295,6 @@ def sql_lineage(
      type=str,
      help="the input to validate",
  )
- @telemetry.with_telemetry()
  def test_allow_deny(config: str, input: str, pattern_key: str) -> None:
      """Test input string against AllowDeny pattern in a DataHub recipe.
 
@@ -346,7 +343,6 @@ def test_allow_deny(config: str, input: str, pattern_key: str) -> None:
      type=str,
      help="The input to validate",
  )
- @telemetry.with_telemetry()
  def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
      """Test input path string against PathSpec patterns in a DataHub recipe.
 
@@ -471,6 +467,7 @@ WHERE
 
 
  @check.command()
+ @upgrade.check_upgrade
  def server_config() -> None:
      """Print the server config."""
      graph = get_default_graph(ClientMode.CLI)
@@ -482,26 +479,83 @@ def server_config() -> None:
 
  @check.command()
  @click.option(
-     "--urn", required=True, help="The urn or urn pattern (supports % for wildcard)"
+     "--urn", required=False, help="The urn or urn pattern (supports % for wildcard)"
  )
  @click.option("--aspect", default=None, help="Filter to a specific aspect name.")
  @click.option(
      "--start", type=int, default=None, help="Row number of sql store to restore from."
  )
  @click.option("--batch-size", type=int, default=None, help="How many rows to restore.")
+ @click.option(
+     "--file",
+     required=False,
+     type=click.Path(exists=True, dir_okay=True, readable=True),
+     help="File absolute path containing URNs (one per line) to restore indices",
+ )
+ @upgrade.check_upgrade
  def restore_indices(
-     urn: str,
+     urn: Optional[str],
      aspect: Optional[str],
      start: Optional[int],
      batch_size: Optional[int],
+     file: Optional[str],
  ) -> None:
      """Resync metadata changes into the search and graph indices."""
+     if urn is None and file is None:
+         raise click.UsageError("Either --urn or --file must be provided")
      graph = get_default_graph(ClientMode.CLI)
 
-     result = graph.restore_indices(
+     graph.restore_indices(
          urn_pattern=urn,
          aspect=aspect,
          start=start,
          batch_size=batch_size,
+         file=file,
      )
-     click.echo(result)
+
+
+ @check.command()
+ @upgrade.check_upgrade
+ def get_kafka_consumer_offsets() -> None:
+     """Get Kafka consumer offsets from the DataHub API."""
+     graph = get_default_graph(ClientMode.CLI)
+     result = graph.get_kafka_consumer_offsets()
+
+     table_data = []
+     headers = [
+         "Topic",
+         "Consumer Group",
+         "Schema",
+         "Partition",
+         "Offset",
+         "Lag",
+         "Avg Lag",
+         "Max Lag",
+         "Total Lag",
+     ]
+
+     for topic, consumers in result.items():
+         for consumer_group, schemas in consumers.items():
+             for schema, data in schemas.items():
+                 metrics = data.get("metrics", {})
+                 partitions = data.get("partitions", {})
+
+                 for partition, partition_data in partitions.items():
+                     table_data.append(
+                         [
+                             topic,
+                             consumer_group,
+                             schema,
+                             partition,
+                             partition_data.get("offset", "N/A"),
+                             partition_data.get("lag", "N/A"),
+                             metrics.get("avgLag", "N/A"),
+                             metrics.get("maxLag", "N/A"),
+                             metrics.get("totalLag", "N/A"),
+                         ]
+                     )
+
+     if table_data:
+         click.echo(tabulate(table_data, headers=headers, tablefmt="grid"))
+     else:
+         click.echo("No Kafka consumer offset data found.")
datahub/cli/cli_utils.py CHANGED
@@ -3,6 +3,7 @@ import logging
  import time
  import typing
  from datetime import datetime
+ from functools import wraps
  from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
 
  import click
@@ -424,3 +425,65 @@ def ensure_has_system_metadata(
      props = metadata.properties
      props["clientId"] = datahub_version.__package_name__
      props["clientVersion"] = datahub_version.__version__
+
+
+ def enable_auto_decorators(main_group: click.Group) -> None:
+     """
+     Enable automatic decorators for all click commands.
+     This wraps existing command callback functions to add upgrade and telemetry decorators.
+     """
+
+     def has_decorator(func: Any, module_pattern: str, function_pattern: str) -> bool:
+         """Check if function already has a specific decorator"""
+         if hasattr(func, "__wrapped__"):
+             current_func = func
+             while hasattr(current_func, "__wrapped__"):
+                 # Check if this wrapper matches the module and function patterns
+                 if (
+                     hasattr(current_func, "__module__")
+                     and module_pattern in current_func.__module__
+                     and hasattr(current_func, "__name__")
+                     and function_pattern in current_func.__name__
+                 ):
+                     return True
+                 current_func = current_func.__wrapped__
+         return False
+
+     def has_telemetry_decorator(func):
+         return has_decorator(func, "telemetry", "with_telemetry")
+
+     def wrap_command_callback(command_obj):
+         """Wrap a command's callback function to add decorators"""
+         if hasattr(command_obj, "callback") and command_obj.callback:
+             original_callback = command_obj.callback
+
+             # Import here to avoid circular imports
+             from datahub.telemetry import telemetry
+
+             decorated_callback = original_callback
+
+             if not has_telemetry_decorator(decorated_callback):
+                 log.debug(
+                     f"Applying telemetry decorator to {original_callback.__module__}.{original_callback.__name__}"
+                 )
+                 decorated_callback = telemetry.with_telemetry()(decorated_callback)
+
+             # Preserve the original function's metadata
+             decorated_callback = wraps(original_callback)(decorated_callback)
+
+             command_obj.callback = decorated_callback
+
+     def wrap_group_commands(group_obj):
+         """Recursively wrap all commands in a group"""
+         if hasattr(group_obj, "commands"):
+             for _, command_obj in group_obj.commands.items():
+                 if isinstance(command_obj, click.Group):
+                     # Recursively wrap sub-groups
+                     wrap_group_commands(command_obj)
+                 else:
+                     # Wrap individual commands
+                     wrap_command_callback(command_obj)
+
+     wrap_group_commands(main_group)
+
+     log.debug("Auto-decorators enabled successfully")
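
The detection logic above relies on the __wrapped__ chain that functools.wraps maintains, which is also why the helper re-applies wraps(original_callback) after decorating. A small standalone sketch of that mechanism (a toy decorator, not DataHub code):

# Toy example: functools.wraps preserves metadata and records the wrapped callable.
import functools

def my_decorator(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper

@my_decorator
def cmd():
    """original docstring"""

print(cmd.__name__)                 # 'cmd' -- metadata copied by wraps
print(cmd.__doc__)                  # 'original docstring'
print(hasattr(cmd, "__wrapped__"))  # True -- the wrapper chain can be walked
print(cmd.__wrapped__ is cmd)       # False -- points at the undecorated function
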
datahub/cli/container_cli.py CHANGED
@@ -3,6 +3,7 @@ import logging
  import click
 
  from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container
+ from datahub.upgrade import upgrade
 
  logger = logging.getLogger(__name__)
 
@@ -16,6 +17,7 @@ def container() -> None:
  @container.command()
  @click.option("--container-urn", required=True, type=str)
  @click.option("--tag-urn", required=True, type=str)
+ @upgrade.check_upgrade
  def tag(container_urn: str, tag_urn: str) -> None:
      """Add patch to add a tag to all datasets in a container"""
      apply_association_to_container(container_urn, tag_urn, "tag")
@@ -24,6 +26,7 @@ def tag(container_urn: str, tag_urn: str) -> None:
  @container.command()
  @click.option("--container-urn", required=True, type=str)
  @click.option("--term-urn", required=True, type=str)
+ @upgrade.check_upgrade
  def term(container_urn: str, term_urn: str) -> None:
      """Add patch to add a term to all datasets in a container"""
      apply_association_to_container(container_urn, term_urn, "term")
@@ -32,6 +35,7 @@ def term(container_urn: str, term_urn: str) -> None:
  @container.command()
  @click.option("--container-urn", required=True, type=str)
  @click.option("--owner-urn", required=True, type=str)
+ @upgrade.check_upgrade
  def owner(container_urn: str, owner_urn: str) -> None:
      """Add patch to add a owner to all datasets in a container"""
      apply_association_to_container(container_urn, owner_urn, "owner")
@@ -40,6 +44,7 @@ def owner(container_urn: str, owner_urn: str) -> None:
  @container.command()
  @click.option("--container-urn", required=True, type=str)
  @click.option("--domain-urn", required=True, type=str)
+ @upgrade.check_upgrade
  def domain(container_urn: str, domain_urn: str) -> None:
      """Add patch to add a domain to all datasets in a container"""
      apply_association_to_container(container_urn, domain_urn, "domain")
datahub/cli/delete_cli.py CHANGED
@@ -18,7 +18,6 @@ from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.ingestion.graph.config import ClientMode
  from datahub.ingestion.graph.filters import RemovedStatusFilter
- from datahub.telemetry import telemetry
  from datahub.upgrade import upgrade
  from datahub.utilities.perf_timer import PerfTimer
  from datahub.utilities.urns.urn import guess_entity_type
@@ -116,7 +115,7 @@ class DeletionResult:
      help="specifies soft/hard deletion",
  )
  @click.option("-n", "--dry-run", required=False, is_flag=True)
- @telemetry.with_telemetry()
+ @upgrade.check_upgrade
  def by_registry(
      registry_id: str,
      soft: bool,
@@ -171,7 +170,7 @@ def by_registry(
  @click.option(
      "-f", "--force", required=False, is_flag=True, help="force the delete if set"
  )
- @telemetry.with_telemetry()
+ @upgrade.check_upgrade
  def references(urn: str, dry_run: bool, force: bool) -> None:
      """
      Delete all references to an entity (but not the entity itself).
@@ -234,6 +233,7 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
      help="Batch size when querying for entities to un-soft delete."
      "Maximum 5000. Large batch sizes may cause timeouts.",
  )
+ @upgrade.check_upgrade
  def undo_by_filter(
      urn: Optional[str], platform: Optional[str], batch_size: int
  ) -> None:
@@ -370,7 +370,6 @@ def undo_by_filter(
      "--workers", type=int, default=1, help="Num of workers to use for deletion."
  )
  @upgrade.check_upgrade
- @telemetry.with_telemetry()
  def by_filter(
      urn: Optional[str],
      urn_file: Optional[str],
datahub/cli/docker_check.py CHANGED
@@ -1,8 +1,9 @@
  import enum
  import os
+ import pathlib
  from contextlib import contextmanager
  from dataclasses import dataclass
- from typing import Any, Dict, Iterator, List, Optional
+ from typing import Any, Dict, Iterator, List, Optional, Set
 
  import docker
  import docker.errors
@@ -13,6 +14,7 @@ from datahub.configuration.common import ExceptionWithProps
 
  # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
  MIN_MEMORY_NEEDED = 3.8  # GB
+ MIN_DISK_SPACE_NEEDED = 12  # GB
 
  DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
  DATAHUB_COMPOSE_PROJECT_FILTER = {
@@ -37,6 +39,10 @@ class DockerLowMemoryError(Exception):
      SHOW_STACK_TRACE = False
 
 
+ class DockerLowDiskSpaceError(Exception):
+     SHOW_STACK_TRACE = False
+
+
  class DockerComposeVersionError(Exception):
      SHOW_STACK_TRACE = False
 
@@ -102,6 +108,24 @@ def run_quickstart_preflight_checks(client: docker.DockerClient) -> None:
              "You can increase the memory allocated to Docker in the Docker settings."
          )
 
+     result = client.containers.run(
+         "alpine:latest",
+         "sh -c \"df -B1 / | tail -1 | awk '{print $2, $4}'\"",  # total, available
+         remove=True,
+         stdout=True,
+         stderr=True,
+     )
+
+     output = result.decode("utf-8").strip()
+     total_bytes, available_bytes = map(int, output.split())
+
+     available_gb = available_bytes / (1024**3)
+     if available_gb < MIN_DISK_SPACE_NEEDED:
+         raise DockerLowDiskSpaceError(
+             f"Total Docker disk space available {available_gb:.2f}GB is below the minimum threshold {MIN_DISK_SPACE_NEEDED}GB. "
+             "You can increase the disk space allocated to Docker in the Docker settings or free up disk space`"
+         )
+
 
  class ContainerStatus(enum.Enum):
      OK = "is ok"
@@ -126,10 +150,24 @@ class DockerContainerStatus:
  @dataclass
  class QuickstartStatus:
      containers: List[DockerContainerStatus]
+     volumes: Set[str]
+     # On moving to compose profiles, this CLI will no longer support running quickstart instances from earlier versions.
+     # While the check command can work, upgrades or
+     running_unsupported_version: bool
+
+     def __init__(
+         self,
+         containers: List[DockerContainerStatus],
+         volumes: List[str],
+         running_unsupported_version: bool = False,
+     ):
+         self.containers = containers
+         self.running_unsupported_version = running_unsupported_version
+         self.volumes = set(volumes)
 
      def errors(self) -> List[str]:
          if not self.containers:
-             return ["quickstart.sh or dev.sh is not running"]
+             return ["datahub is not running"]
 
          return [
              f"{container.name} {container.status.value}"
@@ -176,6 +214,26 @@ class QuickstartStatus:
              },
          )
 
+     def get_containers(self) -> Set[str]:
+         if self.containers:
+             return {container.name for container in self.containers}
+         else:
+             return set()
+
+
+ def detect_legacy_quickstart_compose(containers: Set[str]) -> bool:
+     return "zookeeper" in containers
+
+
+ def _get_services_from_compose(compose_file: str) -> Set[str]:
+     with open(compose_file) as config_file:
+         return yaml.safe_load(config_file).get("services", {}).keys()
+
+
+ def _get_volumes_from_compose(compose_file: str) -> Set[str]:
+     with open(compose_file) as config_file:
+         return yaml.safe_load(config_file).get("volumes", {}).keys()
+
 
  def check_docker_quickstart() -> QuickstartStatus:
      container_statuses: List[DockerContainerStatus] = []
@@ -188,7 +246,7 @@ def check_docker_quickstart() -> QuickstartStatus:
          ignore_removed=True,
      )
      if len(containers) == 0:
-         return QuickstartStatus([])
+         return QuickstartStatus([], [], running_unsupported_version=False)
 
      # load the expected containers from the docker-compose file
      config_files = (
@@ -197,16 +255,17 @@ def check_docker_quickstart() -> QuickstartStatus:
          .split(",")
      )
 
-     # If using profiles, alternative check
+     # If using profiles, alternative check ##TODO: Does this really work? Check mixpanel for usage of this.
      if config_files and "/profiles/" in config_files[0]:
          return check_docker_quickstart_profiles(client)
 
      all_containers = set()
      for config_file in config_files:
-         with open(config_file) as config_file:
-             all_containers.update(
-                 yaml.safe_load(config_file).get("services", {}).keys()
-             )
+         all_containers.update(_get_services_from_compose(config_file))
+
+     all_volumes = set()
+     for config_file in config_files:
+         all_volumes.update(_get_volumes_from_compose(config_file))
 
      existing_containers = set()
      # Check that the containers are running and healthy.
@@ -240,8 +299,12 @@ def check_docker_quickstart() -> QuickstartStatus:
          container_statuses.append(
              DockerContainerStatus(missing, ContainerStatus.MISSING)
          )
-
-     return QuickstartStatus(container_statuses)
+     running_unsupported_version = detect_legacy_quickstart_compose(all_containers)
+     return QuickstartStatus(
+         containers=container_statuses,
+         volumes=list(all_volumes),
+         running_unsupported_version=running_unsupported_version,
+     )
 
 
  def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
@@ -254,7 +317,7 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
          ignore_removed=True,
      )
      if len(containers) == 0:
-         return QuickstartStatus([])
+         return QuickstartStatus([], [], running_unsupported_version=False)
 
      existing_containers = set()
      # Check that the containers are running and healthy.
@@ -273,4 +336,36 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
 
      container_statuses.append(DockerContainerStatus(name, status))
 
-     return QuickstartStatus(container_statuses)
+     # TODO: Can this be handled with older verions?
+     return QuickstartStatus(
+         container_statuses, volumes=[], running_unsupported_version=False
+     )
+
+
+ def check_upgrade_supported(
+     quickstart_compose_file: List[pathlib.Path], quickstart_status: QuickstartStatus
+ ) -> bool:
+     if (
+         quickstart_status.running_unsupported_version
+     ):  # we detected a legacy quickstart service
+         return False
+
+     if not quickstart_status.get_containers():  # no containers are running
+         return True
+
+     compose_services = set()
+     compose_volumes = set()
+
+     for compose_file in quickstart_compose_file:
+         compose_services.update(_get_services_from_compose(str(compose_file)))
+         compose_volumes.update(_get_volumes_from_compose(str(compose_file)))
+
+     # if all services and volumes are not the same, the state in the volumes may not be compatible with the new services.
+     # We are checking for containers and volumes per the compose file, not necessarily all of them being present
+     if (
+         compose_services == quickstart_status.get_containers()
+         and compose_volumes == quickstart_status.volumes
+     ):
+         return True
+     else:
+         return False
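
The disk-space preflight check reads df -B1 / from a throwaway alpine container, so the numbers reflect the filesystem Docker itself writes to; columns 2 and 4 of the last df line are total and available bytes. A standalone sketch of the same arithmetic with made-up byte counts:

# Sample values only; real output comes from the helper container's df command.
MIN_DISK_SPACE_NEEDED = 12  # GB, mirroring the constant added in this release

sample_df_output = "1081101176832 215822483456"  # total, available (made up)
total_bytes, available_bytes = map(int, sample_df_output.split())
available_gb = available_bytes / (1024 ** 3)

print(f"{available_gb:.2f} GB available")        # ~201.00 GB for the sample values
print(available_gb < MIN_DISK_SPACE_NEEDED)      # False -> preflight check passes
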