acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1433 -546
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17736 -17112
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py
CHANGED

@@ -1,5 +1,6 @@
 import logging
 import random
+import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
@@ -17,7 +18,6 @@ from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.graph.filters import RemovedStatusFilter
-from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.urns.urn import guess_entity_type
@@ -115,7 +115,7 @@ class DeletionResult:
     help="specifies soft/hard deletion",
 )
 @click.option("-n", "--dry-run", required=False, is_flag=True)
-@
+@upgrade.check_upgrade
 def by_registry(
     registry_id: str,
     soft: bool,
@@ -170,7 +170,7 @@ def by_registry(
 @click.option(
     "-f", "--force", required=False, is_flag=True, help="force the delete if set"
 )
-@
+@upgrade.check_upgrade
 def references(urn: str, dry_run: bool, force: bool) -> None:
     """
     Delete all references to an entity (but not the entity itself).
@@ -231,8 +231,9 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
     default=3000,
     type=int,
     help="Batch size when querying for entities to un-soft delete."
-    "Maximum
+    "Maximum 5000. Large batch sizes may cause timeouts.",
 )
+@upgrade.check_upgrade
 def undo_by_filter(
     urn: Optional[str], platform: Optional[str], batch_size: int
 ) -> None:
@@ -317,6 +318,19 @@ def undo_by_filter(
     is_flag=True,
     help="Recursively delete all contained entities (only for containers and dataPlatformInstances)",
 )
+@click.option(
+    "--streaming-batch",
+    required=False,
+    is_flag=True,
+    help="Use streaming batch deletion for recursive operations. Benefit of being resumable for large hierarchies where getting all URNs at once can take a long time.",
+)
+@click.option(
+    "--streaming-batch-size",
+    required=False,
+    default=12000,
+    type=int,
+    help="Batch size for streaming batch deletion for recursive operations.",
+)
 @click.option(
     "--start-time",
     required=False,
@@ -336,7 +350,7 @@ def undo_by_filter(
     default=3000,
     type=int,
     help="Batch size when querying for entities to delete."
-    "Maximum
+    "Maximum 5000. Large batch sizes may cause timeouts.",
 )
 @click.option(
     "-n",
@@ -356,7 +370,6 @@ def undo_by_filter(
     "--workers", type=int, default=1, help="Num of workers to use for deletion."
 )
 @upgrade.check_upgrade
-@telemetry.with_telemetry()
 def by_filter(
     urn: Optional[str],
     urn_file: Optional[str],
@@ -368,6 +381,8 @@ def by_filter(
     entity_type: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
+    streaming_batch_size: int,
     start_time: Optional[datetime],
     end_time: Optional[datetime],
     batch_size: int,
@@ -386,6 +401,7 @@ def by_filter(
         env=env,
         query=query,
         recursive=recursive,
+        streaming_batch=streaming_batch,
     )
     soft_delete_filter = _validate_user_soft_delete_flags(
         soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted
@@ -417,26 +433,27 @@ def by_filter(
     # Determine which urns to delete.
     delete_by_urn = bool(urn) and not recursive
     if urn:
-        urns = [urn]
-
         if recursive:
-            … (17 removed lines are not captured in this view)
+            _delete_urns_streaming_recursive(
+                graph=graph,
+                parent_urn=urn,
+                aspect_name=aspect,
+                soft=soft,
+                dry_run=dry_run,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+                soft_delete_filter=soft_delete_filter,
+                batch_size=batch_size,
+                force=force,
+                streaming_batch_size=streaming_batch_size
+                if streaming_batch
+                else sys.maxsize,
+            )
+            return
+
+        else:
+            urns = [urn]
     elif urn_file:
         with open(urn_file, "r") as r:
             urns = []
@@ -557,6 +574,7 @@ def _validate_user_urn_and_filters(
     env: Optional[str],
     query: Optional[str],
     recursive: bool,
+    streaming_batch: bool,
 ) -> None:
     # Check urn / filters options.
     if urn:
@@ -592,6 +610,12 @@ def _validate_user_urn_and_filters(
             f"This will only delete {urn}. Use --recursive to delete all contained entities."
         )
 
+    # Check streaming flag.
+    if streaming_batch and not recursive:
+        raise click.UsageError(
+            "The --streaming-batch flag can only be used with --recursive."
+        )
+
 
 def _validate_user_soft_delete_flags(
     soft: bool, aspect: Optional[str], only_soft_deleted: bool
@@ -654,8 +678,8 @@ def _validate_user_aspect_flags(
 def _validate_batch_size(batch_size: int) -> None:
     if batch_size <= 0:
         raise click.UsageError("Batch size must be a positive integer.")
-    elif batch_size >
-        raise click.UsageError("Batch size cannot exceed
+    elif batch_size > 5000:
+        raise click.UsageError("Batch size cannot exceed 5,000.")
 
 
 def _delete_one_urn(
@@ -738,3 +762,76 @@ def _delete_one_urn(
         num_timeseries_records=ts_rows_affected,
         num_referenced_entities=referenced_entities_affected,
     )
+
+
+def _delete_urns_streaming_recursive(
+    graph: DataHubGraph,
+    parent_urn: str,
+    aspect_name: Optional[str],
+    soft: bool,
+    dry_run: bool,
+    start_time: Optional[datetime],
+    end_time: Optional[datetime],
+    workers: int,
+    soft_delete_filter: RemovedStatusFilter,
+    batch_size: int,
+    force: bool,
+    streaming_batch_size: int,
+) -> None:
+    """Streaming recursive batch deletion that processes URNs in batches."""
+
+    entity_type = guess_entity_type(parent_urn)
+    click.echo(f"Starting recursive deletion of {entity_type} {parent_urn}")
+
+    if not force and not dry_run:
+        click.confirm(
+            f"This will recursively delete {parent_urn} and all its contained entities. Do you want to continue?",
+            abort=True,
+        )
+
+    urns = []
+
+    if entity_type == "dataPlatformInstance":
+        child_urns_iter = graph.get_urns_by_filter(
+            platform_instance=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+    else:
+        child_urns_iter = graph.get_urns_by_filter(
+            container=parent_urn,
+            status=soft_delete_filter,
+            batch_size=batch_size,
+            # Important to skip cache so we can resume from where we left off.
+            skip_cache=True,
+        )
+
+    for child_urn in child_urns_iter:
+        urns.append(child_urn)
+        if len(urns) >= streaming_batch_size:
+            _delete_urns_parallel(
+                graph=graph,
+                urns=urns,
+                aspect_name=aspect_name,
+                soft=soft,
+                dry_run=dry_run,
+                delete_by_urn=False,
+                start_time=start_time,
+                end_time=end_time,
+                workers=workers,
+            )
+            urns = []
+    urns.append(parent_urn)
+    _delete_urns_parallel(
+        graph=graph,
+        urns=urns,
+        aspect_name=aspect_name,
+        soft=soft,
+        dry_run=dry_run,
+        delete_by_urn=False,
+        start_time=start_time,
+        end_time=end_time,
+        workers=workers,
+    )
datahub/cli/docker_check.py
CHANGED

@@ -1,8 +1,9 @@
 import enum
 import os
+import pathlib
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Set
 
 import docker
 import docker.errors
@@ -13,6 +14,7 @@ from datahub.configuration.common import ExceptionWithProps
 
 # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
 MIN_MEMORY_NEEDED = 3.8  # GB
+MIN_DISK_SPACE_NEEDED = 12  # GB
 
 DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
 DATAHUB_COMPOSE_PROJECT_FILTER = {
@@ -37,6 +39,10 @@ class DockerLowMemoryError(Exception):
     SHOW_STACK_TRACE = False
 
 
+class DockerLowDiskSpaceError(Exception):
+    SHOW_STACK_TRACE = False
+
+
 class DockerComposeVersionError(Exception):
     SHOW_STACK_TRACE = False
 
@@ -102,6 +108,24 @@ def run_quickstart_preflight_checks(client: docker.DockerClient) -> None:
             "You can increase the memory allocated to Docker in the Docker settings."
         )
 
+    result = client.containers.run(
+        "alpine:latest",
+        "sh -c \"df -B1 / | tail -1 | awk '{print $2, $4}'\"",  # total, available
+        remove=True,
+        stdout=True,
+        stderr=True,
+    )
+
+    output = result.decode("utf-8").strip()
+    total_bytes, available_bytes = map(int, output.split())
+
+    available_gb = available_bytes / (1024**3)
+    if available_gb < MIN_DISK_SPACE_NEEDED:
+        raise DockerLowDiskSpaceError(
+            f"Total Docker disk space available {available_gb:.2f}GB is below the minimum threshold {MIN_DISK_SPACE_NEEDED}GB. "
+            "You can increase the disk space allocated to Docker in the Docker settings or free up disk space`"
+        )
+
 
 class ContainerStatus(enum.Enum):
     OK = "is ok"
@@ -126,10 +150,24 @@ class DockerContainerStatus:
 @dataclass
 class QuickstartStatus:
     containers: List[DockerContainerStatus]
+    volumes: Set[str]
+    # On moving to compose profiles, this CLI will no longer support running quickstart instances from earlier versions.
+    # While the check command can work, upgrades or
+    running_unsupported_version: bool
+
+    def __init__(
+        self,
+        containers: List[DockerContainerStatus],
+        volumes: List[str],
+        running_unsupported_version: bool = False,
+    ):
+        self.containers = containers
+        self.running_unsupported_version = running_unsupported_version
+        self.volumes = set(volumes)
 
     def errors(self) -> List[str]:
         if not self.containers:
-            return ["
+            return ["datahub is not running"]
 
         return [
             f"{container.name} {container.status.value}"
@@ -176,6 +214,26 @@ class QuickstartStatus:
             },
         )
 
+    def get_containers(self) -> Set[str]:
+        if self.containers:
+            return {container.name for container in self.containers}
+        else:
+            return set()
+
+
+def detect_legacy_quickstart_compose(containers: Set[str]) -> bool:
+    return "zookeeper" in containers
+
+
+def _get_services_from_compose(compose_file: str) -> Set[str]:
+    with open(compose_file) as config_file:
+        return yaml.safe_load(config_file).get("services", {}).keys()
+
+
+def _get_volumes_from_compose(compose_file: str) -> Set[str]:
+    with open(compose_file) as config_file:
+        return yaml.safe_load(config_file).get("volumes", {}).keys()
+
 
 def check_docker_quickstart() -> QuickstartStatus:
     container_statuses: List[DockerContainerStatus] = []
@@ -188,7 +246,7 @@ def check_docker_quickstart() -> QuickstartStatus:
         ignore_removed=True,
     )
     if len(containers) == 0:
-        return QuickstartStatus([])
+        return QuickstartStatus([], [], running_unsupported_version=False)
 
     # load the expected containers from the docker-compose file
     config_files = (
@@ -197,16 +255,17 @@ def check_docker_quickstart() -> QuickstartStatus:
         .split(",")
     )
 
-    # If using profiles, alternative check
+    # If using profiles, alternative check  ##TODO: Does this really work? Check mixpanel for usage of this.
     if config_files and "/profiles/" in config_files[0]:
         return check_docker_quickstart_profiles(client)
 
     all_containers = set()
     for config_file in config_files:
-        … (4 removed lines are not captured in this view)
+        all_containers.update(_get_services_from_compose(config_file))
+
+    all_volumes = set()
+    for config_file in config_files:
+        all_volumes.update(_get_volumes_from_compose(config_file))
 
     existing_containers = set()
     # Check that the containers are running and healthy.
@@ -240,8 +299,12 @@ def check_docker_quickstart() -> QuickstartStatus:
         container_statuses.append(
            DockerContainerStatus(missing, ContainerStatus.MISSING)
         )
-
-    return QuickstartStatus(
+    running_unsupported_version = detect_legacy_quickstart_compose(all_containers)
+    return QuickstartStatus(
+        containers=container_statuses,
+        volumes=list(all_volumes),
+        running_unsupported_version=running_unsupported_version,
+    )
 
 
 def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
@@ -254,7 +317,7 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
         ignore_removed=True,
     )
     if len(containers) == 0:
-        return QuickstartStatus([])
+        return QuickstartStatus([], [], running_unsupported_version=False)
 
     existing_containers = set()
     # Check that the containers are running and healthy.
@@ -273,4 +336,36 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
 
         container_statuses.append(DockerContainerStatus(name, status))
 
-
+    # TODO: Can this be handled with older verions?
+    return QuickstartStatus(
+        container_statuses, volumes=[], running_unsupported_version=False
+    )
+
+
+def check_upgrade_supported(
+    quickstart_compose_file: List[pathlib.Path], quickstart_status: QuickstartStatus
+) -> bool:
+    if (
+        quickstart_status.running_unsupported_version
+    ):  # we detected a legacy quickstart service
+        return False
+
+    if not quickstart_status.get_containers():  # no containers are running
+        return True
+
+    compose_services = set()
+    compose_volumes = set()
+
+    for compose_file in quickstart_compose_file:
+        compose_services.update(_get_services_from_compose(str(compose_file)))
+        compose_volumes.update(_get_volumes_from_compose(str(compose_file)))
+
+    # if all services and volumes are not the same, the state in the volumes may not be compatible with the new services.
+    # We are checking for containers and volumes per the compose file, not necessarily all of them being present
+    if (
+        compose_services == quickstart_status.get_containers()
+        and compose_volumes == quickstart_status.volumes
+    ):
+        return True
+    else:
+        return False
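
Together, check_docker_quickstart and check_upgrade_supported let the CLI decide whether a running quickstart instance can be upgraded in place: an upgrade is allowed only when no containers are running, or when the running services and volumes exactly match those declared in the compose files and no legacy (zookeeper-based) service is detected. A minimal sketch of that wiring (the actual call site in docker_cli.py is not shown in this diff, so the glue below is an assumption, and the compose-file path is hypothetical):

    import pathlib

    from datahub.cli.docker_check import check_docker_quickstart, check_upgrade_supported

    # Inspect the currently running quickstart containers and volumes.
    status = check_docker_quickstart()
    # Hypothetical path; the real CLI resolves the compose file(s) itself.
    compose_files = [pathlib.Path("docker-compose.quickstart.yml")]

    if check_upgrade_supported(compose_files, status):
        print("Running services/volumes match the compose file; safe to upgrade in place.")
    else:
        print("Legacy or mismatched quickstart detected; a fresh deployment is needed.")
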