acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/cli/check_cli.py +65 -11
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +3 -4
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +41 -8
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +47 -45
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +73 -30
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +12 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/redshift/redshift.py +17 -0
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -12
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/mssql/source.py +24 -15
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +11 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +57 -2
- datahub/ingestion/source/tableau/tableau.py +57 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +56 -30
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1253 -536
- datahub/metadata/_urns/urn_defs.py +1797 -1685
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +16614 -16538
- datahub/metadata/schemas/ContainerProperties.avsc +2 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataJobInfo.avsc +2 -0
- datahub/metadata/schemas/DataProcessKey.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +4 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
- datahub/metadata/schemas/MLModelKey.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/datajob.py +39 -15
- datahub/sdk/lineage_client.py +2 -0
- datahub/sdk/main_client.py +14 -2
- datahub/sdk/search_client.py +4 -3
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt
CHANGED

@@ -39,6 +39,7 @@ datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:
 datahub-debug = datahub.ingestion.source.debug.datahub_debug:DataHubDebugSource
 datahub-gc = datahub.ingestion.source.gc.datahub_gc:DataHubGcSource
 datahub-lineage-file = datahub.ingestion.source.metadata.lineage:LineageFileSource
+datahub-mock-data = datahub.ingestion.source.mock_data.datahub_mock_data:DataHubMockDataSource
 dbt = datahub.ingestion.source.dbt.dbt_core:DBTCoreSource
 dbt-cloud = datahub.ingestion.source.dbt.dbt_cloud:DBTCloudSource
 delta-lake = datahub.ingestion.source.delta_lake:DeltaLakeSource
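The only change to entry_points.txt is the registration of the new mock-data ingestion source. As a rough, hypothetical sketch of what such an entry point means (the group name "datahub.ingestion.source.plugins" and the lookup code below are assumptions, not taken from this diff; DataHub's real lookup lives in its source registry), the declared name can be resolved to its class via importlib.metadata:

# Illustrative sketch only (Python 3.10+ importlib.metadata API).
# The group name is an assumption; adjust to the group DataHub actually uses.
from importlib.metadata import entry_points

eps = entry_points(group="datahub.ingestion.source.plugins")
mock_eps = [ep for ep in eps if ep.name == "datahub-mock-data"]
if mock_eps:
    source_cls = mock_eps[0].load()  # would import DataHubMockDataSource
    print(source_cls.__name__)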
datahub/_version.py
CHANGED

datahub/api/entities/dataset/dataset.py
CHANGED

@@ -383,7 +383,7 @@ class Dataset(StrictModel):
     urn: Optional[str] = None
     description: Optional[str] = None
     name: Optional[str] = None
-    schema_metadata: Optional[SchemaSpecification] = Field(alias="schema")
+    schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
     downstreams: Optional[List[str]] = None
     properties: Optional[Dict[str, str]] = None
     subtype: Optional[str] = None
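The one-line dataset.py change gives the aliased field an explicit default. A minimal sketch of the behavior this addresses, assuming pydantic v2 semantics (the model below is hypothetical, not DataHub's actual Dataset class): an Optional field declared only with Field(alias=...) has no default and is therefore still required.

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class Broken(BaseModel):
    schema_metadata: Optional[str] = Field(alias="schema")  # no default -> still required

class Fixed(BaseModel):
    schema_metadata: Optional[str] = Field(default=None, alias="schema")

print(Fixed().schema_metadata)  # None; construction without "schema" now works
try:
    Broken()
except ValidationError as e:
    print("missing required field:", e.errors()[0]["loc"])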
datahub/cli/check_cli.py
CHANGED
@@ -9,6 +9,7 @@ from datetime import datetime
 from typing import Any, Dict, List, Optional, Union

 import click
+from tabulate import tabulate

 from datahub._version import __package_name__
 from datahub.cli.json_file import check_mce_file
@@ -21,7 +22,7 @@ from datahub.ingestion.run.pipeline import Pipeline
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
 from datahub.ingestion.transformer.transform_registry import transform_registry
-from datahub.
+from datahub.upgrade import upgrade
 from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedDict,
@@ -47,7 +48,6 @@ def check() -> None:
 @click.option(
     "--unpack-mces", default=False, is_flag=True, help="Converts MCEs into MCPs"
 )
-@telemetry.with_telemetry()
 def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None:
     """Check the schema of a metadata (MCE or MCP) JSON file."""

@@ -105,7 +105,6 @@ def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None:
     default=(),
     help="[Advanced] Paths in the deepdiff object to ignore",
 )
-@telemetry.with_telemetry()
 def metadata_diff(
     actual_file: str, expected_file: str, verbose: bool, ignore_path: List[str]
 ) -> None:
@@ -142,7 +141,6 @@ def metadata_diff(
     type=str,
     default=None,
 )
-@telemetry.with_telemetry()
 def plugins(source: Optional[str], verbose: bool) -> None:
     """List the enabled ingestion plugins."""

@@ -234,7 +232,7 @@ def sql_format(sql: str, platform: str) -> None:
     default=True,
     help="Run in offline mode and disable schema-aware parsing.",
 )
-@
+@upgrade.check_upgrade
 def sql_lineage(
     sql: Optional[str],
     sql_file: Optional[str],
@@ -297,7 +295,6 @@ def sql_lineage(
     type=str,
     help="the input to validate",
 )
-@telemetry.with_telemetry()
 def test_allow_deny(config: str, input: str, pattern_key: str) -> None:
     """Test input string against AllowDeny pattern in a DataHub recipe.

@@ -346,7 +343,6 @@ def test_allow_deny(config: str, input: str, pattern_key: str) -> None:
     type=str,
     help="The input to validate",
 )
-@telemetry.with_telemetry()
 def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
     """Test input path string against PathSpec patterns in a DataHub recipe.

@@ -471,6 +467,7 @@ WHERE


 @check.command()
+@upgrade.check_upgrade
 def server_config() -> None:
     """Print the server config."""
     graph = get_default_graph(ClientMode.CLI)
@@ -482,26 +479,83 @@ def server_config() -> None:

 @check.command()
 @click.option(
-    "--urn", required=
+    "--urn", required=False, help="The urn or urn pattern (supports % for wildcard)"
 )
 @click.option("--aspect", default=None, help="Filter to a specific aspect name.")
 @click.option(
     "--start", type=int, default=None, help="Row number of sql store to restore from."
 )
 @click.option("--batch-size", type=int, default=None, help="How many rows to restore.")
+@click.option(
+    "--file",
+    required=False,
+    type=click.Path(exists=True, dir_okay=True, readable=True),
+    help="File absolute path containing URNs (one per line) to restore indices",
+)
+@upgrade.check_upgrade
 def restore_indices(
-    urn: str,
+    urn: Optional[str],
     aspect: Optional[str],
     start: Optional[int],
     batch_size: Optional[int],
+    file: Optional[str],
 ) -> None:
     """Resync metadata changes into the search and graph indices."""
+    if urn is None and file is None:
+        raise click.UsageError("Either --urn or --file must be provided")
     graph = get_default_graph(ClientMode.CLI)

-
+    graph.restore_indices(
         urn_pattern=urn,
         aspect=aspect,
         start=start,
         batch_size=batch_size,
+        file=file,
     )
-
+
+
+@check.command()
+@upgrade.check_upgrade
+def get_kafka_consumer_offsets() -> None:
+    """Get Kafka consumer offsets from the DataHub API."""
+    graph = get_default_graph(ClientMode.CLI)
+    result = graph.get_kafka_consumer_offsets()
+
+    table_data = []
+    headers = [
+        "Topic",
+        "Consumer Group",
+        "Schema",
+        "Partition",
+        "Offset",
+        "Lag",
+        "Avg Lag",
+        "Max Lag",
+        "Total Lag",
+    ]
+
+    for topic, consumers in result.items():
+        for consumer_group, schemas in consumers.items():
+            for schema, data in schemas.items():
+                metrics = data.get("metrics", {})
+                partitions = data.get("partitions", {})
+
+                for partition, partition_data in partitions.items():
+                    table_data.append(
+                        [
+                            topic,
+                            consumer_group,
+                            schema,
+                            partition,
+                            partition_data.get("offset", "N/A"),
+                            partition_data.get("lag", "N/A"),
+                            metrics.get("avgLag", "N/A"),
+                            metrics.get("maxLag", "N/A"),
+                            metrics.get("totalLag", "N/A"),
+                        ]
+                    )
+
+    if table_data:
+        click.echo(tabulate(table_data, headers=headers, tablefmt="grid"))
+    else:
+        click.echo("No Kafka consumer offset data found.")
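Two things change in check_cli.py: restore_indices gains a --file option (and --urn becomes optional, as long as one of the two is given), and a new get_kafka_consumer_offsets command renders consumer lag with tabulate. Below is a small standalone sketch of that rendering, using a made-up payload shaped like the nested dict the command iterates over (topic → consumer group → schema → metrics/partitions); the topic and group names are invented for illustration, and the real data would come from graph.get_kafka_consumer_offsets().

from tabulate import tabulate

# Hypothetical example payload; names and numbers are invented.
result = {
    "MetadataChangeLog_Versioned_v1": {
        "generic-mae-consumer-job-client": {
            "value": {
                "metrics": {"avgLag": 0, "maxLag": 3, "totalLag": 3},
                "partitions": {"0": {"offset": 1523, "lag": 3}},
            },
        },
    },
}

rows = []
for topic, consumers in result.items():
    for group, schemas in consumers.items():
        for schema, data in schemas.items():
            metrics = data.get("metrics", {})
            for partition, pdata in data.get("partitions", {}).items():
                rows.append(
                    [topic, group, schema, partition,
                     pdata.get("offset", "N/A"), pdata.get("lag", "N/A"),
                     metrics.get("avgLag", "N/A"), metrics.get("maxLag", "N/A"),
                     metrics.get("totalLag", "N/A")]
                )

headers = ["Topic", "Consumer Group", "Schema", "Partition",
           "Offset", "Lag", "Avg Lag", "Max Lag", "Total Lag"]
print(tabulate(rows, headers=headers, tablefmt="grid"))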
datahub/cli/cli_utils.py
CHANGED
@@ -3,6 +3,7 @@ import logging
 import time
 import typing
 from datetime import datetime
+from functools import wraps
 from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union

 import click
@@ -424,3 +425,65 @@ def ensure_has_system_metadata(
     props = metadata.properties
     props["clientId"] = datahub_version.__package_name__
     props["clientVersion"] = datahub_version.__version__
+
+
+def enable_auto_decorators(main_group: click.Group) -> None:
+    """
+    Enable automatic decorators for all click commands.
+    This wraps existing command callback functions to add upgrade and telemetry decorators.
+    """
+
+    def has_decorator(func: Any, module_pattern: str, function_pattern: str) -> bool:
+        """Check if function already has a specific decorator"""
+        if hasattr(func, "__wrapped__"):
+            current_func = func
+            while hasattr(current_func, "__wrapped__"):
+                # Check if this wrapper matches the module and function patterns
+                if (
+                    hasattr(current_func, "__module__")
+                    and module_pattern in current_func.__module__
+                    and hasattr(current_func, "__name__")
+                    and function_pattern in current_func.__name__
+                ):
+                    return True
+                current_func = current_func.__wrapped__
+        return False
+
+    def has_telemetry_decorator(func):
+        return has_decorator(func, "telemetry", "with_telemetry")
+
+    def wrap_command_callback(command_obj):
+        """Wrap a command's callback function to add decorators"""
+        if hasattr(command_obj, "callback") and command_obj.callback:
+            original_callback = command_obj.callback
+
+            # Import here to avoid circular imports
+            from datahub.telemetry import telemetry
+
+            decorated_callback = original_callback
+
+            if not has_telemetry_decorator(decorated_callback):
+                log.debug(
+                    f"Applying telemetry decorator to {original_callback.__module__}.{original_callback.__name__}"
+                )
+                decorated_callback = telemetry.with_telemetry()(decorated_callback)
+
+            # Preserve the original function's metadata
+            decorated_callback = wraps(original_callback)(decorated_callback)
+
+            command_obj.callback = decorated_callback
+
+    def wrap_group_commands(group_obj):
+        """Recursively wrap all commands in a group"""
+        if hasattr(group_obj, "commands"):
+            for _, command_obj in group_obj.commands.items():
+                if isinstance(command_obj, click.Group):
+                    # Recursively wrap sub-groups
+                    wrap_group_commands(command_obj)
+                else:
+                    # Wrap individual commands
+                    wrap_command_callback(command_obj)
+
+    wrap_group_commands(main_group)
+
+    log.debug("Auto-decorators enabled successfully")
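The new enable_auto_decorators helper retrofits decorators onto already-registered click commands by rewriting each command's callback in place. A self-contained sketch of that pattern follows; the audit decorator and command names are stand-ins, not DataHub code, which applies telemetry.with_telemetry() instead.

from functools import wraps

import click

def audit(func):
    # Stand-in for the telemetry/upgrade decorators DataHub applies.
    @wraps(func)
    def wrapper(*args, **kwargs):
        click.echo(f"[audit] running {func.__name__}")
        return func(*args, **kwargs)
    return wrapper

def wrap_all_commands(group: click.Group) -> None:
    # Recursively rewrite each command's callback, mirroring wrap_group_commands.
    for cmd in group.commands.values():
        if isinstance(cmd, click.Group):
            wrap_all_commands(cmd)
        elif cmd.callback is not None:
            cmd.callback = audit(cmd.callback)

@click.group()
def cli() -> None:
    pass

@cli.command()
def ping() -> None:
    click.echo("pong")

wrap_all_commands(cli)

if __name__ == "__main__":
    cli()

Because functools.wraps preserves the original callback's metadata, the wrapped command still reports its own name and docstring in --help output.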
datahub/cli/container_cli.py
CHANGED
@@ -3,6 +3,7 @@ import logging
 import click

 from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container
+from datahub.upgrade import upgrade

 logger = logging.getLogger(__name__)

@@ -16,6 +17,7 @@ def container() -> None:
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--tag-urn", required=True, type=str)
+@upgrade.check_upgrade
 def tag(container_urn: str, tag_urn: str) -> None:
     """Add patch to add a tag to all datasets in a container"""
     apply_association_to_container(container_urn, tag_urn, "tag")
@@ -24,6 +26,7 @@ def tag(container_urn: str, tag_urn: str) -> None:
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--term-urn", required=True, type=str)
+@upgrade.check_upgrade
 def term(container_urn: str, term_urn: str) -> None:
     """Add patch to add a term to all datasets in a container"""
     apply_association_to_container(container_urn, term_urn, "term")
@@ -32,6 +35,7 @@ def term(container_urn: str, term_urn: str) -> None:
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--owner-urn", required=True, type=str)
+@upgrade.check_upgrade
 def owner(container_urn: str, owner_urn: str) -> None:
     """Add patch to add a owner to all datasets in a container"""
     apply_association_to_container(container_urn, owner_urn, "owner")
@@ -40,6 +44,7 @@ def owner(container_urn: str, owner_urn: str) -> None:
 @container.command()
 @click.option("--container-urn", required=True, type=str)
 @click.option("--domain-urn", required=True, type=str)
+@upgrade.check_upgrade
 def domain(container_urn: str, domain_urn: str) -> None:
     """Add patch to add a domain to all datasets in a container"""
     apply_association_to_container(container_urn, domain_urn, "domain")
datahub/cli/delete_cli.py
CHANGED
@@ -18,7 +18,6 @@ from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.graph.filters import RemovedStatusFilter
-from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.urns.urn import guess_entity_type
@@ -116,7 +115,7 @@ class DeletionResult:
     help="specifies soft/hard deletion",
 )
 @click.option("-n", "--dry-run", required=False, is_flag=True)
-@
+@upgrade.check_upgrade
 def by_registry(
     registry_id: str,
     soft: bool,
@@ -171,7 +170,7 @@ def by_registry(
 @click.option(
     "-f", "--force", required=False, is_flag=True, help="force the delete if set"
 )
-@
+@upgrade.check_upgrade
 def references(urn: str, dry_run: bool, force: bool) -> None:
     """
     Delete all references to an entity (but not the entity itself).
@@ -234,6 +233,7 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
     help="Batch size when querying for entities to un-soft delete."
     "Maximum 5000. Large batch sizes may cause timeouts.",
 )
+@upgrade.check_upgrade
 def undo_by_filter(
     urn: Optional[str], platform: Optional[str], batch_size: int
 ) -> None:
@@ -370,7 +370,6 @@ def undo_by_filter(
     "--workers", type=int, default=1, help="Num of workers to use for deletion."
 )
 @upgrade.check_upgrade
-@telemetry.with_telemetry()
 def by_filter(
     urn: Optional[str],
     urn_file: Optional[str],
datahub/cli/docker_check.py
CHANGED
@@ -1,8 +1,9 @@
 import enum
 import os
+import pathlib
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Set

 import docker
 import docker.errors
@@ -13,6 +14,7 @@ from datahub.configuration.common import ExceptionWithProps

 # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
 MIN_MEMORY_NEEDED = 3.8 # GB
+MIN_DISK_SPACE_NEEDED = 12 # GB

 DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
 DATAHUB_COMPOSE_PROJECT_FILTER = {
@@ -37,6 +39,10 @@ class DockerLowMemoryError(Exception):
     SHOW_STACK_TRACE = False


+class DockerLowDiskSpaceError(Exception):
+    SHOW_STACK_TRACE = False
+
+
 class DockerComposeVersionError(Exception):
     SHOW_STACK_TRACE = False

@@ -102,6 +108,24 @@ def run_quickstart_preflight_checks(client: docker.DockerClient) -> None:
             "You can increase the memory allocated to Docker in the Docker settings."
         )

+    result = client.containers.run(
+        "alpine:latest",
+        "sh -c \"df -B1 / | tail -1 | awk '{print $2, $4}'\"", # total, available
+        remove=True,
+        stdout=True,
+        stderr=True,
+    )
+
+    output = result.decode("utf-8").strip()
+    total_bytes, available_bytes = map(int, output.split())
+
+    available_gb = available_bytes / (1024**3)
+    if available_gb < MIN_DISK_SPACE_NEEDED:
+        raise DockerLowDiskSpaceError(
+            f"Total Docker disk space available {available_gb:.2f}GB is below the minimum threshold {MIN_DISK_SPACE_NEEDED}GB. "
+            "You can increase the disk space allocated to Docker in the Docker settings or free up disk space`"
+        )
+

 class ContainerStatus(enum.Enum):
     OK = "is ok"
@@ -126,10 +150,24 @@
 @dataclass
 class QuickstartStatus:
     containers: List[DockerContainerStatus]
+    volumes: Set[str]
+    # On moving to compose profiles, this CLI will no longer support running quickstart instances from earlier versions.
+    # While the check command can work, upgrades or
+    running_unsupported_version: bool
+
+    def __init__(
+        self,
+        containers: List[DockerContainerStatus],
+        volumes: List[str],
+        running_unsupported_version: bool = False,
+    ):
+        self.containers = containers
+        self.running_unsupported_version = running_unsupported_version
+        self.volumes = set(volumes)

     def errors(self) -> List[str]:
         if not self.containers:
-            return ["
+            return ["datahub is not running"]

         return [
             f"{container.name} {container.status.value}"
@@ -176,6 +214,26 @@
             },
         )

+    def get_containers(self) -> Set[str]:
+        if self.containers:
+            return {container.name for container in self.containers}
+        else:
+            return set()
+
+
+def detect_legacy_quickstart_compose(containers: Set[str]) -> bool:
+    return "zookeeper" in containers
+
+
+def _get_services_from_compose(compose_file: str) -> Set[str]:
+    with open(compose_file) as config_file:
+        return yaml.safe_load(config_file).get("services", {}).keys()
+
+
+def _get_volumes_from_compose(compose_file: str) -> Set[str]:
+    with open(compose_file) as config_file:
+        return yaml.safe_load(config_file).get("volumes", {}).keys()
+

 def check_docker_quickstart() -> QuickstartStatus:
     container_statuses: List[DockerContainerStatus] = []
@@ -188,7 +246,7 @@
         ignore_removed=True,
     )
     if len(containers) == 0:
-        return QuickstartStatus([])
+        return QuickstartStatus([], [], running_unsupported_version=False)

     # load the expected containers from the docker-compose file
     config_files = (
@@ -197,16 +255,17 @@
         .split(",")
     )

-    # If using profiles, alternative check
+    # If using profiles, alternative check ##TODO: Does this really work? Check mixpanel for usage of this.
     if config_files and "/profiles/" in config_files[0]:
         return check_docker_quickstart_profiles(client)

     all_containers = set()
     for config_file in config_files:
-
-
-
-
+        all_containers.update(_get_services_from_compose(config_file))
+
+    all_volumes = set()
+    for config_file in config_files:
+        all_volumes.update(_get_volumes_from_compose(config_file))

     existing_containers = set()
     # Check that the containers are running and healthy.
@@ -240,8 +299,12 @@
         container_statuses.append(
             DockerContainerStatus(missing, ContainerStatus.MISSING)
         )
-
-    return QuickstartStatus(
+    running_unsupported_version = detect_legacy_quickstart_compose(all_containers)
+    return QuickstartStatus(
+        containers=container_statuses,
+        volumes=list(all_volumes),
+        running_unsupported_version=running_unsupported_version,
+    )


 def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
@@ -254,7 +317,7 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
         ignore_removed=True,
     )
     if len(containers) == 0:
-        return QuickstartStatus([])
+        return QuickstartStatus([], [], running_unsupported_version=False)

     existing_containers = set()
     # Check that the containers are running and healthy.
@@ -273,4 +336,36 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS

         container_statuses.append(DockerContainerStatus(name, status))

-
+    # TODO: Can this be handled with older verions?
+    return QuickstartStatus(
+        container_statuses, volumes=[], running_unsupported_version=False
+    )
+
+
+def check_upgrade_supported(
+    quickstart_compose_file: List[pathlib.Path], quickstart_status: QuickstartStatus
+) -> bool:
+    if (
+        quickstart_status.running_unsupported_version
+    ): # we detected a legacy quickstart service
+        return False
+
+    if not quickstart_status.get_containers(): # no containers are running
+        return True
+
+    compose_services = set()
+    compose_volumes = set()
+
+    for compose_file in quickstart_compose_file:
+        compose_services.update(_get_services_from_compose(str(compose_file)))
+        compose_volumes.update(_get_volumes_from_compose(str(compose_file)))
+
+    # if all services and volumes are not the same, the state in the volumes may not be compatible with the new services.
+    # We are checking for containers and volumes per the compose file, not necessarily all of them being present
+    if (
+        compose_services == quickstart_status.get_containers()
+        and compose_volumes == quickstart_status.volumes
+    ):
+        return True
+    else:
+        return False