acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +3 -5
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +3 -3
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +6 -12
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +7 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +251 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +29 -5
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +20 -13
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Iterable, List, Optional, Union
 
 import yaml
 from pydantic import validator
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):
 
 
 class AllowedValue(ConfigModel):
-    value: str
+    value: Union[int, float, str]
     description: Optional[str] = None
 
 
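The widened `value` field means allowed values can now be declared as numbers as well as strings. A minimal sketch of the effect, assuming the hunk above comes from datahub/api/entities/structuredproperties/structuredproperties.py (the only matching +2/-2 entry in the file list) and that AllowedValue is instantiated like any other pydantic model:

from datahub.api.entities.structuredproperties.structuredproperties import AllowedValue

# Previously only strings validated; ints and floats are now accepted directly.
tier = AllowedValue(value="gold", description="Gold tier")
limit = AllowedValue(value=100)   # int value
ratio = AllowedValue(value=0.75)  # float value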
datahub/cli/check_cli.py
CHANGED
@@ -5,7 +5,8 @@ import pathlib
 import pprint
 import shutil
 import tempfile
-from
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Union
 
 import click
 
@@ -20,7 +21,10 @@ from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.telemetry import telemetry
-from datahub.utilities.file_backed_collections import
+from datahub.utilities.file_backed_collections import (
+    ConnectionWrapper,
+    FileBackedDict,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -391,29 +395,78 @@ def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
         raise e
 
 
+def _jsonify(data: Any) -> Any:
+    if dataclasses.is_dataclass(data):
+        # dataclasses.asdict() is recursive. We're doing the recursion
+        # manually here via _jsonify calls, so we can't use
+        # dataclasses.asdict() here.
+        return {
+            f.name: _jsonify(getattr(data, f.name)) for f in dataclasses.fields(data)
+        }
+    elif isinstance(data, list):
+        return [_jsonify(item) for item in data]
+    elif isinstance(data, dict):
+        return {_jsonify(k): _jsonify(v) for k, v in data.items()}
+    elif isinstance(data, datetime):
+        return data.isoformat()
+    else:
+        return data
+
+
 @check.command()
-@click.argument("
-
-def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
+@click.argument("db-file", type=click.Path(exists=True, dir_okay=False))
+def extract_sql_agg_log(db_file: str) -> None:
     """Convert a sqlite db generated by the SqlParsingAggregator into a JSON."""
 
-
+    if pathlib.Path(db_file).suffix != ".db":
+        raise click.UsageError("DB file must be a sqlite db")
+
+    output_dir = pathlib.Path(db_file).with_suffix("")
+    output_dir.mkdir(exist_ok=True)
+
+    shared_connection = ConnectionWrapper(pathlib.Path(db_file))
+
+    tables: List[str] = [
+        row[0]
+        for row in shared_connection.execute(
+            """\
+SELECT
+    name
+FROM
+    sqlite_schema
+WHERE
+    type ='table' AND
+    name NOT LIKE 'sqlite_%';
+""",
+            parameters={},
+        )
+    ]
+    logger.info(f"Extracting {len(tables)} tables from {db_file}: {tables}")
+
+    for table in tables:
+        table_output_path = output_dir / f"{table}.json"
+        if table_output_path.exists():
+            logger.info(f"Skipping {table_output_path} because it already exists")
+            continue
 
-
+        # Some of the tables might actually be FileBackedList. Because
+        # the list is built on top of the FileBackedDict, we don't
+        # need to distinguish between the two cases.
 
-
-
-
-    )
-    logger.info(f"Extracting {len(query_log)} queries from {query_log_file}")
-    queries = [dataclasses.asdict(query) for query in query_log]
+        table_data: FileBackedDict[Any] = FileBackedDict(
+            shared_connection=shared_connection, tablename=table
+        )
 
-
-    with
-
-
-
-
+        data = {}
+        with click.progressbar(
+            table_data.items(), length=len(table_data), label=f"Extracting {table}"
+        ) as items:
+            for k, v in items:
+                data[k] = _jsonify(v)
+
+        with open(table_output_path, "w") as f:
+            json.dump(data, f, indent=2, default=str)
+        logger.info(f"Extracted {len(data)} entries to {table_output_path}")
 
 
 @check.command()
datahub/cli/docker_cli.py
CHANGED
@@ -231,7 +231,7 @@ def _docker_compose_v2() -> List[str]:
         # docker-compose v1 is not installed either.
         raise DockerComposeVersionError(
             "You don't have Docker Compose installed. Please install Docker Compose. See https://docs.docker.com/compose/install/.",
-        )
+        ) from None
 
 
 def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
@@ -430,7 +430,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
     return quickstart_arch
 
 
-@docker.command()
+@docker.command()
 @click.option(
     "--version",
     type=str,
@@ -592,7 +592,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
         "arch",
     ]
 )
-def quickstart(
+def quickstart(
     version: Optional[str],
     build_locally: bool,
     pull_images: bool,
datahub/cli/iceberg_cli.py
CHANGED
@@ -645,7 +645,7 @@ def get_all_warehouses(client: DataHubGraph) -> Iterator[str]:
     graph_query = """
         query getIcebergWarehouses($start: Int, $count: Int) {
             search(
-                input: {type: DATA_PLATFORM_INSTANCE, query: "
+                input: {type: DATA_PLATFORM_INSTANCE, query: "dataPlatform:iceberg", start: $start, count: $count}
             ) {
                 start
                 total
datahub/cli/ingest_cli.py
CHANGED
@@ -15,14 +15,14 @@ from tabulate import tabulate
 from datahub._version import nice_version_name
 from datahub.cli import cli_utils
 from datahub.cli.config_utils import CONDENSED_DATAHUB_CONFIG_PATH
-from datahub.configuration.common import
+from datahub.configuration.common import GraphError
 from datahub.configuration.config_loader import load_config_file
-from datahub.emitter.mce_builder import datahub_guid
 from datahub.ingestion.graph.client import get_default_graph
 from datahub.ingestion.run.connection import ConnectionManager
 from datahub.ingestion.run.pipeline import Pipeline
 from datahub.telemetry import telemetry
 from datahub.upgrade import upgrade
+from datahub.utilities.ingest_utils import deploy_source_vars
 from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
@@ -191,23 +191,6 @@ def run(
     # don't raise SystemExit if there's no error
 
 
-def _make_ingestion_urn(name: str) -> str:
-    guid = datahub_guid(
-        {
-            "name": name,
-        }
-    )
-    return f"urn:li:dataHubIngestionSource:deploy-{guid}"
-
-
-class DeployOptions(ConfigModel):
-    name: str
-    schedule: Optional[str] = None
-    time_zone: str = "UTC"
-    cli_version: Optional[str] = None
-    executor_id: str = "default"
-
-
 @ingest.command()
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
@@ -258,6 +241,16 @@ class DeployOptions(ConfigModel):
     required=False,
     default="UTC",
 )
+@click.option(
+    "--debug", type=bool, help="Should we debug.", required=False, default=False
+)
+@click.option(
+    "--extra-pip",
+    type=str,
+    help='Extra pip packages. e.g. ["memray"]',
+    required=False,
+    default=None,
+)
 def deploy(
     name: Optional[str],
     config: str,
@@ -266,6 +259,8 @@ def deploy(
     cli_version: Optional[str],
     schedule: Optional[str],
     time_zone: str,
+    extra_pip: Optional[str],
+    debug: bool = False,
 ) -> None:
     """
     Deploy an ingestion recipe to your DataHub instance.
@@ -276,83 +271,23 @@ def deploy(
 
     datahub_graph = get_default_graph()
 
-
-
-
-
-
+    variables = deploy_source_vars(
+        name=name,
+        config=config,
+        urn=urn,
+        executor_id=executor_id,
+        cli_version=cli_version,
+        schedule=schedule,
+        time_zone=time_zone,
+        extra_pip=extra_pip,
+        debug=debug,
     )
 
-    deploy_options_raw = pipeline_config.pop("deployment", None)
-    if deploy_options_raw is not None:
-        deploy_options = DeployOptions.parse_obj(deploy_options_raw)
-
-        if name:
-            logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
-            deploy_options.name = name
-    else:
-        if not name:
-            raise click.UsageError(
-                "Either --name must be set or deployment_name specified in the config"
-            )
-        deploy_options = DeployOptions(name=name)
-
-    # Use remaining CLI args to override deploy_options
-    if schedule:
-        deploy_options.schedule = schedule
-    if time_zone:
-        deploy_options.time_zone = time_zone
-    if cli_version:
-        deploy_options.cli_version = cli_version
-    if executor_id:
-        deploy_options.executor_id = executor_id
-
-    logger.info(f"Using {repr(deploy_options)}")
-
-    if not urn:
-        # When urn/name is not specified, we will generate a unique urn based on the deployment name.
-        urn = _make_ingestion_urn(deploy_options.name)
-        logger.info(f"Using recipe urn: {urn}")
-
-    # Invariant - at this point, both urn and deploy_options are set.
-
-    variables: dict = {
-        "urn": urn,
-        "name": deploy_options.name,
-        "type": pipeline_config["source"]["type"],
-        "recipe": json.dumps(pipeline_config),
-        "executorId": deploy_options.executor_id,
-        "version": deploy_options.cli_version,
-    }
-
-    if deploy_options.schedule is not None:
-        variables["schedule"] = {
-            "interval": deploy_options.schedule,
-            "timezone": deploy_options.time_zone,
-        }
-
     # The updateIngestionSource endpoint can actually do upserts as well.
     graphql_query: str = textwrap.dedent(
         """
-            mutation updateIngestionSource(
-                $urn:
-                $name: String!,
-                $type: String!,
-                $schedule: UpdateIngestionSourceScheduleInput,
-                $recipe: String!,
-                $executorId: String!
-                $version: String) {
-
-                updateIngestionSource(urn: $urn, input: {
-                    name: $name,
-                    type: $type,
-                    schedule: $schedule,
-                    config: {
-                        recipe: $recipe,
-                        executorId: $executorId,
-                        version: $version,
-                    }
-                })
+            mutation updateIngestionSource($urn: String!, $input: UpdateIngestionSourceInput!) {
+                updateIngestionSource(urn: $urn, input: $input)
             }
         """
     )
@@ -372,7 +307,7 @@ def deploy(
         sys.exit(1)
 
     click.echo(
-        f"✅ Successfully wrote data ingestion source metadata for recipe {
+        f"✅ Successfully wrote data ingestion source metadata for recipe {variables['input']['name']}:"
     )
     click.echo(response)
 
@@ -414,7 +349,9 @@ def parse_restli_response(response):
 
 
 @ingest.command()
-@click.argument(
+@click.argument(
+    "path", type=click.Path(exists=False)
+)  # exists=False since it only supports local filesystems
 def mcps(path: str) -> None:
     """
     Ingest metadata from a mcp json file or directory of files.
datahub/cli/lite_cli.py
CHANGED
@@ -285,10 +285,12 @@ def search(
     ctx: click.Context,
     query: str = "",
     flavor: str = SearchFlavor.FREE_TEXT.name.lower(),
-    aspect: List[str] =
+    aspect: Optional[List[str]] = None,
     details: bool = True,
 ) -> None:
     """Search with a free text or exact query string"""
+    if aspect is None:
+        aspect = []
 
     # query flavor should be sanitized by now, but we still need to convert it to a SearchFlavor
     try:
@@ -296,7 +298,7 @@ def search(
     except KeyError:
         raise click.UsageError(
             f"Failed to find a matching query flavor for {flavor}. Valid values are {[x.lower() for x in SearchFlavor._member_names_]}"
-        )
+        ) from None
     catalog = _get_datahub_lite(read_only=True)
     # sanitize query
     result_ids = set()
datahub/cli/specific/dataproduct_cli.py
CHANGED
@@ -49,7 +49,7 @@ def _abort_if_non_existent_urn(graph: DataHubGraph, urn: str, operation: str) ->
         entity_type = parsed_urn.get_type()
     except Exception:
         click.secho(f"Provided urn {urn} does not seem valid", fg="red")
-        raise click.Abort()
+        raise click.Abort() from None
     else:
         if not graph.exists(urn):
             click.secho(
datahub/cli/specific/dataset_cli.py
CHANGED
@@ -1,12 +1,15 @@
+import filecmp
 import json
 import logging
+import os
+import shutil
 from pathlib import Path
-from typing import Set, Tuple
+from typing import List, Set, Tuple
 
 import click
 from click_default_group import DefaultGroup
 
-from datahub.api.entities.dataset.dataset import Dataset
+from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
@@ -30,18 +33,9 @@ def dataset() -> None:
 @telemetry.with_telemetry()
 def upsert(file: Path) -> None:
     """Upsert attributes to a Dataset in DataHub."""
-
-
-
-        try:
-            for mcp in dataset.generate_mcp():
-                graph.emit(mcp)
-            click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
-        except Exception as e:
-            click.secho(
-                f"Update failed for id {id}. due to {e}",
-                fg="red",
-            )
+    # Call the sync command with to_datahub=True to perform the upsert operation
+    ctx = click.get_current_context()
+    ctx.invoke(sync, file=str(file), to_datahub=True)
 
 
 @dataset.command(
@@ -111,3 +105,123 @@ def _get_existing_siblings(graph: DataHubGraph, urn: str) -> Set[str]:
         return set(existing.siblings)
     else:
         return set()
+
+
+@dataset.command(
+    name="file",
+)
+@click.option("--lintCheck", required=False, is_flag=True)
+@click.option("--lintFix", required=False, is_flag=True)
+@click.argument("file", type=click.Path(exists=True))
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def file(lintcheck: bool, lintfix: bool, file: str) -> None:
+    """Operate on a Dataset file"""
+
+    if lintcheck or lintfix:
+        import tempfile
+        from pathlib import Path
+
+        # Create a temporary file in a secure way
+        # The file will be automatically deleted when the context manager exits
+        with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as temp:
+            temp_path = Path(temp.name)
+            try:
+                # Copy content to the temporary file
+                shutil.copyfile(file, temp_path)
+
+                # Run the linting
+                datasets = Dataset.from_yaml(temp.name)
+                for dataset in datasets:
+                    dataset.to_yaml(temp_path)
+
+                # Compare the files
+                files_match = filecmp.cmp(file, temp_path)
+
+                if files_match:
+                    click.secho("No differences found", fg="green")
+                else:
+                    # Show diff for visibility
+                    os.system(f"diff {file} {temp_path}")
+
+                    if lintfix:
+                        shutil.copyfile(temp_path, file)
+                        click.secho(f"Fixed linting issues in {file}", fg="green")
+                    else:
+                        click.secho(
+                            f"To fix these differences, run 'datahub dataset file --lintFix {file}'",
+                            fg="yellow",
+                        )
+            finally:
+                # Ensure the temporary file is removed
+                if temp_path.exists():
+                    temp_path.unlink()
+    else:
+        click.secho(
+            "No operation specified. Choose from --lintCheck or --lintFix", fg="yellow"
+        )
+
+
+@dataset.command(
+    name="sync",
+)
+@click.option("-f", "--file", required=True, type=click.Path(exists=True))
+@click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def sync(file: str, to_datahub: bool) -> None:
+    """Sync a Dataset file to/from DataHub"""
+
+    failures: List[str] = []
+    with get_default_graph() as graph:
+        datasets = Dataset.from_yaml(file)
+        for dataset in datasets:
+            assert (
+                dataset.urn is not None
+            )  # Validator should have ensured this is filled. Tell mypy it's not None
+            if to_datahub:
+                missing_entity_references = [
+                    entity_reference
+                    for entity_reference in dataset.entity_references()
+                    if not graph.exists(entity_reference)
+                ]
+                if missing_entity_references:
+                    click.secho(
+                        "\n\t- ".join(
+                            [
+                                f"Skipping Dataset {dataset.urn} due to missing entity references: "
+                            ]
+                            + missing_entity_references
+                        ),
+                        fg="red",
+                    )
+                    failures.append(dataset.urn)
+                    continue
+                try:
+                    for mcp in dataset.generate_mcp():
+                        graph.emit(mcp)
+                    click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+                except Exception as e:
+                    click.secho(
+                        f"Update failed for id {id}. due to {e}",
+                        fg="red",
+                    )
+            else:
+                # Sync from DataHub
+                if graph.exists(dataset.urn):
+                    dataset_get_config = DatasetRetrievalConfig()
+                    if dataset.downstreams:
+                        dataset_get_config.include_downstreams = True
+                    existing_dataset: Dataset = Dataset.from_datahub(
+                        graph=graph, urn=dataset.urn, config=dataset_get_config
+                    )
+                    existing_dataset.to_yaml(Path(file))
+                else:
+                    click.secho(f"Dataset {dataset.urn} does not exist")
+                    failures.append(dataset.urn)
+        if failures:
+            click.secho(
+                f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+                fg="red",
+            )
+            raise click.Abort()
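For reference, a minimal programmatic sketch of the from-DataHub direction that the new sync command implements, pulling an existing dataset definition back into YAML; the urn and file name below are placeholders, not values from the diff:

from pathlib import Path

from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
from datahub.ingestion.graph.client import get_default_graph

with get_default_graph() as graph:
    config = DatasetRetrievalConfig()
    config.include_downstreams = True  # mirrors sync when downstreams are declared in the file
    existing = Dataset.from_datahub(graph=graph, urn="<dataset urn>", config=config)
    existing.to_yaml(Path("dataset.yaml"))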
datahub/configuration/common.py
CHANGED
@@ -20,7 +20,7 @@ from pydantic import BaseModel, Extra, ValidationError
 from pydantic.fields import Field
 from typing_extensions import Protocol, Self
 
-from datahub.configuration._config_enum import ConfigEnum as ConfigEnum
+from datahub.configuration._config_enum import ConfigEnum as ConfigEnum
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.utilities.dedup_list import deduplicate_list
 
@@ -130,7 +130,7 @@ class PermissiveConfigModel(ConfigModel):
     # It is usually used for argument bags that are passed through to third-party libraries.
 
     class Config:
-        if PYDANTIC_VERSION_2:
+        if PYDANTIC_VERSION_2:  # noqa: SIM108
             extra = "allow"
         else:
             extra = Extra.allow
@@ -198,6 +198,14 @@ class IgnorableError(MetaError):
     """An error that can be ignored."""
 
 
+class TraceTimeoutError(OperationalError):
+    """Failure to complete an API Trace within the timeout."""
+
+
+class TraceValidationError(OperationalError):
+    """Failure to complete the expected write operation."""
+
+
 @runtime_checkable
 class ExceptionWithProps(Protocol):
     def get_telemetry_props(self) -> Dict[str, Any]: ...
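Both new exception types subclass OperationalError, so existing `except OperationalError` handlers still catch them; callers that care about the distinction can handle them first. A minimal sketch (the `_emit_with_trace` helper is a placeholder, not part of the package):

from datahub.configuration.common import (
    OperationalError,
    TraceTimeoutError,
    TraceValidationError,
)


def _emit_with_trace() -> None:
    """Placeholder for an emitter call that waits on an API trace."""


try:
    _emit_with_trace()
except TraceTimeoutError:
    ...  # the trace did not complete within the configured timeout
except TraceValidationError:
    ...  # the write completed, but not with the expected result
except OperationalError:
    ...  # any other operational failure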
datahub/configuration/git.py
CHANGED
@@ -43,9 +43,7 @@ class GitReference(ConfigModel):
 
     @validator("repo", pre=True)
     def simplify_repo_url(cls, repo: str) -> str:
-        if repo.startswith("github.com/"):
-            repo = f"https://{repo}"
-        elif repo.startswith("gitlab.com"):
+        if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
             repo = f"https://{repo}"
         elif repo.count("/") == 1:
             repo = f"https://github.com/{repo}"
datahub/configuration/kafka.py
CHANGED