acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  from enum import Enum
  from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import Iterable, List, Optional, Union

  import yaml
  from pydantic import validator
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):


  class AllowedValue(ConfigModel):
- value: str
+ value: Union[int, float, str]
  description: Optional[str] = None

datahub/cli/check_cli.py CHANGED
@@ -5,7 +5,8 @@ import pathlib
  import pprint
  import shutil
  import tempfile
- from typing import Dict, List, Optional, Union
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional, Union

  import click

@@ -20,7 +21,10 @@ from datahub.ingestion.sink.sink_registry import sink_registry
  from datahub.ingestion.source.source_registry import source_registry
  from datahub.ingestion.transformer.transform_registry import transform_registry
  from datahub.telemetry import telemetry
- from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
+ from datahub.utilities.file_backed_collections import (
+ ConnectionWrapper,
+ FileBackedDict,
+ )

  logger = logging.getLogger(__name__)

@@ -391,29 +395,78 @@ def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
  raise e


+ def _jsonify(data: Any) -> Any:
+ if dataclasses.is_dataclass(data):
+ # dataclasses.asdict() is recursive. We're doing the recursion
+ # manually here via _jsonify calls, so we can't use
+ # dataclasses.asdict() here.
+ return {
+ f.name: _jsonify(getattr(data, f.name)) for f in dataclasses.fields(data)
+ }
+ elif isinstance(data, list):
+ return [_jsonify(item) for item in data]
+ elif isinstance(data, dict):
+ return {_jsonify(k): _jsonify(v) for k, v in data.items()}
+ elif isinstance(data, datetime):
+ return data.isoformat()
+ else:
+ return data
+
+
  @check.command()
- @click.argument("query-log-file", type=click.Path(exists=True, dir_okay=False))
- @click.option("--output", type=click.Path())
- def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
+ @click.argument("db-file", type=click.Path(exists=True, dir_okay=False))
+ def extract_sql_agg_log(db_file: str) -> None:
  """Convert a sqlite db generated by the SqlParsingAggregator into a JSON."""

- from datahub.sql_parsing.sql_parsing_aggregator import LoggedQuery
+ if pathlib.Path(db_file).suffix != ".db":
+ raise click.UsageError("DB file must be a sqlite db")
+
+ output_dir = pathlib.Path(db_file).with_suffix("")
+ output_dir.mkdir(exist_ok=True)
+
+ shared_connection = ConnectionWrapper(pathlib.Path(db_file))
+
+ tables: List[str] = [
+ row[0]
+ for row in shared_connection.execute(
+ """\
+ SELECT
+ name
+ FROM
+ sqlite_schema
+ WHERE
+ type ='table' AND
+ name NOT LIKE 'sqlite_%';
+ """,
+ parameters={},
+ )
+ ]
+ logger.info(f"Extracting {len(tables)} tables from {db_file}: {tables}")
+
+ for table in tables:
+ table_output_path = output_dir / f"{table}.json"
+ if table_output_path.exists():
+ logger.info(f"Skipping {table_output_path} because it already exists")
+ continue

- assert dataclasses.is_dataclass(LoggedQuery)
+ # Some of the tables might actually be FileBackedList. Because
+ # the list is built on top of the FileBackedDict, we don't
+ # need to distinguish between the two cases.

- shared_connection = ConnectionWrapper(pathlib.Path(query_log_file))
- query_log = FileBackedList[LoggedQuery](
- shared_connection=shared_connection, tablename="stored_queries"
- )
- logger.info(f"Extracting {len(query_log)} queries from {query_log_file}")
- queries = [dataclasses.asdict(query) for query in query_log]
+ table_data: FileBackedDict[Any] = FileBackedDict(
+ shared_connection=shared_connection, tablename=table
+ )

- if output:
- with open(output, "w") as f:
- json.dump(queries, f, indent=2, default=str)
- logger.info(f"Extracted {len(queries)} queries to {output}")
- else:
- click.echo(json.dumps(queries, indent=2))
+ data = {}
+ with click.progressbar(
+ table_data.items(), length=len(table_data), label=f"Extracting {table}"
+ ) as items:
+ for k, v in items:
+ data[k] = _jsonify(v)
+
+ with open(table_output_path, "w") as f:
+ json.dump(data, f, indent=2, default=str)
+ logger.info(f"Extracted {len(data)} entries to {table_output_path}")


  @check.command()
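
Note: a minimal sketch (not part of the diff) of exercising the reworked extract_sql_agg_log command through click's test runner. The module and command objects come from the diff above; the sqlite path is illustrative and must point at an existing SqlParsingAggregator db whose name ends in ".db".

    from click.testing import CliRunner

    from datahub.cli.check_cli import extract_sql_agg_log

    runner = CliRunner()
    # The command now takes a single .db argument and writes one <table>.json
    # file per sqlite table into a sibling directory named after the db file.
    result = runner.invoke(extract_sql_agg_log, ["aggregator.db"])
    print(result.output)
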
datahub/cli/docker_cli.py CHANGED
@@ -231,7 +231,7 @@ def _docker_compose_v2() -> List[str]:
  # docker-compose v1 is not installed either.
  raise DockerComposeVersionError(
  "You don't have Docker Compose installed. Please install Docker Compose. See https://docs.docker.com/compose/install/.",
- )
+ ) from None


  def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
@@ -430,7 +430,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
  return quickstart_arch


- @docker.command() # noqa: C901
+ @docker.command()
  @click.option(
  "--version",
  type=str,
@@ -592,7 +592,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
  "arch",
  ]
  )
- def quickstart( # noqa: C901
+ def quickstart(
  version: Optional[str],
  build_locally: bool,
  pull_images: bool,
datahub/cli/iceberg_cli.py CHANGED
@@ -14,6 +14,7 @@ from datahub.cli.cli_utils import post_entity
  from datahub.configuration.common import GraphError
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.metadata.schema_classes import SystemMetadataClass
+ from datahub.telemetry import telemetry

  logger = logging.getLogger(__name__)

@@ -161,6 +162,7 @@ def validate_warehouse(data_root: str) -> None:
  type=int,
  help=f"Expiration duration for temporary credentials used for role. Defaults to {DEFAULT_CREDS_EXPIRY_DURATION_SECONDS} seconds if unspecified",
  )
+ @telemetry.with_telemetry(capture_kwargs=["duration_seconds"])
  def create(
  warehouse: str,
  description: Optional[str],
@@ -313,6 +315,7 @@ def create(
  type=int,
  help=f"Expiration duration for temporary credentials used for role. Defaults to {DEFAULT_CREDS_EXPIRY_DURATION_SECONDS} seconds if unspecified",
  )
+ @telemetry.with_telemetry(capture_kwargs=["duration_seconds"])
  def update(
  warehouse: str,
  data_root: str,
@@ -398,6 +401,7 @@ def update(


  @iceberg.command()
+ @telemetry.with_telemetry()
  def list() -> None:
  """
  List iceberg warehouses
@@ -413,6 +417,7 @@ def list() -> None:
  @click.option(
  "-w", "--warehouse", required=True, type=str, help="The name of the warehouse"
  )
+ @telemetry.with_telemetry()
  def get(warehouse: str) -> None:
  """Fetches the details of the specified iceberg warehouse"""
  client = get_default_graph()
@@ -442,6 +447,7 @@ def get(warehouse: str) -> None:
  is_flag=True,
  help="force the delete if set without confirmation",
  )
+ @telemetry.with_telemetry(capture_kwargs=["dry_run", "force"])
  def delete(warehouse: str, dry_run: bool, force: bool) -> None:
  """
  Delete warehouse
@@ -470,11 +476,19 @@ def delete(warehouse: str, dry_run: bool, force: bool) -> None:
  # Do we really need this double-check?
  if "__typename" in entity and "urn" in entity:
  if entity["__typename"] in ["Container", "Dataset"]:
+ # add the Platform Resource URN to also be deleted for each dataset.
+ # This is not user visible, so no need to show a name to the user and include it in the count. Each
+ # instance corresponds to a dataset whose name is shown.
+ if entity["__typename"] == "Dataset":
+ resource_urn = platform_resource_urn(
+ entity["properties"]["qualifiedName"]
+ )
+ urns_to_delete.append(resource_urn)
+
  urns_to_delete.append(entity["urn"])
  resource_names_to_be_deleted.append(
  entity.get("name", entity.get("urn"))
  )
- # TODO: PlatformResource associated with datasets need to be deleted.

  if dry_run:
  click.echo(
@@ -485,18 +499,21 @@ def delete(warehouse: str, dry_run: bool, force: bool) -> None:
  else:
  if not force:
  click.confirm(
- f"This will delete {warehouse} warehouse, credentials, and {len(urns_to_delete)} datasets and namespaces from DataHub. Do you want to continue?",
+ f"This will delete {warehouse} warehouse, credentials, and {len(resource_names_to_be_deleted)} datasets and namespaces from DataHub. Do you want to continue?",
  abort=True,
  )
- client.hard_delete_entity(urn)
- client.hard_delete_entity(warehouse_aspect.clientId)
- client.hard_delete_entity(warehouse_aspect.clientSecret)

+ # Delete the resources in the warehouse first, so that in case it is interrupted, the warehouse itself is
+ # still available to enumerate the resources in it that are not yet deleted.
  for urn_to_delete in urns_to_delete:
  client.hard_delete_entity(urn_to_delete)

+ client.hard_delete_entity(urn)
+ client.hard_delete_entity(warehouse_aspect.clientId)
+ client.hard_delete_entity(warehouse_aspect.clientSecret)
+
  click.echo(
- f"✅ Successfully deleted iceberg warehouse {warehouse} and associated credentials, {len(urns_to_delete)} datasets and namespaces"
+ f"✅ Successfully deleted iceberg warehouse {warehouse} and associated credentials, {len(resource_names_to_be_deleted)} datasets and namespaces"
  )


@@ -504,6 +521,10 @@ def iceberg_data_platform_instance_urn(warehouse: str) -> str:
  return f"urn:li:dataPlatformInstance:({iceberg_data_platform()},{warehouse})"


+ def platform_resource_urn(dataset_name: str) -> str:
+ return f"urn:li:platformResource:iceberg.{dataset_name}"
+
+
  def iceberg_data_platform() -> str:
  return "urn:li:dataPlatform:iceberg"

@@ -624,7 +645,7 @@ def get_all_warehouses(client: DataHubGraph) -> Iterator[str]:
  graph_query = """
  query getIcebergWarehouses($start: Int, $count: Int) {
  search(
- input: {type: DATA_PLATFORM_INSTANCE, query: "*", start: $start, count: $count}
+ input: {type: DATA_PLATFORM_INSTANCE, query: "dataPlatform:iceberg", start: $start, count: $count}
  ) {
  start
  total
@@ -677,6 +698,9 @@ def get_related_entities_for_platform_instance(
  ... on Dataset {
  urn
  name
+ properties{
+ qualifiedName
+ }
  }
  }
  }
datahub/cli/ingest_cli.py CHANGED
@@ -15,14 +15,14 @@ from tabulate import tabulate
  from datahub._version import nice_version_name
  from datahub.cli import cli_utils
  from datahub.cli.config_utils import CONDENSED_DATAHUB_CONFIG_PATH
- from datahub.configuration.common import ConfigModel, GraphError
+ from datahub.configuration.common import GraphError
  from datahub.configuration.config_loader import load_config_file
- from datahub.emitter.mce_builder import datahub_guid
  from datahub.ingestion.graph.client import get_default_graph
  from datahub.ingestion.run.connection import ConnectionManager
  from datahub.ingestion.run.pipeline import Pipeline
  from datahub.telemetry import telemetry
  from datahub.upgrade import upgrade
+ from datahub.utilities.ingest_utils import deploy_source_vars
  from datahub.utilities.perf_timer import PerfTimer

  logger = logging.getLogger(__name__)
@@ -191,23 +191,6 @@ def run(
  # don't raise SystemExit if there's no error


- def _make_ingestion_urn(name: str) -> str:
- guid = datahub_guid(
- {
- "name": name,
- }
- )
- return f"urn:li:dataHubIngestionSource:deploy-{guid}"
-
-
- class DeployOptions(ConfigModel):
- name: str
- schedule: Optional[str] = None
- time_zone: str = "UTC"
- cli_version: Optional[str] = None
- executor_id: str = "default"
-
-
  @ingest.command()
  @upgrade.check_upgrade
  @telemetry.with_telemetry()
@@ -258,6 +241,16 @@ class DeployOptions(ConfigModel):
  required=False,
  default="UTC",
  )
+ @click.option(
+ "--debug", type=bool, help="Should we debug.", required=False, default=False
+ )
+ @click.option(
+ "--extra-pip",
+ type=str,
+ help='Extra pip packages. e.g. ["memray"]',
+ required=False,
+ default=None,
+ )
  def deploy(
  name: Optional[str],
  config: str,
@@ -266,6 +259,8 @@ def deploy(
  cli_version: Optional[str],
  schedule: Optional[str],
  time_zone: str,
+ extra_pip: Optional[str],
+ debug: bool = False,
  ) -> None:
  """
  Deploy an ingestion recipe to your DataHub instance.
@@ -276,83 +271,23 @@ def deploy(

  datahub_graph = get_default_graph()

- pipeline_config = load_config_file(
- config,
- allow_stdin=True,
- allow_remote=True,
- resolve_env_vars=False,
+ variables = deploy_source_vars(
+ name=name,
+ config=config,
+ urn=urn,
+ executor_id=executor_id,
+ cli_version=cli_version,
+ schedule=schedule,
+ time_zone=time_zone,
+ extra_pip=extra_pip,
+ debug=debug,
  )

- deploy_options_raw = pipeline_config.pop("deployment", None)
- if deploy_options_raw is not None:
- deploy_options = DeployOptions.parse_obj(deploy_options_raw)
-
- if name:
- logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
- deploy_options.name = name
- else:
- if not name:
- raise click.UsageError(
- "Either --name must be set or deployment_name specified in the config"
- )
- deploy_options = DeployOptions(name=name)
-
- # Use remaining CLI args to override deploy_options
- if schedule:
- deploy_options.schedule = schedule
- if time_zone:
- deploy_options.time_zone = time_zone
- if cli_version:
- deploy_options.cli_version = cli_version
- if executor_id:
- deploy_options.executor_id = executor_id
-
- logger.info(f"Using {repr(deploy_options)}")
-
- if not urn:
- # When urn/name is not specified, we will generate a unique urn based on the deployment name.
- urn = _make_ingestion_urn(deploy_options.name)
- logger.info(f"Using recipe urn: {urn}")
-
- # Invariant - at this point, both urn and deploy_options are set.
-
- variables: dict = {
- "urn": urn,
- "name": deploy_options.name,
- "type": pipeline_config["source"]["type"],
- "recipe": json.dumps(pipeline_config),
- "executorId": deploy_options.executor_id,
- "version": deploy_options.cli_version,
- }
-
- if deploy_options.schedule is not None:
- variables["schedule"] = {
- "interval": deploy_options.schedule,
- "timezone": deploy_options.time_zone,
- }
-
  # The updateIngestionSource endpoint can actually do upserts as well.
  graphql_query: str = textwrap.dedent(
  """
- mutation updateIngestionSource(
- $urn: String!,
- $name: String!,
- $type: String!,
- $schedule: UpdateIngestionSourceScheduleInput,
- $recipe: String!,
- $executorId: String!
- $version: String) {
-
- updateIngestionSource(urn: $urn, input: {
- name: $name,
- type: $type,
- schedule: $schedule,
- config: {
- recipe: $recipe,
- executorId: $executorId,
- version: $version,
- }
- })
+ mutation updateIngestionSource($urn: String!, $input: UpdateIngestionSourceInput!) {
+ updateIngestionSource(urn: $urn, input: $input)
  }
  """
  )
@@ -372,7 +307,7 @@ def deploy(
  sys.exit(1)

  click.echo(
- f"✅ Successfully wrote data ingestion source metadata for recipe {deploy_options.name}:"
+ f"✅ Successfully wrote data ingestion source metadata for recipe {variables['input']['name']}:"
  )
  click.echo(response)

@@ -414,7 +349,9 @@ def parse_restli_response(response):


  @ingest.command()
- @click.argument("path", type=click.Path(exists=True))
+ @click.argument(
+ "path", type=click.Path(exists=False)
+ ) # exists=False since it only supports local filesystems
  def mcps(path: str) -> None:
  """
  Ingest metadata from a mcp json file or directory of files.
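
Note: a minimal sketch (assumption: deploy_source_vars keeps the keyword signature shown in the call above; the recipe path and schedule are illustrative) of how `datahub ingest deploy` now builds its GraphQL variables through the shared helper instead of assembling them inline.

    from datahub.utilities.ingest_utils import deploy_source_vars

    variables = deploy_source_vars(
        name="my-recipe",
        config="recipe.dhub.yaml",  # illustrative recipe path; must exist on disk
        urn=None,
        executor_id="default",
        cli_version=None,
        schedule="0 * * * *",
        time_zone="UTC",
        extra_pip=None,
        debug=False,
    )
    # The success message above reads the deployed name back via
    # variables["input"]["name"].
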
datahub/cli/lite_cli.py CHANGED
@@ -285,10 +285,12 @@ def search(
  ctx: click.Context,
  query: str = "",
  flavor: str = SearchFlavor.FREE_TEXT.name.lower(),
- aspect: List[str] = [],
+ aspect: Optional[List[str]] = None,
  details: bool = True,
  ) -> None:
  """Search with a free text or exact query string"""
+ if aspect is None:
+ aspect = []

  # query flavor should be sanitized by now, but we still need to convert it to a SearchFlavor
  try:
@@ -296,7 +298,7 @@
  except KeyError:
  raise click.UsageError(
  f"Failed to find a matching query flavor for {flavor}. Valid values are {[x.lower() for x in SearchFlavor._member_names_]}"
- )
+ ) from None
  catalog = _get_datahub_lite(read_only=True)
  # sanitize query
  result_ids = set()
datahub/cli/specific/dataproduct_cli.py CHANGED
@@ -49,7 +49,7 @@ def _abort_if_non_existent_urn(graph: DataHubGraph, urn: str, operation: str) ->
  entity_type = parsed_urn.get_type()
  except Exception:
  click.secho(f"Provided urn {urn} does not seem valid", fg="red")
- raise click.Abort()
+ raise click.Abort() from None
  else:
  if not graph.exists(urn):
  click.secho(
datahub/cli/specific/dataset_cli.py CHANGED
@@ -1,12 +1,15 @@
+ import filecmp
  import json
  import logging
+ import os
+ import shutil
  from pathlib import Path
- from typing import Set, Tuple
+ from typing import List, Set, Tuple

  import click
  from click_default_group import DefaultGroup

- from datahub.api.entities.dataset.dataset import Dataset
+ from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
@@ -30,18 +33,9 @@ def dataset() -> None:
  @telemetry.with_telemetry()
  def upsert(file: Path) -> None:
  """Upsert attributes to a Dataset in DataHub."""
-
- with get_default_graph() as graph:
- for dataset in Dataset.from_yaml(str(file)):
- try:
- for mcp in dataset.generate_mcp():
- graph.emit(mcp)
- click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
- except Exception as e:
- click.secho(
- f"Update failed for id {id}. due to {e}",
- fg="red",
- )
+ # Call the sync command with to_datahub=True to perform the upsert operation
+ ctx = click.get_current_context()
+ ctx.invoke(sync, file=str(file), to_datahub=True)


  @dataset.command(
@@ -111,3 +105,123 @@ def _get_existing_siblings(graph: DataHubGraph, urn: str) -> Set[str]:
  return set(existing.siblings)
  else:
  return set()
+
+
+ @dataset.command(
+ name="file",
+ )
+ @click.option("--lintCheck", required=False, is_flag=True)
+ @click.option("--lintFix", required=False, is_flag=True)
+ @click.argument("file", type=click.Path(exists=True))
+ @upgrade.check_upgrade
+ @telemetry.with_telemetry()
+ def file(lintcheck: bool, lintfix: bool, file: str) -> None:
+ """Operate on a Dataset file"""
+
+ if lintcheck or lintfix:
+ import tempfile
+ from pathlib import Path
+
+ # Create a temporary file in a secure way
+ # The file will be automatically deleted when the context manager exits
+ with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as temp:
+ temp_path = Path(temp.name)
+ try:
+ # Copy content to the temporary file
+ shutil.copyfile(file, temp_path)
+
+ # Run the linting
+ datasets = Dataset.from_yaml(temp.name)
+ for dataset in datasets:
+ dataset.to_yaml(temp_path)
+
+ # Compare the files
+ files_match = filecmp.cmp(file, temp_path)
+
+ if files_match:
+ click.secho("No differences found", fg="green")
+ else:
+ # Show diff for visibility
+ os.system(f"diff {file} {temp_path}")
+
+ if lintfix:
+ shutil.copyfile(temp_path, file)
+ click.secho(f"Fixed linting issues in {file}", fg="green")
+ else:
+ click.secho(
+ f"To fix these differences, run 'datahub dataset file --lintFix {file}'",
+ fg="yellow",
+ )
+ finally:
+ # Ensure the temporary file is removed
+ if temp_path.exists():
+ temp_path.unlink()
+ else:
+ click.secho(
+ "No operation specified. Choose from --lintCheck or --lintFix", fg="yellow"
+ )
+
+
+ @dataset.command(
+ name="sync",
+ )
+ @click.option("-f", "--file", required=True, type=click.Path(exists=True))
+ @click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+ @upgrade.check_upgrade
+ @telemetry.with_telemetry()
+ def sync(file: str, to_datahub: bool) -> None:
+ """Sync a Dataset file to/from DataHub"""
+
+ failures: List[str] = []
+ with get_default_graph() as graph:
+ datasets = Dataset.from_yaml(file)
+ for dataset in datasets:
+ assert (
+ dataset.urn is not None
+ ) # Validator should have ensured this is filled. Tell mypy it's not None
+ if to_datahub:
+ missing_entity_references = [
+ entity_reference
+ for entity_reference in dataset.entity_references()
+ if not graph.exists(entity_reference)
+ ]
+ if missing_entity_references:
+ click.secho(
+ "\n\t- ".join(
+ [
+ f"Skipping Dataset {dataset.urn} due to missing entity references: "
+ ]
+ + missing_entity_references
+ ),
+ fg="red",
+ )
+ failures.append(dataset.urn)
+ continue
+ try:
+ for mcp in dataset.generate_mcp():
+ graph.emit(mcp)
+ click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+ except Exception as e:
+ click.secho(
+ f"Update failed for id {id}. due to {e}",
+ fg="red",
+ )
+ else:
+ # Sync from DataHub
+ if graph.exists(dataset.urn):
+ dataset_get_config = DatasetRetrievalConfig()
+ if dataset.downstreams:
+ dataset_get_config.include_downstreams = True
+ existing_dataset: Dataset = Dataset.from_datahub(
+ graph=graph, urn=dataset.urn, config=dataset_get_config
+ )
+ existing_dataset.to_yaml(Path(file))
+ else:
+ click.secho(f"Dataset {dataset.urn} does not exist")
+ failures.append(dataset.urn)
+ if failures:
+ click.secho(
+ f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+ fg="red",
+ )
+ raise click.Abort()
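
Note: a minimal sketch (not part of the diff) of driving the new dataset sync command, which the slimmed-down upsert now delegates to. The group and command names come from the diff above; the YAML path is illustrative, and a reachable DataHub instance is assumed.

    from click.testing import CliRunner

    from datahub.cli.specific.dataset_cli import dataset

    runner = CliRunner()
    # Push the local YAML definition to DataHub ...
    runner.invoke(dataset, ["sync", "-f", "dataset.yaml", "--to-datahub"])
    # ... or pull the live state from DataHub back into the file.
    runner.invoke(dataset, ["sync", "-f", "dataset.yaml", "--from-datahub"])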