acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/METADATA +2487 -2487
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/RECORD +88 -84
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/git.py +1 -3
- datahub/ingestion/glossary/classification_mixin.py +1 -1
- datahub/ingestion/graph/client.py +16 -12
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/file.py +5 -2
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/ge_data_profiler.py +11 -14
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
- datahub/ingestion/source/identity/okta.py +1 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +2 -1
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +6 -3
- datahub/ingestion/source/openapi_parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +2 -1
- datahub/ingestion/source/s3/config.py +2 -4
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +2 -2
- datahub/ingestion/source/sql/sql_common.py +2 -2
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +4 -2
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/superset.py +218 -56
- datahub/ingestion/source/tableau/tableau.py +1 -5
- datahub/lite/duckdb_lite.py +3 -9
- datahub/metadata/_schema_classes.py +157 -14
- datahub/metadata/_urns/urn_defs.py +58 -58
- datahub/metadata/schema.avsc +23 -10
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_shared.py +88 -3
- datahub/sdk/container.py +7 -1
- datahub/sdk/dataset.py +10 -4
- datahub/sdk/{_entity.py → entity.py} +4 -0
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +7 -1
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/top_level.txt +0 -0

datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Iterable, List, Optional, Union

 import yaml
 from pydantic import validator
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):


 class AllowedValue(ConfigModel):
-    value: str
+    value: Union[int, float, str]
     description: Optional[str] = None

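Illustrative sketch (not part of the diff): with value widened to Union[int, float, str], an allowed-value entry for a structured property can now carry numeric values as well as strings. The import path follows the file list above; keyword construction is the usual pydantic ConfigModel behaviour.

from datahub.api.entities.structuredproperties.structuredproperties import AllowedValue

allowed = [
    AllowedValue(value="gold", description="Highest quality tier"),
    AllowedValue(value=99.9, description="Target availability (percent)"),
]
for entry in allowed:
    print(entry.value, "-", entry.description)
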
datahub/cli/specific/dataset_cli.py
CHANGED
@@ -1,12 +1,15 @@
+import filecmp
 import json
 import logging
+import os
+import shutil
 from pathlib import Path
-from typing import Set, Tuple
+from typing import List, Set, Tuple

 import click
 from click_default_group import DefaultGroup

-from datahub.api.entities.dataset.dataset import Dataset
+from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
@@ -30,18 +33,9 @@ def dataset() -> None:
 @telemetry.with_telemetry()
 def upsert(file: Path) -> None:
     """Upsert attributes to a Dataset in DataHub."""
-
-
-
-    try:
-        for mcp in dataset.generate_mcp():
-            graph.emit(mcp)
-        click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
-    except Exception as e:
-        click.secho(
-            f"Update failed for id {id}. due to {e}",
-            fg="red",
-        )
+    # Call the sync command with to_datahub=True to perform the upsert operation
+    ctx = click.get_current_context()
+    ctx.invoke(sync, file=str(file), to_datahub=True)


 @dataset.command(
@@ -111,3 +105,123 @@ def _get_existing_siblings(graph: DataHubGraph, urn: str) -> Set[str]:
         return set(existing.siblings)
     else:
         return set()
+
+
+@dataset.command(
+    name="file",
+)
+@click.option("--lintCheck", required=False, is_flag=True)
+@click.option("--lintFix", required=False, is_flag=True)
+@click.argument("file", type=click.Path(exists=True))
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def file(lintcheck: bool, lintfix: bool, file: str) -> None:
+    """Operate on a Dataset file"""
+
+    if lintcheck or lintfix:
+        import tempfile
+        from pathlib import Path
+
+        # Create a temporary file in a secure way
+        # The file will be automatically deleted when the context manager exits
+        with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as temp:
+            temp_path = Path(temp.name)
+            try:
+                # Copy content to the temporary file
+                shutil.copyfile(file, temp_path)
+
+                # Run the linting
+                datasets = Dataset.from_yaml(temp.name)
+                for dataset in datasets:
+                    dataset.to_yaml(temp_path)
+
+                # Compare the files
+                files_match = filecmp.cmp(file, temp_path)
+
+                if files_match:
+                    click.secho("No differences found", fg="green")
+                else:
+                    # Show diff for visibility
+                    os.system(f"diff {file} {temp_path}")
+
+                    if lintfix:
+                        shutil.copyfile(temp_path, file)
+                        click.secho(f"Fixed linting issues in {file}", fg="green")
+                    else:
+                        click.secho(
+                            f"To fix these differences, run 'datahub dataset file --lintFix {file}'",
+                            fg="yellow",
+                        )
+            finally:
+                # Ensure the temporary file is removed
+                if temp_path.exists():
+                    temp_path.unlink()
+    else:
+        click.secho(
+            "No operation specified. Choose from --lintCheck or --lintFix", fg="yellow"
+        )
+
+
+@dataset.command(
+    name="sync",
+)
+@click.option("-f", "--file", required=True, type=click.Path(exists=True))
+@click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def sync(file: str, to_datahub: bool) -> None:
+    """Sync a Dataset file to/from DataHub"""
+
+    failures: List[str] = []
+    with get_default_graph() as graph:
+        datasets = Dataset.from_yaml(file)
+        for dataset in datasets:
+            assert (
+                dataset.urn is not None
+            )  # Validator should have ensured this is filled. Tell mypy it's not None
+            if to_datahub:
+                missing_entity_references = [
+                    entity_reference
+                    for entity_reference in dataset.entity_references()
+                    if not graph.exists(entity_reference)
+                ]
+                if missing_entity_references:
+                    click.secho(
+                        "\n\t- ".join(
+                            [
+                                f"Skipping Dataset {dataset.urn} due to missing entity references: "
+                            ]
+                            + missing_entity_references
+                        ),
+                        fg="red",
+                    )
+                    failures.append(dataset.urn)
+                    continue
+                try:
+                    for mcp in dataset.generate_mcp():
+                        graph.emit(mcp)
+                    click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+                except Exception as e:
+                    click.secho(
+                        f"Update failed for id {id}. due to {e}",
+                        fg="red",
+                    )
+            else:
+                # Sync from DataHub
+                if graph.exists(dataset.urn):
+                    dataset_get_config = DatasetRetrievalConfig()
+                    if dataset.downstreams:
+                        dataset_get_config.include_downstreams = True
+                    existing_dataset: Dataset = Dataset.from_datahub(
+                        graph=graph, urn=dataset.urn, config=dataset_get_config
+                    )
+                    existing_dataset.to_yaml(Path(file))
+                else:
+                    click.secho(f"Dataset {dataset.urn} does not exist")
+                    failures.append(dataset.urn)
+    if failures:
+        click.secho(
+            f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+            fg="red",
+        )
+        raise click.Abort()
datahub/configuration/git.py
CHANGED
@@ -43,9 +43,7 @@ class GitReference(ConfigModel):

     @validator("repo", pre=True)
     def simplify_repo_url(cls, repo: str) -> str:
-        if repo.startswith("github.com/"):
-            repo = f"https://{repo}"
-        elif repo.startswith("gitlab.com"):
+        if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
             repo = f"https://{repo}"
         elif repo.count("/") == 1:
             repo = f"https://github.com/{repo}"
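For clarity, a standalone sketch of the consolidated branch above. The hunk does not show the rest of the validator, so the final return is an assumption of this sketch, not confirmed by the diff.

def simplify_repo_url(repo: str) -> str:
    # github.com/... and gitlab.com... shorthands both get an https:// prefix now.
    if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
        repo = f"https://{repo}"
    # A bare "org/repo" is assumed to live on GitHub.
    elif repo.count("/") == 1:
        repo = f"https://github.com/{repo}"
    return repo  # assumed fall-through behaviour

assert simplify_repo_url("github.com/datahub-project/datahub") == "https://github.com/datahub-project/datahub"
assert simplify_repo_url("gitlab.com/gitlab-org/gitlab") == "https://gitlab.com/gitlab-org/gitlab"
assert simplify_repo_url("datahub-project/datahub") == "https://github.com/datahub-project/datahub"
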
datahub/ingestion/graph/client.py
CHANGED
@@ -16,6 +16,7 @@ from typing import (
     List,
     Literal,
     Optional,
+    Sequence,
     Tuple,
     Type,
     Union,
@@ -42,8 +43,8 @@ from datahub.ingestion.graph.connections import (
 )
 from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
 from datahub.ingestion.graph.filters import (
+    RawSearchFilterRule,
     RemovedStatusFilter,
-    SearchFilterRule,
     generate_filter,
 )
 from datahub.ingestion.source.state.checkpoint import Checkpoint
@@ -105,7 +106,7 @@ class RelatedEntity:
     via: Optional[str] = None


-def
+def entity_type_to_graphql(entity_type: str) -> str:
     """Convert the entity types into GraphQL "EntityType" enum values."""

     # Hard-coded special cases.
@@ -330,7 +331,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         aspect_type_name: Optional[str] = None,
         version: int = 0,
     ) -> Optional[Aspect]:
-        assert aspect_type.ASPECT_NAME
+        assert aspect == aspect_type.ASPECT_NAME
         return self.get_aspect(
             entity_urn=entity_urn,
             aspect_type=aspect_type,
@@ -797,13 +798,13 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 100,
-        extraFilters: Optional[List[
+        extraFilters: Optional[List[RawSearchFilterRule]] = None,
     ) -> Iterable[Tuple[str, "GraphQLSchemaMetadata"]]:
         """Fetch schema info for datasets that match all of the given filters.

         :return: An iterable of (urn, schema info) tuple that match the filters.
         """
-        types = [
+        types = [entity_type_to_graphql("dataset")]

         # Add the query default of * if no query is specified.
         query = query or "*"
@@ -865,7 +866,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_urns_by_filter(
         self,
         *,
-        entity_types: Optional[
+        entity_types: Optional[Sequence[str]] = None,
         platform: Optional[str] = None,
         platform_instance: Optional[str] = None,
         env: Optional[str] = None,
@@ -873,8 +874,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
-        extraFilters: Optional[List[
-        extra_or_filters: Optional[List[Dict[str, List[
+        extraFilters: Optional[List[RawSearchFilterRule]] = None,
+        extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.

@@ -965,8 +966,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
-        extra_and_filters: Optional[List[
-        extra_or_filters: Optional[List[Dict[str, List[
+        extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
+        extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
         extra_source_fields: Optional[List[str]] = None,
         skip_cache: bool = False,
     ) -> Iterable[dict]:
@@ -1109,7 +1110,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 f"Scrolling to next scrollAcrossEntities page: {scroll_id}"
             )

-
+    @classmethod
+    def _get_types(cls, entity_types: Optional[Sequence[str]]) -> Optional[List[str]]:
         types: Optional[List[str]] = None
         if entity_types is not None:
             if not entity_types:
@@ -1117,7 +1119,9 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                     "entity_types cannot be an empty list; use None for all entities"
                 )

-            types = [
+            types = [
+                entity_type_to_graphql(entity_type) for entity_type in entity_types
+            ]
         return types

     def get_latest_pipeline_checkpoint(
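Illustrative sketch (not part of the diff) of the typed signatures above: extraFilters takes raw rule dicts, and entity_types now accepts any sequence of entity names. The tag field name below is only an example, and a reachable DataHub instance is assumed.

from datahub.ingestion.graph.client import get_default_graph

with get_default_graph() as graph:
    urns = graph.get_urns_by_filter(
        entity_types=("dataset",),  # any Sequence[str] is accepted now
        platform="snowflake",
        extraFilters=[
            # Raw filter rule dict; the field name here is illustrative only.
            {"field": "tags", "condition": "EQUAL", "values": ["urn:li:tag:PII"]}
        ],
    )
    for urn in urns:
        print(urn)
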
datahub/ingestion/graph/filters.py
CHANGED
@@ -1,3 +1,4 @@
+import dataclasses
 import enum
 from typing import Any, Dict, List, Optional

@@ -7,7 +8,31 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.utilities.urns.urn import guess_entity_type

-
+RawSearchFilterRule = Dict[str, Any]
+
+
+@dataclasses.dataclass
+class SearchFilterRule:
+    field: str
+    condition: str  # TODO: convert to an enum
+    values: List[str]
+    negated: bool = False
+
+    def to_raw(self) -> RawSearchFilterRule:
+        return {
+            "field": self.field,
+            "condition": self.condition,
+            "values": self.values,
+            "negated": self.negated,
+        }
+
+    def negate(self) -> "SearchFilterRule":
+        return SearchFilterRule(
+            field=self.field,
+            condition=self.condition,
+            values=self.values,
+            negated=not self.negated,
+        )


 class RemovedStatusFilter(enum.Enum):
@@ -29,9 +54,9 @@ def generate_filter(
     env: Optional[str],
     container: Optional[str],
     status: RemovedStatusFilter,
-    extra_filters: Optional[List[
-    extra_or_filters: Optional[List[
-) -> List[Dict[str, List[
+    extra_filters: Optional[List[RawSearchFilterRule]],
+    extra_or_filters: Optional[List[RawSearchFilterRule]] = None,
+) -> List[Dict[str, List[RawSearchFilterRule]]]:
     """
     Generate a search filter based on the provided parameters.
     :param platform: The platform to filter by.
@@ -43,30 +68,32 @@ def generate_filter(
     :param extra_or_filters: Extra OR filters to apply. These are combined with
     the AND filters using an OR at the top level.
     """
-    and_filters: List[
+    and_filters: List[RawSearchFilterRule] = []

     # Platform filter.
     if platform:
-        and_filters.append(_get_platform_filter(platform))
+        and_filters.append(_get_platform_filter(platform).to_raw())

     # Platform instance filter.
     if platform_instance:
-        and_filters.append(
+        and_filters.append(
+            _get_platform_instance_filter(platform, platform_instance).to_raw()
+        )

     # Browse path v2 filter.
     if container:
-        and_filters.append(_get_container_filter(container))
+        and_filters.append(_get_container_filter(container).to_raw())

     # Status filter.
     status_filter = _get_status_filter(status)
     if status_filter:
-        and_filters.append(status_filter)
+        and_filters.append(status_filter.to_raw())

     # Extra filters.
     if extra_filters:
         and_filters += extra_filters

-    or_filters: List[Dict[str, List[
+    or_filters: List[Dict[str, List[RawSearchFilterRule]]] = [{"and": and_filters}]

     # Env filter
     if env:
@@ -89,7 +116,7 @@ def generate_filter(
     return or_filters


-def _get_env_filters(env: str) -> List[
+def _get_env_filters(env: str) -> List[RawSearchFilterRule]:
     # The env filter is a bit more tricky since it's not always stored
     # in the same place in ElasticSearch.
     return [
@@ -125,19 +152,19 @@ def _get_status_filter(status: RemovedStatusFilter) -> Optional[SearchFilterRule
         # removed field is simply not present in the ElasticSearch document. Ideally this
         # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to
         # use a negated filter.
-        return
-
-
-
-
-
+        return SearchFilterRule(
+            field="removed",
+            values=["true"],
+            condition="EQUAL",
+            negated=True,
+        )

     elif status == RemovedStatusFilter.ONLY_SOFT_DELETED:
-        return
-
-
-
-
+        return SearchFilterRule(
+            field="removed",
+            values=["true"],
+            condition="EQUAL",
+        )

     elif status == RemovedStatusFilter.ALL:
         # We don't need to add a filter for this case.
@@ -152,11 +179,11 @@ def _get_container_filter(container: str) -> SearchFilterRule:
     if guess_entity_type(container) != "container":
         raise ValueError(f"Invalid container urn: {container}")

-    return
-
-
-
-
+    return SearchFilterRule(
+        field="browsePathV2",
+        values=[container],
+        condition="CONTAIN",
+    )


 def _get_platform_instance_filter(
@@ -171,16 +198,16 @@ def _get_platform_instance_filter(
     if guess_entity_type(platform_instance) != "dataPlatformInstance":
         raise ValueError(f"Invalid data platform instance urn: {platform_instance}")

-    return
-
-
-
-
+    return SearchFilterRule(
+        field="platformInstance",
+        condition="EQUAL",
+        values=[platform_instance],
+    )


 def _get_platform_filter(platform: str) -> SearchFilterRule:
-    return
-
-
-
-
+    return SearchFilterRule(
+        field="platform.keyword",
+        condition="EQUAL",
+        values=[make_data_platform_urn(platform)],
+    )
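A minimal sketch (not part of the diff) of the new dataclass: to_raw() produces the dict shape that generate_filter assembles into the top-level or/and structure, and negate() returns a copy with the flag flipped.

from datahub.ingestion.graph.filters import SearchFilterRule

rule = SearchFilterRule(
    field="platform.keyword",
    condition="EQUAL",
    values=["urn:li:dataPlatform:snowflake"],
)

print(rule.to_raw())
# -> {'field': 'platform.keyword', 'condition': 'EQUAL',
#     'values': ['urn:li:dataPlatform:snowflake'], 'negated': False}

print(rule.negate().to_raw()["negated"])  # -> True; the original rule is unchanged
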
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
CHANGED
@@ -163,12 +163,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
                 key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
                 for key, value in obj.items()
             }
-        elif isinstance(obj, list):
-            return [
-                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
-                for element in obj
-            ]
-        elif isinstance(obj, set):
+        elif isinstance(obj, list) or isinstance(obj, set):
             return [
                 DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
                 for element in obj
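A standalone sketch (not the class above) of the same conversion pattern, which exists so that nested report structures containing sets can be JSON-serialized:

import json
from typing import Any

def convert_sets_to_lists(obj: Any) -> Any:
    # Recursively turn sets into lists; dicts and lists are walked, everything else passes through.
    if isinstance(obj, dict):
        return {key: convert_sets_to_lists(value) for key, value in obj.items()}
    elif isinstance(obj, list) or isinstance(obj, set):
        return [convert_sets_to_lists(element) for element in obj]
    return obj

print(json.dumps(convert_sets_to_lists({"platforms": {"s3", "snowflake"}})))
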
@@ -144,10 +144,8 @@ class DataLakeSourceConfig(
         return path_specs

     @pydantic.validator("platform", always=True)
-    def platform_not_empty(cls, platform:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_not_empty(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
datahub/ingestion/source/bigquery_v2/bigquery_audit.py
CHANGED
@@ -165,7 +165,7 @@ class BigQueryTableRef:
     @classmethod
     def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
         for key in ["projectId", "datasetId", "tableId"]:
-            if key not in spec
+            if key not in spec:
                 raise ValueError(f"invalid BigQuery table reference dict: {spec}")

         return cls(
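Hypothetical usage sketch of the validation loop above; the spec keys come from the hunk, while the project, dataset and table names are made up.

from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigQueryTableRef

ref = BigQueryTableRef.from_spec_obj(
    {"projectId": "my-project", "datasetId": "analytics", "tableId": "events"}
)
print(ref)

try:
    # Any of the three keys missing triggers the ValueError in the loop above.
    BigQueryTableRef.from_spec_obj({"projectId": "my-project"})
except ValueError as e:
    print(e)
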
datahub/ingestion/source/bigquery_v2/bigquery_schema.py
CHANGED
@@ -344,7 +344,7 @@ class BigQuerySchemaApi:
         with_partitions: bool = False,
     ) -> Iterator[BigqueryTable]:
         with PerfTimer() as current_timer:
-            filter_clause: str = ", ".join(f"'{table}'" for table in tables
+            filter_clause: str = ", ".join(f"'{table}'" for table in tables)

             if with_partitions:
                 query_template = BigqueryQuery.tables_for_dataset
@@ -59,9 +59,9 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.sdk._entity import Entity
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity

 logger = logging.getLogger(__name__)

datahub/ingestion/source/csv_enricher.py
CHANGED
@@ -314,7 +314,7 @@ class CSVEnricherSource(Source):
             "datajob": EditableDataJobPropertiesClass,
             "dataflow": EditableDataFlowPropertiesClass,
             "notebook": EditableNotebookPropertiesClass,
-        }.get(entityType
+        }.get(entityType)

         if not entityClass:
             raise ValueError(
datahub/ingestion/source/dbt/dbt_common.py
CHANGED
@@ -1033,7 +1033,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map
+        for dbt_name in all_nodes_map:
             if self._is_allowed_node(dbt_name):
                 add_node_to_cll_list(dbt_name)

datahub/ingestion/source/file.py
CHANGED
@@ -410,10 +410,13 @@ def _from_obj_for_file(
         item = MetadataChangeEvent.from_obj(obj)
     elif "aspect" in obj:
         item = MetadataChangeProposalWrapper.from_obj(obj)
-
+    elif "bucket" in obj:
         item = UsageAggregationClass.from_obj(obj)
+    else:
+        raise ValueError(f"Unknown object type: {obj}")
+
     if not item.validate():
-        raise ValueError(f"
+        raise ValueError(f"Failed to parse: {obj}")

     if isinstance(item, UsageAggregationClass):
         logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
datahub/ingestion/source/gc/dataprocess_cleanup.py
CHANGED
@@ -498,7 +498,7 @@ class DataProcessCleanup:
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
-            for key in dataFlows
+            for key in dataFlows:
                 if not dataJobs.get(key) or len(dataJobs[key]) == 0:
                     logger.info(
                         f"Deleting dataflow {key} because there are not datajobs"
datahub/ingestion/source/ge_data_profiler.py
CHANGED
@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
             ).select_from(self._table)
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
-    elif
-
-
-
-        )
-        )
-        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == SNOWFLAKE:
+    elif (
+        self.engine.dialect.name.lower() == BIGQUERY
+        or self.engine.dialect.name.lower() == SNOWFLAKE
+    ):
         element_values = self.engine.execute(
             sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
                 self._table
@@ -381,13 +377,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             col = col_dict["name"]
             self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
-            if
-
+            if (
+                not self.config._allow_deny_patterns.allowed(
+                    f"{self.dataset_name}.{col}"
+                )
+                or not self.config.profile_nested_fields
+                and "." in col
             ):
                 ignored_columns_by_pattern.append(col)
-            # We try to ignore nested columns as well
-            elif not self.config.profile_nested_fields and "." in col:
-                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -1408,7 +1405,7 @@ class DatahubGEProfiler:
             },
         )

-        if platform
+        if platform in (BIGQUERY, DATABRICKS):
             # This is done as GE makes the name as DATASET.TABLE
             # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
             name_parts = pretty_name.split(".")
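Not part of the diff: a small check that the merged condition in the column-filter hunk above is equivalent to the two branches it replaced, relying on the fact that "and" binds tighter than "or" in Python.

import itertools

def old_style(allowed: bool, profile_nested_fields: bool, col: str) -> bool:
    # The two separate branches before the change.
    if not allowed:
        return True
    elif not profile_nested_fields and "." in col:
        return True
    return False

def new_style(allowed: bool, profile_nested_fields: bool, col: str) -> bool:
    # The merged condition after the change: `and` binds tighter than `or`.
    return not allowed or not profile_nested_fields and "." in col

for allowed, nested in itertools.product([True, False], repeat=2):
    for col in ["plain", "nested.field"]:
        assert old_style(allowed, nested, col) == new_style(allowed, nested, col)
print("merged condition matches the old branching")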