acryl-datahub 1.2.0.10rc3__py3-none-any.whl → 1.2.0.10rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (94)
  1. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/METADATA +2513 -2571
  2. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/RECORD +94 -87
  3. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  8. datahub/api/entities/dataset/dataset.py +9 -18
  9. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  10. datahub/api/graphql/operation.py +10 -6
  11. datahub/cli/docker_check.py +2 -2
  12. datahub/configuration/common.py +29 -1
  13. datahub/configuration/connection_resolver.py +5 -2
  14. datahub/configuration/import_resolver.py +7 -4
  15. datahub/configuration/pydantic_migration_helpers.py +0 -9
  16. datahub/configuration/source_common.py +3 -2
  17. datahub/configuration/validate_field_deprecation.py +5 -2
  18. datahub/configuration/validate_field_removal.py +5 -2
  19. datahub/configuration/validate_field_rename.py +6 -5
  20. datahub/configuration/validate_multiline_string.py +5 -2
  21. datahub/ingestion/autogenerated/capability_summary.json +33 -1
  22. datahub/ingestion/run/pipeline_config.py +2 -2
  23. datahub/ingestion/source/azure/azure_common.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  25. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  26. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  27. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  28. datahub/ingestion/source/datahub/config.py +8 -9
  29. datahub/ingestion/source/delta_lake/config.py +1 -1
  30. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  31. datahub/ingestion/source/feast.py +8 -10
  32. datahub/ingestion/source/fivetran/config.py +1 -1
  33. datahub/ingestion/source/ge_profiling_config.py +26 -22
  34. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  35. datahub/ingestion/source/grafana/models.py +12 -14
  36. datahub/ingestion/source/hex/hex.py +6 -1
  37. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  38. datahub/ingestion/source/kafka_connect/common.py +2 -2
  39. datahub/ingestion/source/looker/looker_common.py +1 -1
  40. datahub/ingestion/source/looker/looker_config.py +15 -4
  41. datahub/ingestion/source/looker/looker_source.py +52 -3
  42. datahub/ingestion/source/looker/lookml_config.py +1 -1
  43. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  44. datahub/ingestion/source/metadata/lineage.py +1 -1
  45. datahub/ingestion/source/mode.py +13 -5
  46. datahub/ingestion/source/nifi.py +1 -1
  47. datahub/ingestion/source/powerbi/config.py +14 -21
  48. datahub/ingestion/source/preset.py +1 -1
  49. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  50. datahub/ingestion/source/redshift/config.py +6 -3
  51. datahub/ingestion/source/salesforce.py +13 -9
  52. datahub/ingestion/source/schema/json_schema.py +14 -14
  53. datahub/ingestion/source/sigma/data_classes.py +3 -0
  54. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  55. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  56. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  57. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  58. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  59. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  60. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  61. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  62. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  63. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  64. datahub/ingestion/source/sql/athena.py +2 -1
  65. datahub/ingestion/source/sql/clickhouse.py +12 -7
  66. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  67. datahub/ingestion/source/sql/druid.py +2 -2
  68. datahub/ingestion/source/sql/hive.py +4 -3
  69. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  70. datahub/ingestion/source/sql/mssql/source.py +2 -2
  71. datahub/ingestion/source/sql/mysql.py +2 -2
  72. datahub/ingestion/source/sql/oracle.py +3 -3
  73. datahub/ingestion/source/sql/presto.py +2 -1
  74. datahub/ingestion/source/sql/teradata.py +4 -4
  75. datahub/ingestion/source/sql/trino.py +2 -1
  76. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  77. datahub/ingestion/source/sql/vertica.py +1 -1
  78. datahub/ingestion/source/sql_queries.py +6 -6
  79. datahub/ingestion/source/state/checkpoint.py +5 -1
  80. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  81. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  82. datahub/ingestion/source/superset.py +1 -2
  83. datahub/ingestion/source/tableau/tableau.py +20 -6
  84. datahub/ingestion/source/unity/config.py +7 -3
  85. datahub/ingestion/source/usage/usage_common.py +3 -3
  86. datahub/ingestion/source_config/pulsar.py +3 -1
  87. datahub/ingestion/transformer/set_browse_path.py +112 -0
  88. datahub/sdk/_shared.py +126 -0
  89. datahub/sdk/chart.py +87 -30
  90. datahub/sdk/dashboard.py +79 -32
  91. datahub/sdk/search_filters.py +1 -7
  92. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/WHEEL +0 -0
  93. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/licenses/LICENSE +0 -0
  94. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc5.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataset/dataset.py

```diff
@@ -27,7 +27,7 @@ from typing_extensions import TypeAlias
 import datahub.metadata.schema_classes as models
 from datahub.api.entities.structuredproperties.structuredproperties import AllowedTypes
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, LaxStr
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataset_urn,
@@ -143,7 +143,6 @@ class SchemaFieldSpecification(StrictModel):
     jsonPath: Union[None, str] = None
     nullable: bool = False
     description: Union[None, str] = None
-    doc: Union[None, str] = None  # doc is an alias for description
     label: Optional[str] = None
     created: Optional[dict] = None
     lastModified: Optional[dict] = None
@@ -221,14 +220,14 @@ class SchemaFieldSpecification(StrictModel):
         return v

     @root_validator(pre=True)
-    def sync_description_and_doc(cls, values: Dict) -> Dict:
-        """Synchronize doc and description fields if one is provided but not the other."""
+    def sync_doc_into_description(cls, values: Dict) -> Dict:
+        """Synchronize doc into description field if doc is provided."""
         description = values.get("description")
-        doc = values.get("doc")
+        doc = values.pop("doc", None)

-        if description is not None and doc is None:
-            values["doc"] = description
-        elif doc is not None and description is None:
+        if doc is not None:
+            if description is not None:
+                raise ValueError("doc and description cannot both be provided")
             values["description"] = doc

         return values
@@ -296,10 +295,6 @@ class SchemaFieldSpecification(StrictModel):
         """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
         exclude = kwargs.pop("exclude", None) or set()

-        # If description and doc are identical, exclude doc from the output
-        if self.description == self.doc and self.description is not None:
-            exclude.add("doc")
-
         # if nativeDataType and type are identical, exclude nativeDataType from the output
         if self.nativeDataType == self.type and self.nativeDataType is not None:
             exclude.add("nativeDataType")
@@ -327,10 +322,6 @@ class SchemaFieldSpecification(StrictModel):
         """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
         exclude = kwargs.pop("exclude", None) or set()

-        # If description and doc are identical, exclude doc from the output
-        if self.description == self.doc and self.description is not None:
-            exclude.add("doc")
-
         # if nativeDataType and type are identical, exclude nativeDataType from the output
         if self.nativeDataType == self.type and self.nativeDataType is not None:
             exclude.add("nativeDataType")
@@ -387,7 +378,7 @@ class Dataset(StrictModel):
     name: Optional[str] = Field(None, validate_default=True)
     schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
     downstreams: Optional[List[str]] = None
-    properties: Optional[Dict[str, str]] = None
+    properties: Optional[Dict[str, LaxStr]] = None
     subtype: Optional[str] = None
     subtypes: Optional[List[str]] = None
     tags: Optional[List[str]] = None
@@ -605,7 +596,7 @@ class Dataset(StrictModel):
                 ],
                 platformSchema=OtherSchemaClass(
                     rawSchema=yaml.dump(
-                        self.schema_metadata.dict(
+                        self.schema_metadata.model_dump(
                             exclude_none=True, exclude_unset=True
                         )
                     )
```
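
In the dataset YAML spec, `doc` is no longer stored as a separate alias field: a pre root validator folds `doc` into `description` at parse time and rejects inputs that set both. A minimal sketch of the new behavior on a stand-in pydantic model (not the real SchemaFieldSpecification class):

```python
# Minimal sketch of the new doc/description handling; "FieldSpec" is a stand-in.
from typing import Dict, Optional

import pydantic


class FieldSpec(pydantic.BaseModel):
    id: str
    description: Optional[str] = None

    @pydantic.root_validator(pre=True)
    def sync_doc_into_description(cls, values: Dict) -> Dict:
        # "doc" is input-only sugar: it is folded into description
        # and never kept as a separate field.
        doc = values.pop("doc", None)
        if doc is not None:
            if values.get("description") is not None:
                raise ValueError("doc and description cannot both be provided")
            values["description"] = doc
        return values


print(FieldSpec(id="user_id", doc="Primary key").description)  # -> "Primary key"
# FieldSpec(id="user_id", doc="x", description="y") now raises a validation error.
```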
datahub/api/entities/structuredproperties/structuredproperties.py

```diff
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import Iterable, List, Optional, Type, Union

 import yaml
 from pydantic import Field, StrictStr, validator
@@ -48,7 +48,7 @@ VALID_ENTITY_TYPE_URNS = [
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."


-def _validate_entity_type_urn(v: str) -> str:
+def _validate_entity_type_urn(cls: Type, v: str) -> str:
     urn = Urn.make_entity_type_urn(v)
     if urn not in VALID_ENTITY_TYPE_URNS:
         raise ValueError(
```
datahub/api/graphql/operation.py

```diff
@@ -1,7 +1,7 @@
 import logging
 from typing import Any, Dict, List, Optional

-from gql import gql
+from gql import GraphQLRequest

 from datahub.api.graphql.base import BaseApi

@@ -79,10 +79,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
         if custom_properties is not None:
             variable_values["customProperties"] = custom_properties

-        result = self.client.execute(
-            gql(Operation.REPORT_OPERATION_MUTATION), variable_values=variable_values
+        request = GraphQLRequest(
+            Operation.REPORT_OPERATION_MUTATION, variable_values=variable_values
         )

+        result = self.client.execute(request)
+
         return result["reportOperation"]

     def query_operations(
@@ -109,12 +111,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
         :param partition: The partition to check the operation.
         """

-        result = self.client.execute(
-            gql(Operation.QUERY_OPERATIONS),
+        request = GraphQLRequest(
+            Operation.QUERY_OPERATIONS,
             variable_values={
                 "urn": urn,
                 "startTimeMillis": start_time_millis,
-                "end_time_millis": end_time_millis,
+                "endTimeMillis": end_time_millis,
                 "limit": limit,
                 "filter": self.gen_filter(
                     {
@@ -125,6 +127,8 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
                 ),
             },
         )
+
+        result = self.client.execute(request)
         if "dataset" in result and "operations" in result["dataset"]:
             operations = []
             if source_type is not None:
```
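
The operation helpers now build a `GraphQLRequest` and pass it to `client.execute()` instead of wrapping the query string with `gql(...)`, matching the newer gql client API; the diff also fixes the `endTimeMillis` variable name. A hedged sketch of the calling convention in isolation; the endpoint URL, token, and query below are placeholders, not values from this diff:

```python
# Sketch of the GraphQLRequest-based calling convention used above (gql >= 4 assumed).
from gql import Client, GraphQLRequest
from gql.transport.requests import RequestsHTTPTransport

transport = RequestsHTTPTransport(
    url="http://localhost:8080/api/graphql",      # assumed DataHub GraphQL endpoint
    headers={"Authorization": "Bearer <token>"},  # assumed access token
)
client = Client(transport=transport, fetch_schema_from_transport=False)

query = """
query dataset($urn: String!) {
  dataset(urn: $urn) { urn }
}
"""

# Old style: client.execute(gql(query), variable_values={...})
# New style: bundle the query and its variables into a GraphQLRequest.
request = GraphQLRequest(query, variable_values={"urn": "urn:li:dataset:(...)"})
result = client.execute(request)
print(result)
```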
datahub/cli/docker_check.py

```diff
@@ -13,8 +13,8 @@ import yaml
 from datahub.configuration.common import ExceptionWithProps

 # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
-MIN_MEMORY_NEEDED = 3.8  # GB
-MIN_DISK_SPACE_NEEDED = 12  # GB
+MIN_MEMORY_NEEDED = 4  # GB
+MIN_DISK_SPACE_NEEDED = 13  # GB

 DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
 DATAHUB_COMPOSE_PROJECT_FILTER = {
```
datahub/configuration/common.py

```diff
@@ -1,20 +1,25 @@
+import dataclasses
 import re
 import unittest.mock
 from abc import ABC, abstractmethod
 from enum import auto
 from typing import (
     IO,
+    TYPE_CHECKING,
+    Annotated,
     Any,
     ClassVar,
     Dict,
     List,
     Optional,
     Type,
+    TypeVar,
     Union,
     runtime_checkable,
 )

 import pydantic
+import pydantic_core
 from cached_property import cached_property
 from pydantic import BaseModel, Extra, ValidationError
 from pydantic.fields import Field
@@ -83,6 +88,29 @@ def redact_raw_config(obj: Any) -> Any:
     return obj


+if TYPE_CHECKING:
+    AnyType = TypeVar("AnyType")
+    HiddenFromDocs = Annotated[AnyType, ...]
+else:
+    HiddenFromDocs = pydantic.json_schema.SkipJsonSchema
+
+LaxStr = Annotated[str, pydantic.BeforeValidator(lambda v: str(v))]
+
+
+@dataclasses.dataclass(frozen=True)
+class SupportedSources:
+    sources: List[str]
+
+    def __get_pydantic_json_schema__(
+        self,
+        core_schema: pydantic_core.core_schema.CoreSchema,
+        handler: pydantic.GetJsonSchemaHandler,
+    ) -> pydantic.json_schema.JsonSchemaValue:
+        json_schema = handler(core_schema)
+        json_schema.setdefault("schema_extra", {})["supported_sources"] = self.sources
+        return json_schema
+
+
 class ConfigModel(BaseModel):
     class Config:
         @staticmethod
@@ -334,4 +362,4 @@ class KeyValuePattern(ConfigModel):


 class VersionedConfig(ConfigModel):
-    version: str = "1"
+    version: LaxStr = "1"
```
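
`LaxStr` is an `Annotated` str carrying a `BeforeValidator` that coerces its input with `str()`, so YAML values like `version: 1` or numeric custom properties no longer fail on str-typed fields (see `VersionedConfig.version` here and `Dataset.properties` above). A minimal sketch of the behavior, assuming pydantic v2 and stand-in model names:

```python
# Minimal sketch of what the LaxStr annotation does (pydantic v2 assumed).
from typing import Annotated, Dict, Optional

from pydantic import BaseModel, BeforeValidator

LaxStr = Annotated[str, BeforeValidator(lambda v: str(v))]


class VersionedThing(BaseModel):  # stand-in for VersionedConfig / Dataset
    version: LaxStr = "1"
    properties: Optional[Dict[str, LaxStr]] = None


v = VersionedThing(version=2, properties={"retention_days": 30})
print(v.version, v.properties)  # -> '2' {'retention_days': '30'}
# With a plain `str` annotation, pydantic v2 rejects ints instead of coercing them.
```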
datahub/configuration/connection_resolver.py

```diff
@@ -1,13 +1,16 @@
-from typing import Type
+from typing import TYPE_CHECKING, Type

 import pydantic

 from datahub.ingestion.api.global_context import get_graph_context

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+

 def auto_connection_resolver(
     connection_field: str = "connection",
-) -> classmethod:
+) -> "V1RootValidator":
    def _resolve_connection(cls: Type, values: dict) -> dict:
        if connection_field in values:
            connection_urn = values.pop(connection_field)
```
datahub/configuration/import_resolver.py

```diff
@@ -1,15 +1,18 @@
-from typing import TypeVar, Union
+from typing import TYPE_CHECKING, Type, TypeVar, Union

 import pydantic

 from datahub.ingestion.api.registry import import_path

-T = TypeVar("T")
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1Validator

+_T = TypeVar("_T")

-def _pydantic_resolver(v: Union[T, str]) -> T:
+
+def _pydantic_resolver(cls: Type, v: Union[str, _T]) -> _T:
     return import_path(v) if isinstance(v, str) else v


-def pydantic_resolve_key(field: str) -> classmethod:
+def pydantic_resolve_key(field: str) -> "V1Validator":
     return pydantic.validator(field, pre=True, allow_reuse=True)(_pydantic_resolver)
```
datahub/configuration/pydantic_migration_helpers.py

```diff
@@ -9,14 +9,6 @@ PYDANTIC_VERSION_2 = _pydantic_version >= Version("2.0")
 # https://docs.pydantic.dev/latest/changelog/#v250-2023-11-13
 PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR = _pydantic_version >= Version("2.5.0")

-# This can be used to silence deprecation warnings while we migrate.
-if PYDANTIC_VERSION_2:
-    from pydantic import PydanticDeprecatedSince20  # type: ignore
-else:
-
-    class PydanticDeprecatedSince20(Warning):  # type: ignore
-        pass
-

 if PYDANTIC_VERSION_2:
     from pydantic import BaseModel as GenericModel
@@ -52,7 +44,6 @@ class v1_ConfigModel(v1_BaseModel):
 __all__ = [
     "PYDANTIC_VERSION_2",
     "PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR",
-    "PydanticDeprecatedSince20",
     "GenericModel",
     "v1_ConfigModel",
     "v1_Field",
```
datahub/configuration/source_common.py

```diff
@@ -1,6 +1,6 @@
 from typing import Dict, Optional

-from pydantic import validator
+import pydantic
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
@@ -30,7 +30,8 @@ class EnvConfigMixin(ConfigModel):
         description="The environment that all assets produced by this connector belong to",
     )

-    @validator("env")
+    @pydantic.field_validator("env", mode="after")
+    @classmethod
     def env_must_be_one_of(cls, v: str) -> str:
         if v.upper() not in ALL_ENV_TYPES:
             raise ValueError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
```
datahub/configuration/validate_field_deprecation.py

```diff
@@ -1,11 +1,14 @@
 import warnings
-from typing import Any, Optional, Type
+from typing import TYPE_CHECKING, Any, Optional, Type

 import pydantic

 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 _unset = object()


@@ -13,7 +16,7 @@ def pydantic_field_deprecated(
     field: str,
     warn_if_value_is_not: Any = _unset,
     message: Optional[str] = None,
-) -> classmethod:
+) -> "V1RootValidator":
     if message:
         output = message
     else:
```
datahub/configuration/validate_field_removal.py

```diff
@@ -1,15 +1,18 @@
 import warnings
-from typing import Type
+from typing import TYPE_CHECKING, Type

 import pydantic

 from datahub.configuration.common import ConfigurationWarning

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+

 def pydantic_removed_field(
     field: str,
     print_warning: bool = True,
-) -> classmethod:
+) -> "V1RootValidator":
     def _validate_field_removal(cls: Type, values: dict) -> dict:
         if field in values:
             if print_warning:
```
datahub/configuration/validate_field_rename.py

```diff
@@ -1,11 +1,14 @@
 import warnings
-from typing import Callable, Type, TypeVar
+from typing import TYPE_CHECKING, Callable, Type, TypeVar

 import pydantic

 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 _T = TypeVar("_T")


@@ -18,7 +21,7 @@ def pydantic_renamed_field(
     new_name: str,
     transform: Callable = _default_rename_transform,
     print_warning: bool = True,
-) -> classmethod:
+) -> "V1RootValidator":
     def _validate_field_rename(cls: Type, values: dict) -> dict:
         if old_name in values:
             if new_name in values:
@@ -49,6 +52,4 @@ def pydantic_renamed_field(
     # validator with pre=True gets all the values that were passed in.
     # Given that a renamed field doesn't show up in the fields list, we can't use
     # the field-level validator, even with a different field name.
-    return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)(
-        _validate_field_rename
-    )
+    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
```
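
These helpers (and the deprecation, removal, and multiline helpers around them) return a reusable pre root validator, which is why the inaccurate `classmethod` return annotation is replaced by pydantic's `V1RootValidator` or `V1Validator`, imported only under `TYPE_CHECKING`. A hedged sketch of the rename pattern they implement, on a toy model with illustrative names (the real helper is `pydantic_renamed_field` and also emits warnings):

```python
# Hedged sketch of the "renamed field" pre-root-validator pattern.
from typing import Dict

import pydantic


def renamed_field(old_name: str, new_name: str):
    def _rename(cls, values: Dict) -> Dict:
        if old_name in values:
            if new_name in values:
                raise ValueError(f"cannot specify both {old_name} and {new_name}")
            values[new_name] = values.pop(old_name)
        return values

    # pre=True so the validator sees the raw input, including keys that are
    # not declared fields (a renamed key never shows up in the fields list).
    return pydantic.root_validator(pre=True, allow_reuse=True)(_rename)


class MyConfig(pydantic.BaseModel):
    table_pattern: str = ".*"

    _rename_deprecated = renamed_field("table_regex", "table_pattern")


print(MyConfig(table_regex="foo.*").table_pattern)  # -> "foo.*"
```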
datahub/configuration/validate_multiline_string.py

```diff
@@ -1,9 +1,12 @@
-from typing import Optional, Type, Union
+from typing import TYPE_CHECKING, Optional, Type, Union

 import pydantic

+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1Validator

-def pydantic_multiline_string(field: str) -> classmethod:
+
+def pydantic_multiline_string(field: str) -> "V1Validator":
     """If the field is present and contains an escaped newline, replace it with a real newline.

     This makes the assumption that the field value is never supposed to have a
```
datahub/ingestion/autogenerated/capability_summary.json

```diff
@@ -2968,6 +2968,38 @@
       "platform_name": "Slack",
       "support_status": "TESTING"
     },
+    "snaplogic": {
+      "capabilities": [
+        {
+          "capability": "LINEAGE_FINE",
+          "description": "Enabled by default",
+          "subtype_modifier": null,
+          "supported": true
+        },
+        {
+          "capability": "DELETION_DETECTION",
+          "description": "Not supported yet",
+          "subtype_modifier": null,
+          "supported": false
+        },
+        {
+          "capability": "PLATFORM_INSTANCE",
+          "description": "Snaplogic does not support platform instances",
+          "subtype_modifier": null,
+          "supported": false
+        },
+        {
+          "capability": "LINEAGE_COARSE",
+          "description": "Enabled by default",
+          "subtype_modifier": null,
+          "supported": true
+        }
+      ],
+      "classname": "datahub.ingestion.source.snaplogic.snaplogic.SnaplogicSource",
+      "platform_id": "snaplogic",
+      "platform_name": "Snaplogic",
+      "support_status": "TESTING"
+    },
     "snowflake": {
       "capabilities": [
         {
@@ -3617,4 +3649,4 @@
       "support_status": "CERTIFIED"
     }
   }
-}
+}
```
datahub/ingestion/run/pipeline_config.py

```diff
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional

 from pydantic import Field, validator

-from datahub.configuration.common import ConfigModel, DynamicTypedConfig
+from datahub.configuration.common import ConfigModel, DynamicTypedConfig, HiddenFromDocs
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.ingestion.sink.file import FileSinkConfig

@@ -85,7 +85,7 @@ class PipelineConfig(ConfigModel):
     source: SourceConfig
     sink: Optional[DynamicTypedConfig] = None
     transformers: Optional[List[DynamicTypedConfig]] = None
-    flags: FlagsConfig = Field(default=FlagsConfig(), hidden_from_docs=True)
+    flags: HiddenFromDocs[FlagsConfig] = FlagsConfig()
     reporting: List[ReporterConfig] = []
     run_id: str = DEFAULT_RUN_ID
     datahub_api: Optional[DatahubClientConfig] = None
```
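
Across this release, the custom `hidden_from_docs=True` / `hidden_from_schema=True` Field extras are replaced by the `HiddenFromDocs[...]` annotation, which the `configuration/common.py` hunk above aliases to pydantic v2's `SkipJsonSchema` at runtime: the field still exists and validates, it is simply dropped from the generated JSON schema that the config docs are built from. A hedged sketch on a toy model (names are illustrative):

```python
# Toy model showing the effect of the HiddenFromDocs / SkipJsonSchema annotation.
from pydantic import BaseModel
from pydantic.json_schema import SkipJsonSchema

HiddenFromDocs = SkipJsonSchema  # runtime alias, mirroring configuration/common.py


class PipelineLikeConfig(BaseModel):
    run_id: str = "default"
    flags: HiddenFromDocs[int] = 0  # still a real, validated field


cfg = PipelineLikeConfig(flags=5)
print(cfg.flags)  # -> 5 (the field works normally)
print(PipelineLikeConfig.model_json_schema()["properties"].keys())
# -> dict_keys(['run_id'])  ("flags" is omitted from the published schema)
```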
datahub/ingestion/source/azure/azure_common.py

```diff
@@ -81,7 +81,7 @@ class AzureConnectionConfig(ConfigModel):
         )
         return self.sas_token if self.sas_token is not None else self.account_key

-    @root_validator()
+    @root_validator(skip_on_failure=True)
     def _check_credential_values(cls, values: Dict) -> Dict:
         if (
             values.get("account_key")
```
datahub/ingestion/source/bigquery_v2/bigquery_config.py

```diff
@@ -1,12 +1,13 @@
 import logging
 import os
 import re
+from copy import deepcopy
 from datetime import timedelta
 from typing import Dict, List, Optional, Union

 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -73,8 +74,10 @@ class BigQueryBaseConfig(ConfigModel):
             ) from e
         return v

-    @root_validator(pre=True, skip_on_failure=True)
+    @root_validator(pre=True)
     def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         project_id = values.pop("project_id", None)
         project_ids = values.get("project_ids")

@@ -182,13 +185,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
     )

     # NOTE: `schema_pattern` is added here only to hide it from docs.
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
     )

     @root_validator(pre=False, skip_on_failure=True)
     def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")
         if (
@@ -320,8 +324,7 @@ class BigQueryV2Config(
         description="Include full payload into events. It is only for debugging and internal use.",
     )

-    number_of_datasets_process_in_batch: int = Field(
-        hidden_from_docs=True,
+    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
         default=10000,
         description="Number of table queried in batch when getting metadata. This is a low level config property "
         "which should be touched with care.",
@@ -436,17 +439,15 @@ class BigQueryV2Config(

     upstream_lineage_in_report: bool = Field(
         default=False,
-        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
+        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
     )

-    run_optimized_column_query: bool = Field(
-        hidden_from_docs=True,
+    run_optimized_column_query: HiddenFromDocs[bool] = Field(
         default=False,
         description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
     )

-    file_backed_cache_size: int = Field(
-        hidden_from_docs=True,
+    file_backed_cache_size: HiddenFromDocs[int] = Field(
         default=2000,
         description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
     )
@@ -456,10 +457,9 @@ class BigQueryV2Config(
         description="Option to exclude empty projects from being ingested.",
     )

-    schema_resolution_batch_size: int = Field(
+    schema_resolution_batch_size: HiddenFromDocs[int] = Field(
         default=100,
         description="The number of tables to process in a batch when resolving schema from DataHub.",
-        hidden_from_schema=True,
     )

     max_threads_dataset_parallelism: int = Field(
@@ -480,6 +480,8 @@ class BigQueryV2Config(

     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Historically this is used to disable schema ingestion
         if (
             "include_tables" in values
@@ -498,6 +500,8 @@ class BigQueryV2Config(

     @root_validator(skip_on_failure=True)
     def profile_default_settings(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Extra default SQLAlchemy option for better connection pooling and threading.
         # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
         values["options"].setdefault("max_overflow", -1)
@@ -515,9 +519,19 @@ class BigQueryV2Config(

         return v

+    @validator("upstream_lineage_in_report")
+    def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
+        if v and values.get("use_queries_v2", True):
+            logging.warning(
+                "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
+                "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
+            )
+
+        return v
+
     def get_table_pattern(self, pattern: List[str]) -> str:
         return "|".join(pattern) if pattern else ""

-    platform_instance_not_supported_for_bigquery = pydantic_removed_field(
+    _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
         "platform_instance"
     )
```
8
8
  from google.cloud.bigquery import Client
9
9
  from pydantic import Field, PositiveInt
10
10
 
11
- from datahub.configuration.common import AllowDenyPattern
11
+ from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
12
12
  from datahub.configuration.time_window_config import (
13
13
  BaseTimeWindowConfig,
14
14
  get_time_bucket,
@@ -86,12 +86,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
86
86
  # TODO: Support stateful ingestion for the time windows.
87
87
  window: BaseTimeWindowConfig = BaseTimeWindowConfig()
88
88
 
89
- local_temp_path: Optional[pathlib.Path] = Field(
90
- default=None,
91
- description="Local path to store the audit log.",
89
+ local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
92
90
  # TODO: For now, this is simply an advanced config to make local testing easier.
93
91
  # Eventually, we will want to store date-specific files in the directory and use it as a cache.
94
- hidden_from_docs=True,
92
+ default=None,
93
+ description="Local path to store the audit log.",
95
94
  )
96
95
 
97
96
  user_email_pattern: AllowDenyPattern = Field(
datahub/ingestion/source/common/gcp_credentials_config.py

```diff
@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s


 class GCPCredential(ConfigModel):
-    project_id: Optional[str] = Field(description="Project id to set the credentials")
+    project_id: Optional[str] = Field(
+        None, description="Project id to set the credentials"
+    )
     private_key_id: str = Field(description="Private key id")
     private_key: str = Field(
         description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
```