acryl-datahub 1.2.0.10rc2__py3-none-any.whl → 1.2.0.10rc4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Note: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/METADATA +2525 -2609
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/RECORD +93 -93
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +55 -75
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_source.py +445 -548
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +29 -4
- datahub/ingestion/source/tableau/tableau.py +65 -11
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/metadata/_internal_schema_classes.py +45 -1
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/schema.avsc +24 -1
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/dashboard.py +0 -2
- datahub/sdk/search_filters.py +1 -7
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -89,63 +89,62 @@ class PathSpec(ConfigModel):
         description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
     )
     exclude: Optional[List[str]] = Field(
-        default=[],
+        [],
         description="list of paths in glob pattern which will be excluded while scanning for the datasets",
     )
     file_types: List[str] = Field(
-        default=SUPPORTED_FILE_TYPES,
+        SUPPORTED_FILE_TYPES,
         description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
     )

     default_extension: Optional[str] = Field(
-        default=None,
+        None,
         description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
     )

     table_name: Optional[str] = Field(
-        default=None,
+        None,
         description="Display name of the dataset.Combination of named variables from include path and strings",
     )

     # This is not used yet, but will be used in the future to sort the partitions
-    sort_key: Optional[SortKey] = Field(
-        hidden_from_docs=True,
-        default=None,
+    sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
+        None,
         description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
     )

     enable_compression: bool = Field(
-        default=True,
+        True,
         description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
     )

     sample_files: bool = Field(
-        default=True,
+        True,
         description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
     )

     allow_double_stars: bool = Field(
-        default=False,
+        False,
         description="Allow double stars in the include path. This can affect performance significantly if enabled",
     )

     autodetect_partitions: bool = Field(
-        default=True,
+        True,
         description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
     )

     traversal_method: FolderTraversalMethod = Field(
-        default=FolderTraversalMethod.MAX,
+        FolderTraversalMethod.MAX,
         description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
     )

     include_hidden_folders: bool = Field(
-        default=False,
+        False,
         description="Include hidden folders in the traversal (folders starting with . or _",
     )

     tables_filter_pattern: AllowDenyPattern = Field(
-        default=AllowDenyPattern.allow_all(),
+        AllowDenyPattern.allow_all(),
         description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
     )

@@ -479,7 +478,8 @@ class PathSpec(ConfigModel):
         return glob_include

     @pydantic.root_validator(skip_on_failure=True)
-    def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
+    @staticmethod
+    def validate_path_spec(values: Dict) -> Dict[str, Any]:
         # validate that main fields are populated
         required_fields = ["include", "file_types", "default_extension"]
         for f in required_fields:

datahub/ingestion/source/datahub/config.py

@@ -4,7 +4,7 @@ from typing import Optional, Set
 import pydantic
 from pydantic import Field, root_validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         ),
     )

-    pull_from_datahub_api: bool = Field(
+    pull_from_datahub_api: HiddenFromDocs[bool] = Field(
         default=False,
         description="Use the DataHub API to fetch versioned aspects.",
-        hidden_from_docs=True,
     )

-    max_workers: int = Field(
+    max_workers: HiddenFromDocs[int] = Field(
         default=5 * (os.cpu_count() or 4),
         description="Number of worker threads to use for datahub api ingestion.",
-        hidden_from_docs=True,
     )

     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
@@ -118,10 +116,11 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )

-    structured_properties_template_cache_invalidation_interval: int = Field(
-        default=60,
-        description="Interval in seconds to invalidate the structured properties template cache.",
-        hidden_from_docs=True,
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )

     query_timeout: Optional[int] = Field(

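A recurring change in this release (path_spec.py, datahub/config.py, dremio_config.py, grafana_config.py) replaces the `hidden_from_docs=True` keyword on `Field(...)` with a `HiddenFromDocs[...]` type annotation imported from `datahub.configuration.common`. That class is not defined anywhere in this diff, so the sketch below uses a hypothetical `Annotated`-based stand-in purely to illustrate how such an alias can carry the "hide from docs" flag in the annotation and be read back, e.g. by a docs generator.

```python
# Hedged sketch only: `_Hidden` and `HiddenFromDocs` are stand-ins, not the real
# datahub.configuration.common implementation. Assumes pydantic >= 2 and Python >= 3.9.
from typing import Annotated, Optional, TypeVar, get_type_hints

from pydantic import BaseModel, Field

T = TypeVar("T")


class _Hidden:
    """Marker object carried in Annotated metadata."""


# Subscriptable alias: HiddenFromDocs[bool] == Annotated[bool, _Hidden()]
HiddenFromDocs = Annotated[T, _Hidden()]


class ExampleConfig(BaseModel):
    # Before (old style): max_workers: int = Field(default=5, hidden_from_docs=True)
    # After (new style): the "hidden" flag lives in the annotation, not the Field kwargs.
    max_workers: HiddenFromDocs[int] = Field(default=5, description="Worker threads.")
    query_timeout: Optional[int] = Field(default=None, description="Shown in docs.")


def hidden_fields(model: type[BaseModel]) -> list[str]:
    # A docs generator could skip any field whose Annotated metadata contains _Hidden.
    hints = get_type_hints(model, include_extras=True)
    return [
        name
        for name, hint in hints.items()
        if any(isinstance(m, _Hidden) for m in getattr(hint, "__metadata__", ()))
    ]


print(hidden_fields(ExampleConfig))  # ['max_workers']
```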
datahub/ingestion/source/dremio/dremio_config.py

@@ -4,7 +4,7 @@ from typing import List, Literal, Optional
 import certifi
 from pydantic import Field, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -100,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
     query_timeout: int = Field(
         default=300, description="Time before cancelling Dremio profiling query"
     )
-    include_field_median_value: bool = Field(
+    include_field_median_value: HiddenFromDocs[bool] = Field(
+        # Hidden because median causes a number of issues in Dremio.
         default=False,
-        hidden_from_docs=True,
-        description="Median causes a number of issues in Dremio.",
     )

datahub/ingestion/source/feast.py

@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union

 import feast.types
 from feast import (
@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
     StatefulIngestionConfigBase,
 ):
     path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[str] = Field(
+    fs_yaml_file: Optional[pathlib.Path] = Field(
         default=None,
         description="Path to the `feature_store.yaml` file used to configure the feature store",
     )
@@ -142,17 +143,14 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     - Column types associated with each entity and feature
     """

-    platform = "feast"
-    source_config: FeastRepositorySourceConfig
-    report: StaleEntityRemovalSourceReport
-    feature_store: FeatureStore
+    platform: ClassVar[str] = "feast"

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
-        self.source_config = config
-        self.ctx = ctx
-        self.report = StaleEntityRemovalSourceReport()
-        self.feature_store = FeatureStore(
+        self.source_config: FeastRepositorySourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
+        self.feature_store: FeatureStore = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
         )

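The feast change drops the bare class-level attribute declarations in favour of a `ClassVar[str]` for the connector's constant `platform` name plus explicitly typed instance attributes assigned in `__init__`. A minimal sketch of the distinction, with illustrative names rather than the real DataHub classes:

```python
from dataclasses import dataclass
from typing import ClassVar


@dataclass
class Report:
    events: int = 0


class ExampleSource:
    # ClassVar: a shared constant on the class, not a per-instance field.
    platform: ClassVar[str] = "feast"

    def __init__(self, path: str) -> None:
        # Typed instance attributes, one set per object.
        self.path: str = path
        self.report: Report = Report()


a, b = ExampleSource("repo_a"), ExampleSource("repo_b")
assert a.platform == b.platform == "feast"  # shared class-level constant
assert a.report is not b.report             # per-instance state
```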
datahub/ingestion/source/fivetran/config.py

@@ -102,7 +102,7 @@ class FivetranLogConfig(ConfigModel):
         "destination_config", "snowflake_destination_config"
     )

-    @root_validator()
+    @root_validator(skip_on_failure=True)
     def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
         destination_platform = values["destination_platform"]
         if destination_platform == "snowflake":

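`skip_on_failure=True` tells a v1-style root validator not to run when an earlier field validation has already failed, so the validator body can safely index into `values`; it is also the form pydantic 2's compatibility shim requires for non-`pre` root validators. A small sketch of the semantics, assuming the pydantic v1-style API (available as `pydantic.v1` under pydantic 2.x); the class is illustrative, not the real FivetranLogConfig:

```python
from pydantic.v1 import BaseModel, ValidationError, root_validator


class DestinationConfig(BaseModel):
    destination_platform: str = "snowflake"
    snowflake_destination_config: dict = {}

    @root_validator(skip_on_failure=True)
    def check_destination(cls, values: dict) -> dict:
        # Safe to index: this validator is skipped if field validation already failed,
        # so `values` contains every successfully validated field.
        if values["destination_platform"] == "snowflake":
            assert values["snowflake_destination_config"] is not None
        return values


try:
    DestinationConfig(destination_platform=123.4j)  # complex -> str coercion fails
except ValidationError as e:
    # The root validator did not run; only the field error is reported.
    print(e.errors()[0]["loc"])  # ('destination_platform',)
```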
datahub/ingestion/source/ge_profiling_config.py

@@ -1,12 +1,12 @@
 import datetime
 import logging
 import os
-from typing import Any, Dict, List, Optional
+from typing import Annotated, Any, Dict, List, Optional

 import pydantic
 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, SupportedSources
 from datahub.ingestion.source_config.operation_config import OperationConfig

 _PROFILING_FLAGS_TO_REPORT = {
@@ -120,37 +120,40 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         "number of columns to profile goes up.",
     )

-    profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
+    profile_if_updated_since_days: Annotated[
+        Optional[pydantic.PositiveFloat], SupportedSources(["snowflake", "bigquery"])
+    ] = Field(
         default=None,
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

-    profile_table_size_limit: Optional[int] = Field(
+    profile_table_size_limit: Annotated[
+        Optional[int],
+        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
+    ] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
-        schema_extra={
-            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
-        },
     )

-    profile_table_row_limit: Optional[int] = Field(
+    profile_table_row_limit: Annotated[
+        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
+    ] = Field(
         default=5000000,
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
    )

-    profile_table_row_count_estimate_only: bool = Field(
+    profile_table_row_count_estimate_only: Annotated[
+        bool, SupportedSources(["postgres", "mysql"])
+    ] = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
-        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -163,36 +166,37 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = Field(default=True, description="")

-    partition_profiling_enabled: bool = Field(
+    partition_profiling_enabled: Annotated[
+        bool, SupportedSources(["athena", "bigquery"])
+    ] = Field(
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
-        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
-    partition_datetime: Optional[datetime.datetime] = Field(
+    partition_datetime: Annotated[
+        Optional[datetime.datetime], SupportedSources(["bigquery"])
+    ] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
-        schema_extra={"supported_sources": ["bigquery"]},
     )
-    use_sampling: bool = Field(
+    use_sampling: Annotated[bool, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

-    sample_size: int = Field(
+    sample_size: Annotated[int, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

-    profile_external_tables: bool = Field(
+    profile_external_tables: Annotated[
+        bool, SupportedSources(["redshift", "snowflake"])
+    ] = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
-        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(

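The GE profiling config moves the per-field `schema_extra={"supported_sources": [...]}` hints into `Annotated[..., SupportedSources([...])]` metadata. `SupportedSources` itself is not defined in this diff; the sketch below uses a hypothetical stand-in to show how such metadata can be attached to a field's type and read back, for example by a docs generator.

```python
# Hedged sketch: `SupportedSources` here is a stand-in, not the real class imported
# from datahub.configuration.common above. Assumes pydantic >= 2 and Python >= 3.9.
from typing import Annotated, Optional, get_type_hints

from pydantic import BaseModel, Field


class SupportedSources:
    """Stand-in metadata marker; the real datahub implementation may differ."""

    def __init__(self, sources: list[str]) -> None:
        self.sources = tuple(sources)


class ProfilingConfig(BaseModel):
    profile_table_size_limit: Annotated[
        Optional[int],
        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
    ] = Field(default=5, description="Profile tables only if smaller than this many GB.")


def supported_sources(model: type[BaseModel], field: str) -> tuple[str, ...]:
    # Read the marker back out of the field's Annotated metadata.
    hint = get_type_hints(model, include_extras=True)[field]
    for meta in getattr(hint, "__metadata__", ()):
        if isinstance(meta, SupportedSources):
            return meta.sources
    return ()


print(supported_sources(ProfilingConfig, "profile_table_size_limit"))
# ('snowflake', 'bigquery', 'unity-catalog', 'oracle')
```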
datahub/ingestion/source/grafana/grafana_config.py

@@ -2,7 +2,7 @@ from typing import Dict, Optional

 from pydantic import Field, SecretStr, validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     EnvConfigMixin,
@@ -37,7 +37,7 @@ class GrafanaSourceConfig(
 ):
     """Configuration for Grafana source"""

-    platform: str = Field(default="grafana", hidden_from_docs=True)
+    platform: HiddenFromDocs[str] = Field(default="grafana")
     url: str = Field(
         description="Grafana URL in the format http://your-grafana-instance with no trailing slash"
     )

datahub/ingestion/source/grafana/models.py

@@ -10,9 +10,8 @@ References:

 from typing import Any, Dict, List, Optional

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

-from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey

 # Grafana-specific type definitions for better type safety
@@ -25,7 +24,11 @@ GrafanaFieldConfig = Dict[
 GrafanaTransformation = Dict[str, Any]  # Transformations: id, options


-class DatasourceRef(BaseModel):
+class _GrafanaBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class DatasourceRef(_GrafanaBaseModel):
     """Reference to a Grafana datasource."""

     type: Optional[str] = None  # Datasource type (prometheus, mysql, postgres, etc.)
@@ -33,13 +36,13 @@ class DatasourceRef(BaseModel):
     name: Optional[str] = None  # Datasource display name


-class Panel(BaseModel):
+class Panel(_GrafanaBaseModel):
     """Represents a Grafana dashboard panel."""

     id: str
     title: str
     description: str = ""
-    type: Optional[str]
+    type: Optional[str] = None
     # Query targets - each contains refId (A,B,C...), query/expr, datasource ref, etc.
     query_targets: List[GrafanaQueryTarget] = Field(
         default_factory=list, alias="targets"
@@ -52,16 +55,16 @@ class Panel(BaseModel):
     transformations: List[GrafanaTransformation] = Field(default_factory=list)


-class Dashboard(BaseModel):
+class Dashboard(_GrafanaBaseModel):
     """Represents a Grafana dashboard."""

     uid: str
     title: str
     description: str = ""
-    version: Optional[str]
+    version: Optional[str] = None
     panels: List[Panel]
     tags: List[str]
-    timezone: Optional[str]
+    timezone: Optional[str] = None
     refresh: Optional[str] = None
     schema_version: Optional[str] = Field(default=None, alias="schemaVersion")
     folder_id: Optional[str] = Field(default=None, alias="meta.folderId")
@@ -100,18 +103,13 @@ class Dashboard(BaseModel):
         return super().parse_obj(dashboard_dict)


-class Folder(BaseModel):
+class Folder(_GrafanaBaseModel):
     """Represents a Grafana folder."""

     id: str
     title: str
     description: Optional[str] = ""

-    if PYDANTIC_VERSION_2:
-        from pydantic import ConfigDict
-
-        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
-

 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""

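Instead of each model conditionally setting `model_config` behind a `PYDANTIC_VERSION_2` check, the Grafana models now inherit from a single `_GrafanaBaseModel` that sets `ConfigDict(coerce_numbers_to_str=True)` once. A condensed, pydantic-2-only sketch of the same idea:

```python
# Condensed sketch of the shared-base-model pattern (requires pydantic >= 2).
from typing import Optional

from pydantic import BaseModel, ConfigDict


class _GrafanaBaseModel(BaseModel):
    # Applied to every subclass: numeric JSON values are accepted for str fields.
    model_config = ConfigDict(coerce_numbers_to_str=True)


class Folder(_GrafanaBaseModel):
    id: str
    title: str
    description: Optional[str] = ""


# Grafana APIs sometimes return numeric ids; coercion turns 42 into "42".
print(Folder(id=42, title="Dashboards"))  # id='42' title='Dashboards' description=''
```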
datahub/ingestion/source/hex/hex.py

@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
@@ -122,7 +123,11 @@ class HexSourceConfig(

     @root_validator(pre=True)
     def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
-        #
+        # In-place update of the input dict would cause state contamination. This was discovered through test failures
+        # in test_hex.py where the same dict is reused.
+        # So a deepcopy is performed first.
+        data = deepcopy(data)
+
         if "lineage_end_time" not in data or data["lineage_end_time"] is None:
             data["lineage_end_time"] = datetime.now(tz=timezone.utc)
         # if string is given, parse it

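The hex change copies the incoming dict before a `pre=True` root validator fills in defaults, so the caller's dict (which may be reused across runs or tests) is never mutated. A minimal sketch of the hazard and the fix, shown here with pydantic 2's `model_validator(mode="before")` rather than the v1-style `root_validator` used in the source, and with illustrative class and field names:

```python
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Dict, Optional

from pydantic import BaseModel, model_validator


class LineageWindow(BaseModel):
    lineage_end_time: Optional[datetime] = None

    @model_validator(mode="before")
    @classmethod
    def fill_defaults(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        data = deepcopy(data)  # without this, the caller's dict would be mutated
        if data.get("lineage_end_time") is None:
            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
        return data


raw = {"lineage_end_time": None}
LineageWindow.model_validate(raw)
assert raw == {"lineage_end_time": None}  # input left untouched thanks to deepcopy
```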
datahub/ingestion/source/iceberg/iceberg_profiler.py

@@ -12,6 +12,7 @@ from pyiceberg.types import (
     IcebergType,
     IntegerType,
     LongType,
+    PrimitiveType,
     TimestampType,
     TimestamptzType,
     TimeType,
@@ -22,6 +23,7 @@ from pyiceberg.utils.datetime import (
     to_human_timestamp,
     to_human_timestamptz,
 )
+from typing_extensions import TypeGuard

 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.source.iceberg.iceberg_common import (
@@ -65,7 +67,7 @@ class IcebergProfiler:
         aggregated_values: Dict[int, Any],
         manifest_values: Dict[int, bytes],
     ) -> None:
-        for field_id, value_encoded in manifest_values.items():
+        for field_id, value_encoded in manifest_values.items():
             try:
                 field = schema.find_field(field_id)
             except ValueError:
@@ -240,7 +242,7 @@ class IcebergProfiler:
             return None

     @staticmethod
-    def _is_numeric_type(type: IcebergType) -> bool:
+    def _is_numeric_type(type: IcebergType) -> TypeGuard[PrimitiveType]:
         return isinstance(
             type,
             (

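Returning `TypeGuard[PrimitiveType]` instead of plain `bool` lets type checkers narrow the argument's type after the check succeeds. A generic sketch of the pattern, unrelated to the Iceberg classes themselves:

```python
# Generic TypeGuard sketch; the numeric types here are illustrative.
from typing import Union

from typing_extensions import TypeGuard

Number = Union[int, float]


def is_int(value: Number) -> TypeGuard[int]:
    # When this returns True, the type checker treats `value` as `int` in the caller.
    return isinstance(value, int)


def bit_length(value: Number) -> int:
    if is_int(value):
        return value.bit_length()  # checker now knows `value: int`
    return int(value).bit_length()


print(bit_length(255), bit_length(3.5))  # 8 2
```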
datahub/ingestion/source/kafka_connect/common.py

@@ -4,7 +4,7 @@ from typing import Dict, Iterable, List, Optional

 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, LaxStr
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     PlatformInstanceConfigMixin,
@@ -29,7 +29,7 @@ CONNECTOR_CLASS = "connector.class"
 class ProvidedConfig(ConfigModel):
     provider: str
     path_key: str
-    value: str
+    value: LaxStr


 class GenericConnectorConfig(ConfigModel):
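`LaxStr` is imported from `datahub.configuration.common` but not defined in this diff; the name suggests a string type that tolerates non-string inputs, such as numeric connector config values. The sketch below is a hypothetical stand-in built on an `Annotated` + `BeforeValidator` pattern, purely to illustrate what such a field could accept; the real datahub definition may differ.

```python
# Hypothetical LaxStr-like type (not the real datahub definition). Requires pydantic >= 2.
from typing import Annotated

from pydantic import BaseModel, BeforeValidator

LaxStr = Annotated[
    str, BeforeValidator(lambda v: str(v) if isinstance(v, (int, float, bool)) else v)
]


class ProvidedConfig(BaseModel):
    provider: str
    path_key: str
    value: LaxStr


# A numeric value coming from connector JSON is accepted and stored as a string.
cfg = ProvidedConfig(provider="env", path_key="PORT", value=5432)
print(cfg.value, type(cfg.value).__name__)  # 5432 str
```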
|