acryl-datahub 1.3.1__py3-none-any.whl → 1.3.1.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (193)
  1. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/METADATA +2501 -2501
  2. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/RECORD +193 -193
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +2 -2
  5. datahub/api/entities/corpgroup/corpgroup.py +11 -6
  6. datahub/api/entities/corpuser/corpuser.py +11 -11
  7. datahub/api/entities/dataproduct/dataproduct.py +47 -27
  8. datahub/api/entities/dataset/dataset.py +32 -21
  9. datahub/api/entities/external/lake_formation_external_entites.py +5 -6
  10. datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
  11. datahub/api/entities/forms/forms.py +16 -14
  12. datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
  13. datahub/cli/check_cli.py +2 -2
  14. datahub/cli/config_utils.py +3 -3
  15. datahub/cli/lite_cli.py +9 -7
  16. datahub/cli/migrate.py +4 -4
  17. datahub/cli/quickstart_versioning.py +3 -3
  18. datahub/cli/specific/group_cli.py +1 -1
  19. datahub/cli/specific/structuredproperties_cli.py +1 -1
  20. datahub/cli/specific/user_cli.py +1 -1
  21. datahub/configuration/common.py +14 -2
  22. datahub/configuration/connection_resolver.py +2 -2
  23. datahub/configuration/git.py +47 -30
  24. datahub/configuration/import_resolver.py +2 -2
  25. datahub/configuration/kafka.py +4 -3
  26. datahub/configuration/time_window_config.py +26 -26
  27. datahub/configuration/validate_field_deprecation.py +2 -2
  28. datahub/configuration/validate_field_removal.py +2 -2
  29. datahub/configuration/validate_field_rename.py +2 -2
  30. datahub/configuration/validate_multiline_string.py +2 -1
  31. datahub/emitter/kafka_emitter.py +3 -1
  32. datahub/emitter/rest_emitter.py +2 -4
  33. datahub/ingestion/api/decorators.py +1 -1
  34. datahub/ingestion/api/report.py +1 -1
  35. datahub/ingestion/api/sink.py +1 -1
  36. datahub/ingestion/api/source.py +1 -1
  37. datahub/ingestion/glossary/datahub_classifier.py +11 -8
  38. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  39. datahub/ingestion/reporting/file_reporter.py +5 -4
  40. datahub/ingestion/run/pipeline.py +6 -6
  41. datahub/ingestion/run/pipeline_config.py +12 -14
  42. datahub/ingestion/run/sink_callback.py +1 -1
  43. datahub/ingestion/sink/datahub_rest.py +6 -4
  44. datahub/ingestion/source/abs/config.py +19 -19
  45. datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
  46. datahub/ingestion/source/abs/source.py +2 -2
  47. datahub/ingestion/source/aws/aws_common.py +1 -1
  48. datahub/ingestion/source/aws/glue.py +6 -4
  49. datahub/ingestion/source/aws/sagemaker.py +1 -1
  50. datahub/ingestion/source/azure/azure_common.py +8 -12
  51. datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
  53. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
  54. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  55. datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
  56. datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
  57. datahub/ingestion/source/datahub/config.py +8 -8
  58. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  59. datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
  60. datahub/ingestion/source/dbt/dbt_common.py +39 -37
  61. datahub/ingestion/source/dbt/dbt_core.py +10 -12
  62. datahub/ingestion/source/debug/datahub_debug.py +1 -1
  63. datahub/ingestion/source/delta_lake/config.py +6 -4
  64. datahub/ingestion/source/dremio/dremio_config.py +10 -6
  65. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  66. datahub/ingestion/source/elastic_search.py +4 -3
  67. datahub/ingestion/source/excel/source.py +1 -1
  68. datahub/ingestion/source/feast.py +1 -1
  69. datahub/ingestion/source/file.py +5 -4
  70. datahub/ingestion/source/fivetran/config.py +17 -16
  71. datahub/ingestion/source/fivetran/fivetran.py +2 -2
  72. datahub/ingestion/source/gc/datahub_gc.py +1 -1
  73. datahub/ingestion/source/gcs/gcs_source.py +8 -10
  74. datahub/ingestion/source/ge_profiling_config.py +8 -5
  75. datahub/ingestion/source/grafana/grafana_api.py +2 -2
  76. datahub/ingestion/source/grafana/grafana_config.py +4 -3
  77. datahub/ingestion/source/grafana/grafana_source.py +1 -1
  78. datahub/ingestion/source/grafana/models.py +23 -5
  79. datahub/ingestion/source/hex/api.py +7 -5
  80. datahub/ingestion/source/hex/hex.py +4 -3
  81. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  82. datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
  83. datahub/ingestion/source/identity/azure_ad.py +1 -1
  84. datahub/ingestion/source/identity/okta.py +10 -10
  85. datahub/ingestion/source/kafka/kafka.py +1 -1
  86. datahub/ingestion/source/ldap.py +1 -1
  87. datahub/ingestion/source/looker/looker_common.py +7 -5
  88. datahub/ingestion/source/looker/looker_config.py +21 -20
  89. datahub/ingestion/source/looker/lookml_config.py +47 -47
  90. datahub/ingestion/source/metabase.py +8 -8
  91. datahub/ingestion/source/metadata/business_glossary.py +2 -2
  92. datahub/ingestion/source/metadata/lineage.py +13 -8
  93. datahub/ingestion/source/mlflow.py +1 -1
  94. datahub/ingestion/source/mode.py +6 -4
  95. datahub/ingestion/source/mongodb.py +4 -3
  96. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  97. datahub/ingestion/source/nifi.py +17 -23
  98. datahub/ingestion/source/openapi.py +6 -8
  99. datahub/ingestion/source/powerbi/config.py +33 -32
  100. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
  101. datahub/ingestion/source/powerbi/powerbi.py +1 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
  103. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
  104. datahub/ingestion/source/preset.py +8 -8
  105. datahub/ingestion/source/pulsar.py +1 -1
  106. datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
  107. datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
  108. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  109. datahub/ingestion/source/redshift/config.py +18 -20
  110. datahub/ingestion/source/redshift/redshift.py +2 -2
  111. datahub/ingestion/source/redshift/usage.py +23 -3
  112. datahub/ingestion/source/s3/config.py +83 -62
  113. datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
  114. datahub/ingestion/source/s3/source.py +8 -5
  115. datahub/ingestion/source/sac/sac.py +5 -4
  116. datahub/ingestion/source/salesforce.py +3 -2
  117. datahub/ingestion/source/schema/json_schema.py +2 -2
  118. datahub/ingestion/source/sigma/data_classes.py +3 -2
  119. datahub/ingestion/source/sigma/sigma.py +1 -1
  120. datahub/ingestion/source/sigma/sigma_api.py +7 -7
  121. datahub/ingestion/source/slack/slack.py +1 -1
  122. datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
  123. datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
  124. datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
  125. datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
  126. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -1
  128. datahub/ingestion/source/sql/athena.py +1 -1
  129. datahub/ingestion/source/sql/clickhouse.py +4 -2
  130. datahub/ingestion/source/sql/cockroachdb.py +1 -1
  131. datahub/ingestion/source/sql/druid.py +1 -1
  132. datahub/ingestion/source/sql/hana.py +1 -1
  133. datahub/ingestion/source/sql/hive.py +7 -5
  134. datahub/ingestion/source/sql/hive_metastore.py +1 -1
  135. datahub/ingestion/source/sql/mssql/source.py +13 -6
  136. datahub/ingestion/source/sql/mysql.py +1 -1
  137. datahub/ingestion/source/sql/oracle.py +17 -10
  138. datahub/ingestion/source/sql/postgres.py +2 -2
  139. datahub/ingestion/source/sql/presto.py +1 -1
  140. datahub/ingestion/source/sql/sql_config.py +8 -9
  141. datahub/ingestion/source/sql/sql_generic.py +1 -1
  142. datahub/ingestion/source/sql/teradata.py +1 -1
  143. datahub/ingestion/source/sql/trino.py +1 -1
  144. datahub/ingestion/source/sql/vertica.py +5 -4
  145. datahub/ingestion/source/sql_queries.py +11 -8
  146. datahub/ingestion/source/state/checkpoint.py +2 -2
  147. datahub/ingestion/source/state/entity_removal_state.py +2 -1
  148. datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
  149. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  151. datahub/ingestion/source/superset.py +9 -9
  152. datahub/ingestion/source/tableau/tableau.py +14 -16
  153. datahub/ingestion/source/unity/config.py +33 -34
  154. datahub/ingestion/source/unity/proxy.py +203 -0
  155. datahub/ingestion/source/unity/proxy_types.py +91 -0
  156. datahub/ingestion/source/unity/source.py +27 -2
  157. datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
  159. datahub/ingestion/source/usage/usage_common.py +5 -3
  160. datahub/ingestion/source_config/csv_enricher.py +7 -6
  161. datahub/ingestion/source_config/operation_config.py +7 -4
  162. datahub/ingestion/source_config/pulsar.py +11 -15
  163. datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
  164. datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
  165. datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
  166. datahub/ingestion/transformer/add_dataset_properties.py +2 -2
  167. datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
  168. datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
  169. datahub/ingestion/transformer/add_dataset_tags.py +3 -3
  170. datahub/ingestion/transformer/add_dataset_terms.py +3 -3
  171. datahub/ingestion/transformer/dataset_domain.py +3 -3
  172. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
  173. datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
  174. datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
  175. datahub/ingestion/transformer/mark_dataset_status.py +1 -1
  176. datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
  177. datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
  178. datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
  179. datahub/ingestion/transformer/replace_external_url.py +2 -2
  180. datahub/ingestion/transformer/set_browse_path.py +1 -1
  181. datahub/ingestion/transformer/tags_to_terms.py +1 -1
  182. datahub/lite/duckdb_lite.py +1 -1
  183. datahub/lite/lite_util.py +2 -2
  184. datahub/sdk/search_filters.py +68 -40
  185. datahub/secret/datahub_secret_store.py +7 -4
  186. datahub/secret/file_secret_store.py +1 -1
  187. datahub/sql_parsing/sqlglot_lineage.py +5 -2
  188. datahub/testing/check_sql_parser_result.py +2 -2
  189. datahub/utilities/ingest_utils.py +1 -1
  190. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/WHEEL +0 -0
  191. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/entry_points.txt +0 -0
  192. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/licenses/LICENSE +0 -0
  193. {acryl_datahub-1.3.1.dist-info → acryl_datahub-1.3.1.1rc1.dist-info}/top_level.txt +0 -0
datahub/cli/migrate.py CHANGED
@@ -318,13 +318,13 @@ def migrate_containers(
         try:
             newKey: Union[SchemaKey, DatabaseKey, ProjectIdKey, BigQueryDatasetKey]
             if subType == "Schema":
-                newKey = SchemaKey.parse_obj(customProperties)
+                newKey = SchemaKey.model_validate(customProperties)
             elif subType == "Database":
-                newKey = DatabaseKey.parse_obj(customProperties)
+                newKey = DatabaseKey.model_validate(customProperties)
             elif subType == "Project":
-                newKey = ProjectIdKey.parse_obj(customProperties)
+                newKey = ProjectIdKey.model_validate(customProperties)
             elif subType == "Dataset":
-                newKey = BigQueryDatasetKey.parse_obj(customProperties)
+                newKey = BigQueryDatasetKey.model_validate(customProperties)
             else:
                 log.warning(f"Invalid subtype {subType}. Skipping")
                 continue
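
Note: almost every hunk in this diff is the same Pydantic v1-to-v2 API migration. As a minimal sketch (hypothetical model, not taken from the DataHub codebase, assuming pydantic>=2), the renamed entry points look like this:

from pydantic import BaseModel


class ContainerKey(BaseModel):  # hypothetical stand-in for SchemaKey, DatabaseKey, etc.
    platform: str
    instance: str


props = {"platform": "bigquery", "instance": "prod"}

# Pydantic v1 spelling (still importable in v2, but deprecated):
#   key = ContainerKey.parse_obj(props); payload = key.dict()
# Pydantic v2 spelling used throughout this release:
key = ContainerKey.model_validate(props)  # replaces parse_obj()
payload = key.model_dump()                # replaces dict()
assert payload == props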

datahub/cli/quickstart_versioning.py CHANGED
@@ -80,7 +80,7 @@ class QuickstartVersionMappingConfig(BaseModel):
             path = os.path.expanduser(LOCAL_QUICKSTART_MAPPING_FILE)
             with open(path) as f:
                 config_raw = yaml.safe_load(f)
-            return cls.parse_obj(config_raw)
+            return cls.model_validate(config_raw)
 
         config_raw = None
         try:
@@ -110,7 +110,7 @@ class QuickstartVersionMappingConfig(BaseModel):
                 }
             )
 
-        config = cls.parse_obj(config_raw)
+        config = cls.model_validate(config_raw)
 
         # If stable is not defined in the config, we need to fetch the latest version from github.
         if config.quickstart_version_map.get("stable") is None:
@@ -177,7 +177,7 @@ def save_quickstart_config(
     path = os.path.expanduser(path)
     os.makedirs(os.path.dirname(path), exist_ok=True)
     with open(path, "w") as f:
-        yaml.dump(config.dict(), f)
+        yaml.dump(config.model_dump(), f)
     logger.info(f"Saved quickstart config to {path}.")
 
 

datahub/cli/specific/group_cli.py CHANGED
@@ -42,7 +42,7 @@ def upsert(file: Path, override_editable: bool) -> None:
     with get_default_graph(ClientMode.CLI) as emitter:
         for group_config in group_configs:
             try:
-                datahub_group = CorpGroup.parse_obj(group_config)
+                datahub_group = CorpGroup.model_validate(group_config)
                 for mcp in datahub_group.generate_mcp(
                     generation_config=CorpGroupGenerationConfig(
                         override_editable=override_editable, datahub_graph=emitter

datahub/cli/specific/structuredproperties_cli.py CHANGED
@@ -85,7 +85,7 @@ def list(details: bool, to_file: str) -> None:
         with open(file, "r") as fp:
             existing_objects = yaml.load(fp)  # this is a list of dicts
             existing_objects = [
-                StructuredProperties.parse_obj(obj) for obj in existing_objects
+                StructuredProperties.model_validate(obj) for obj in existing_objects
             ]
             objects = [obj for obj in objects]
             # do a positional update of the existing objects

datahub/cli/specific/user_cli.py CHANGED
@@ -42,7 +42,7 @@ def upsert(file: Path, override_editable: bool) -> None:
     with get_default_graph(ClientMode.CLI) as emitter:
         for user_config in user_configs:
             try:
-                datahub_user: CorpUser = CorpUser.parse_obj(user_config)
+                datahub_user: CorpUser = CorpUser.model_validate(user_config)
 
                 emitter.emit_all(
                     datahub_user.generate_mcp(

datahub/configuration/common.py CHANGED
@@ -140,6 +140,18 @@ class ConfigModel(BaseModel):
 
     @classmethod
     def parse_obj_allow_extras(cls, obj: Any) -> Self:
+        """Parse an object while allowing extra fields.
+
+        'parse_obj' in Pydantic v1 is equivalent to 'model_validate' in Pydantic v2.
+        However, 'parse_obj_allow_extras' in v1 is not directly available in v2.
+
+        `model_validate(..., strict=False)` does not work because it still raises errors on extra fields;
+        strict=False only affects type coercion and validation strictness, not extra field handling.
+
+        This method temporarily modifies the model's configuration to allow extra fields
+
+        TODO: Do we really need to support this behaviour? Consider removing this method in future.
+        """
         if PYDANTIC_VERSION_2:
             try:
                 with unittest.mock.patch.dict(
@@ -148,12 +160,12 @@ class ConfigModel(BaseModel):
                     clear=False,
                 ):
                     cls.model_rebuild(force=True)  # type: ignore
-                    return cls.parse_obj(obj)
+                    return cls.model_validate(obj)
             finally:
                 cls.model_rebuild(force=True)  # type: ignore
         else:
             with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow):
-                return cls.parse_obj(obj)
+                return cls.model_validate(obj)
 
 
 class PermissiveConfigModel(ConfigModel):
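
Note on the docstring added above: in Pydantic v2 the rejection of unknown keys is governed by the model's `extra` setting, not by `strict`, which is why the helper patches the model config rather than passing strict=False. A hedged sketch with a hypothetical model (not DataHub code):

import pydantic


class StrictConfig(pydantic.BaseModel):
    model_config = pydantic.ConfigDict(extra="forbid")
    name: str


try:
    StrictConfig.model_validate({"name": "x", "unknown": 1})
except pydantic.ValidationError:
    pass  # extra key rejected; strict=False would not change this


# Allowing extras requires changing the model's config, which is what
# parse_obj_allow_extras patches temporarily via unittest.mock.
class LenientConfig(StrictConfig):
    model_config = pydantic.ConfigDict(extra="allow")


obj = LenientConfig.model_validate({"name": "x", "unknown": 1})  # accepted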

datahub/configuration/connection_resolver.py CHANGED
@@ -1,6 +1,6 @@
 from typing import TYPE_CHECKING, Type
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.ingestion.api.global_context import get_graph_context
 
@@ -40,4 +40,4 @@ def auto_connection_resolver(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple validators do not overwrite each other.
     _resolve_connection.__name__ = f"{_resolve_connection.__name__}_{connection_field}"
-    return pydantic.root_validator(pre=True, allow_reuse=True)(_resolve_connection)
+    return model_validator(mode="before")(_resolve_connection)
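
Note: this file illustrates the recurring replacement of root_validator(pre=True, allow_reuse=True) with model_validator(mode="before"). A minimal sketch of the v2 idiom on a hypothetical model (not the DataHub helper itself):

from typing import Any

from pydantic import BaseModel, model_validator


class MyConfig(BaseModel):  # hypothetical; not the DataHub helper itself
    host: str
    port: int = 9092

    # v1: @root_validator(pre=True, allow_reuse=True)
    # v2: a "before" model validator sees the raw input before field validation.
    @model_validator(mode="before")
    @classmethod
    def fill_default_host(cls, values: Any) -> Any:
        if isinstance(values, dict) and "host" not in values:
            values = {**values, "host": "localhost"}
        return values


assert MyConfig.model_validate({"port": 1234}).host == "localhost"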

datahub/configuration/git.py CHANGED
@@ -1,7 +1,14 @@
 import pathlib
+from copy import deepcopy
 from typing import Any, Dict, Optional, Union
 
-from pydantic import Field, FilePath, SecretStr, validator
+from pydantic import (
+    Field,
+    FilePath,
+    SecretStr,
+    field_validator,
+    model_validator,
+)
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -41,7 +48,8 @@ class GitReference(ConfigModel):
         transform=lambda url: _GITHUB_URL_TEMPLATE,
     )
 
-    @validator("repo", pre=True)
+    @field_validator("repo", mode="before")
+    @classmethod
     def simplify_repo_url(cls, repo: str) -> str:
         if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
             repo = f"https://{repo}"
@@ -53,21 +61,22 @@ class GitReference(ConfigModel):
 
         return repo
 
-    @validator("url_template", always=True)
-    def infer_url_template(cls, url_template: Optional[str], values: dict) -> str:
-        if url_template is not None:
-            return url_template
+    @model_validator(mode="after")
+    def infer_url_template(self) -> "GitReference":
+        if self.url_template is not None:
+            return self
 
-        repo: str = values["repo"]
-        if repo.startswith(_GITHUB_PREFIX):
-            return _GITHUB_URL_TEMPLATE
-        elif repo.startswith(_GITLAB_PREFIX):
-            return _GITLAB_URL_TEMPLATE
+        if self.repo.startswith(_GITHUB_PREFIX):
+            self.url_template = _GITHUB_URL_TEMPLATE
+        elif self.repo.startswith(_GITLAB_PREFIX):
+            self.url_template = _GITLAB_URL_TEMPLATE
         else:
             raise ValueError(
                 "Unable to infer URL template from repo. Please set url_template manually."
             )
 
+        return self
+
     def get_url_for_file_path(self, file_path: str) -> str:
         assert self.url_template
         if self.url_subdir:
@@ -98,35 +107,43 @@ class GitInfo(GitReference):
 
     _fix_deploy_key_newlines = pydantic_multiline_string("deploy_key")
 
-    @validator("deploy_key", pre=True, always=True)
+    @model_validator(mode="before")
+    @classmethod
     def deploy_key_filled_from_deploy_key_file(
-        cls, v: Optional[SecretStr], values: Dict[str, Any]
-    ) -> Optional[SecretStr]:
-        if v is None:
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        # In-place update of the input dict would cause state contamination.
+        # So a deepcopy is performed first.
+        values = deepcopy(values)
+
+        if values.get("deploy_key") is None:
             deploy_key_file = values.get("deploy_key_file")
             if deploy_key_file is not None:
                 with open(deploy_key_file) as fp:
                     deploy_key = SecretStr(fp.read())
-                    return deploy_key
-        return v
-
-    @validator("repo_ssh_locator", always=True)
-    def infer_repo_ssh_locator(
-        cls, repo_ssh_locator: Optional[str], values: dict
-    ) -> str:
-        if repo_ssh_locator is not None:
-            return repo_ssh_locator
-
-        repo: str = values["repo"]
-        if repo.startswith(_GITHUB_PREFIX):
-            return f"git@github.com:{repo[len(_GITHUB_PREFIX) :]}.git"
-        elif repo.startswith(_GITLAB_PREFIX):
-            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX) :]}.git"
+                    values["deploy_key"] = deploy_key
+        return values
+
+    @model_validator(mode="after")
+    def infer_repo_ssh_locator(self) -> "GitInfo":
+        if self.repo_ssh_locator is not None:
+            return self
+
+        if self.repo.startswith(_GITHUB_PREFIX):
+            self.repo_ssh_locator = (
+                f"git@github.com:{self.repo[len(_GITHUB_PREFIX) :]}.git"
+            )
+        elif self.repo.startswith(_GITLAB_PREFIX):
+            self.repo_ssh_locator = (
+                f"git@gitlab.com:{self.repo[len(_GITLAB_PREFIX) :]}.git"
+            )
         else:
             raise ValueError(
                 "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."
             )
 
+        return self
+
     @property
     def branch_for_clone(self) -> Optional[str]:
         # If branch was manually set, we should use it. Otherwise return None.
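
Note: the pattern above, a v1 @validator(..., always=True) reading sibling fields via `values`, becomes a v2 @model_validator(mode="after") that reads and assigns attributes on the constructed model. A hedged sketch with a hypothetical model; the URL template string is illustrative only:

from typing import Optional

from pydantic import BaseModel, model_validator


class RepoRef(BaseModel):  # hypothetical stand-in for GitReference
    repo: str
    url_template: Optional[str] = None

    # v1: @validator("url_template", always=True) with access to `values`
    # v2: an "after" validator runs on the built model and may assign fields directly.
    @model_validator(mode="after")
    def infer_url_template(self) -> "RepoRef":
        if self.url_template is None and self.repo.startswith("https://github.com/"):
            # Illustrative template only; the real _GITHUB_URL_TEMPLATE may differ.
            self.url_template = "{repo_url}/blob/{branch}/{file_path}"
        return self


ref = RepoRef(repo="https://github.com/acryldata/datahub")
assert ref.url_template is not None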

datahub/configuration/import_resolver.py CHANGED
@@ -1,6 +1,6 @@
 from typing import TYPE_CHECKING, Type, TypeVar, Union
 
-import pydantic
+from pydantic import field_validator
 
 from datahub.ingestion.api.registry import import_path
 
@@ -15,4 +15,4 @@ def _pydantic_resolver(cls: Type, v: Union[str, _T]) -> _T:
 
 
 def pydantic_resolve_key(field: str) -> "V1Validator":
-    return pydantic.validator(field, pre=True, allow_reuse=True)(_pydantic_resolver)
+    return field_validator(field, mode="before")(_pydantic_resolver)

datahub/configuration/kafka.py CHANGED
@@ -1,4 +1,4 @@
-from pydantic import Field, validator
+from pydantic import Field, field_validator
 
 from datahub.configuration.common import ConfigModel, ConfigurationError
 from datahub.configuration.env_vars import (
@@ -42,7 +42,8 @@ class _KafkaConnectionConfig(ConfigModel):
         description="The request timeout used when interacting with the Kafka APIs.",
     )
 
-    @validator("bootstrap")
+    @field_validator("bootstrap", mode="after")
+    @classmethod
    def bootstrap_host_colon_port_comma(cls, val: str) -> str:
         for entry in val.split(","):
             validate_host_port(entry)
@@ -57,7 +58,7 @@ class KafkaConsumerConnectionConfig(_KafkaConnectionConfig):
         description="Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .",
     )
 
-    @validator("consumer_config")
+    @field_validator("consumer_config", mode="after")
     @classmethod
     def resolve_callback(cls, value: dict) -> dict:
         if CallableConsumerConfig.is_callable_config(value):
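
Note: a plain @validator becomes @field_validator, with the mode spelled out and @classmethod stacked beneath it, as in the bootstrap check above. A minimal sketch on a hypothetical model:

from pydantic import BaseModel, field_validator


class ConnectionConfig(BaseModel):  # hypothetical; mirrors the bootstrap check above
    bootstrap: str = "localhost:9092"

    # v1: @validator("bootstrap")
    # v2: field_validator, explicit mode="after", stacked with @classmethod.
    @field_validator("bootstrap", mode="after")
    @classmethod
    def host_colon_port_comma(cls, val: str) -> str:
        for entry in val.split(","):
            host, sep, port = entry.partition(":")
            if not host or not sep or not port.isdigit():
                raise ValueError(f"invalid host:port entry: {entry!r}")
        return val


ConnectionConfig(bootstrap="broker1:9092,broker2:9092")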

datahub/configuration/time_window_config.py CHANGED
@@ -1,10 +1,9 @@
 import enum
 from datetime import datetime, timedelta, timezone
-from typing import Any, Dict, List
+from typing import Any, List
 
 import humanfriendly
-import pydantic
-from pydantic.fields import Field
+from pydantic import Field, ValidationInfo, field_validator, model_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.datetimes import parse_absolute_time, parse_relative_timespan
@@ -52,45 +51,46 @@ class BaseTimeWindowConfig(ConfigModel):
         description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.",
     )  # type: ignore
 
-    @pydantic.validator("start_time", pre=True, always=True)
-    def default_start_time(
-        cls, v: Any, values: Dict[str, Any], **kwargs: Any
-    ) -> datetime:
-        if v is None:
-            return get_time_bucket(
-                values["end_time"]
-                - get_bucket_duration_delta(values["bucket_duration"]),
-                values["bucket_duration"],
-            )
-        elif isinstance(v, str):
+    @field_validator("start_time", mode="before")
+    @classmethod
+    def parse_start_time(cls, v: Any, info: ValidationInfo) -> Any:
+        if isinstance(v, str):
             # This is where start_time str is resolved to datetime
             try:
                 delta = parse_relative_timespan(v)
                 assert delta < timedelta(0), (
                     "Relative start time should start with minus sign (-) e.g. '-2 days'."
                 )
-                assert abs(delta) >= get_bucket_duration_delta(
-                    values["bucket_duration"]
-                ), (
+                bucket_duration = info.data.get("bucket_duration", BucketDuration.DAY)
+                assert abs(delta) >= get_bucket_duration_delta(bucket_duration), (
                     "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
                 )
 
-                # The end_time's default value is not yet populated, in which case
-                # we can just manually generate it here.
-                if "end_time" not in values:
-                    values["end_time"] = datetime.now(tz=timezone.utc)
+                # We need end_time, but it might not be set yet
+                # In that case, we'll use the default
+                end_time = info.data.get("end_time")
+                if end_time is None:
+                    end_time = datetime.now(tz=timezone.utc)
 
-                return get_time_bucket(
-                    values["end_time"] + delta, values["bucket_duration"]
-                )
+                return get_time_bucket(end_time + delta, bucket_duration)
             except humanfriendly.InvalidTimespan:
                 # We do not floor start_time to the bucket start time if absolute start time is specified.
                 # If user has specified absolute start time in recipe, it's most likely that he means it.
                 return parse_absolute_time(v)
-
         return v
 
-    @pydantic.validator("start_time", "end_time")
+    @model_validator(mode="after")
+    def default_start_time(self) -> "BaseTimeWindowConfig":
+        # Only calculate start_time if it was None (not provided by user)
+        if self.start_time is None:
+            self.start_time = get_time_bucket(
+                self.end_time - get_bucket_duration_delta(self.bucket_duration),
+                self.bucket_duration,
+            )
+        return self
+
+    @field_validator("start_time", "end_time", mode="after")
+    @classmethod
     def ensure_timestamps_in_utc(cls, v: datetime) -> datetime:
         if v.tzinfo is None:
             raise ValueError(
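
Note: v2 field validators no longer receive a `values` dict; previously validated fields are exposed via ValidationInfo.data, and cross-field defaulting moves into a model_validator(mode="after"), which is the split applied above. A hedged sketch with a hypothetical model:

from datetime import datetime, timedelta, timezone
from typing import Any, Optional

from pydantic import BaseModel, Field, ValidationInfo, field_validator, model_validator


class Window(BaseModel):  # hypothetical; mirrors the start_time/end_time split above
    end_time: datetime = Field(default_factory=lambda: datetime.now(tz=timezone.utc))
    start_time: Optional[datetime] = None

    @field_validator("start_time", mode="before")
    @classmethod
    def parse_relative(cls, v: Any, info: ValidationInfo) -> Any:
        # info.data holds fields already validated (declaration order), replacing v1's `values`.
        if isinstance(v, str) and v.startswith("-") and v.endswith("d"):
            end_time = info.data.get("end_time") or datetime.now(tz=timezone.utc)
            return end_time - timedelta(days=int(v[1:-1]))
        return v

    @model_validator(mode="after")
    def default_start_time(self) -> "Window":
        if self.start_time is None:
            self.start_time = self.end_time - timedelta(days=1)
        return self


assert Window(start_time="-7d").start_time is not None
assert Window().start_time is not None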

datahub/configuration/validate_field_deprecation.py CHANGED
@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Optional, Type
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning
@@ -34,4 +34,4 @@ def pydantic_field_deprecated(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field deprecated do not overwrite each other.
     _validate_deprecated.__name__ = f"{_validate_deprecated.__name__}_{field}"
-    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_deprecated)
+    return model_validator(mode="before")(_validate_deprecated)

datahub/configuration/validate_field_removal.py CHANGED
@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Type
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.configuration.common import ConfigurationWarning
 
@@ -31,4 +31,4 @@ def pydantic_removed_field(
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field removals do not overwrite each other.
     _validate_field_removal.__name__ = f"{_validate_field_removal.__name__}_{field}"
-    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_removal)
+    return model_validator(mode="before")(_validate_field_removal)

datahub/configuration/validate_field_rename.py CHANGED
@@ -1,7 +1,7 @@
 import warnings
 from typing import TYPE_CHECKING, Callable, Type, TypeVar
 
-import pydantic
+from pydantic import model_validator
 
 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning
@@ -52,4 +52,4 @@ def pydantic_renamed_field(
     # validator with pre=True gets all the values that were passed in.
     # Given that a renamed field doesn't show up in the fields list, we can't use
     # the field-level validator, even with a different field name.
-    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
+    return model_validator(mode="before")(_validate_field_rename)

datahub/configuration/validate_multiline_string.py CHANGED
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING, Optional, Type, Union
 
 import pydantic
+from pydantic import field_validator
 
 if TYPE_CHECKING:
     from pydantic.deprecated.class_validators import V1Validator
@@ -31,4 +32,4 @@ def pydantic_multiline_string(field: str) -> "V1Validator":
     # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
     # This hack ensures that multiple field deprecated do not overwrite each other.
     _validate_field.__name__ = f"{_validate_field.__name__}_{field}"
-    return pydantic.validator(field, pre=True, allow_reuse=True)(_validate_field)
+    return field_validator(field, mode="before")(_validate_field)

datahub/emitter/kafka_emitter.py CHANGED
@@ -6,6 +6,7 @@ from confluent_kafka import SerializingProducer
 from confluent_kafka.schema_registry import SchemaRegistryClient
 from confluent_kafka.schema_registry.avro import AvroSerializer
 from confluent_kafka.serialization import SerializationContext, StringSerializer
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.kafka import KafkaProducerConnectionConfig
@@ -49,7 +50,8 @@ class KafkaEmitterConfig(ConfigModel):
         },
     )
 
-    @pydantic.validator("topic_routes")
+    @field_validator("topic_routes", mode="after")
+    @classmethod
     def validate_topic_routes(cls, v: Dict[str, str]) -> Dict[str, str]:
         assert MCE_KEY in v, f"topic_routes must contain a route for {MCE_KEY}"
         assert MCP_KEY in v, f"topic_routes must contain a route for {MCP_KEY}"

datahub/emitter/rest_emitter.py CHANGED
@@ -145,8 +145,7 @@ class EmitMode(ConfigEnum):
     ASYNC_WAIT = auto()
 
 
-_DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
-    EmitMode,
+_DEFAULT_EMIT_MODE = pydantic.TypeAdapter(EmitMode).validate_python(
     get_emit_mode() or EmitMode.SYNC_PRIMARY,
 )
 
@@ -156,8 +155,7 @@ class RestSinkEndpoint(ConfigEnum):
     OPENAPI = auto()
 
 
-DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
-    RestSinkEndpoint,
+DEFAULT_REST_EMITTER_ENDPOINT = pydantic.TypeAdapter(RestSinkEndpoint).validate_python(
     get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
 )
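
Note: pydantic.parse_obj_as is deprecated in Pydantic v2; TypeAdapter is the replacement used in the two hunks above. A minimal sketch with a hypothetical enum:

from enum import Enum

import pydantic


class Endpoint(str, Enum):  # hypothetical stand-in for RestSinkEndpoint
    RESTLI = "RESTLI"
    OPENAPI = "OPENAPI"


# v1: pydantic.parse_obj_as(Endpoint, "OPENAPI")
# v2: TypeAdapter wraps an arbitrary type and exposes validate_python()/validate_json().
endpoint = pydantic.TypeAdapter(Endpoint).validate_python("OPENAPI")
assert endpoint is Endpoint.OPENAPI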
 

datahub/ingestion/api/decorators.py CHANGED
@@ -17,7 +17,7 @@ def config_class(config_cls: Type) -> Callable[[Type], Type]:
     """Adds a get_config_class method to the decorated class"""
 
     def default_create(cls: Type, config_dict: Dict, ctx: PipelineContext) -> Type:
-        config = config_cls.parse_obj(config_dict)
+        config = config_cls.model_validate(config_dict)
         return cls(config=config, ctx=ctx)
 
     def wrapper(cls: Type) -> Type:

datahub/ingestion/api/report.py CHANGED
@@ -65,7 +65,7 @@ class Report(SupportsAsObj):
         if isinstance(some_val, SupportsAsObj):
             return some_val.as_obj()
         elif isinstance(some_val, pydantic.BaseModel):
-            return Report.to_pure_python_obj(some_val.dict())
+            return Report.to_pure_python_obj(some_val.model_dump())
         elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type):
             # The `is_dataclass` function returns `True` for both instances and classes.
             # We need an extra check to ensure an instance was passed in.

datahub/ingestion/api/sink.py CHANGED
@@ -123,7 +123,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "Self":
-        return cls(ctx, cls.get_config_class().parse_obj(config_dict))
+        return cls(ctx, cls.get_config_class().model_validate(config_dict))
 
     def handle_work_unit_start(self, workunit: WorkUnit) -> None:
         """Called at the start of each new workunit.

datahub/ingestion/api/source.py CHANGED
@@ -480,7 +480,7 @@ class Extractor(Generic[WorkUnitType, ExtractorConfig], Closeable, metaclass=ABC
         config_class = self.get_config_class()
 
         self.ctx = ctx
-        self.config = config_class.parse_obj(config_dict)
+        self.config = config_class.model_validate(config_dict)
 
     @abstractmethod
     def get_records(self, workunit: WorkUnitType) -> Iterable[RecordEnvelope]:

datahub/ingestion/glossary/datahub_classifier.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional
 from datahub_classify.helper_classes import ColumnInfo
 from datahub_classify.infotype_predictor import predict_infotypes
 from datahub_classify.reference_input import input1 as default_config
-from pydantic import validator
+from pydantic import field_validator
 from pydantic.fields import Field
 
 from datahub.configuration.common import ConfigModel
@@ -90,7 +90,7 @@ class InfoTypeConfig(ConfigModel):
 
 
 DEFAULT_CLASSIFIER_CONFIG = {
-    k: InfoTypeConfig.parse_obj(v) for k, v in default_config.items()
+    k: InfoTypeConfig.model_validate(v) for k, v in default_config.items()
 }
 
 
@@ -114,8 +114,11 @@ class DataHubClassifierConfig(ConfigModel):
         description="Minimum number of non-null column values required to process `values` prediction factor.",
     )
 
-    @validator("info_types_config")
-    def input_config_selectively_overrides_default_config(cls, info_types_config):
+    @field_validator("info_types_config", mode="after")
+    @classmethod
+    def input_config_selectively_overrides_default_config(
+        cls, info_types_config: Dict[str, Any]
+    ) -> Dict[str, Any]:
         for infotype, infotype_config in DEFAULT_CLASSIFIER_CONFIG.items():
             if infotype not in info_types_config:
                 # if config for some info type is not provided by user, use default config for that info type.
@@ -125,7 +128,7 @@ class DataHubClassifierConfig(ConfigModel):
                 # use default config for that prediction factor.
                 for factor, weight in (
                     info_types_config[infotype]
-                    .Prediction_Factors_and_Weights.dict()
+                    .Prediction_Factors_and_Weights.model_dump()
                     .items()
                 ):
                     if (
@@ -146,7 +149,7 @@ class DataHubClassifierConfig(ConfigModel):
             for (
                 factor,
                 weight,
-            ) in custom_infotype_config.Prediction_Factors_and_Weights.dict().items():
+            ) in custom_infotype_config.Prediction_Factors_and_Weights.model_dump().items():
                 if weight > 0:
                     assert getattr(custom_infotype_config, factor) is not None, (
                         f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}"
@@ -173,7 +176,7 @@ class DataHubClassifier(Classifier):
     def create(cls, config_dict: Optional[Dict[str, Any]]) -> "DataHubClassifier":
         # This could be replaced by parsing to particular class, if required
         if config_dict is not None:
-            config = DataHubClassifierConfig.parse_obj(config_dict)
+            config = DataHubClassifierConfig.model_validate(config_dict)
         else:
             config = DataHubClassifierConfig()
         return cls(config)
@@ -183,7 +186,7 @@ class DataHubClassifier(Classifier):
             column_infos=columns,
             confidence_level_threshold=self.config.confidence_level_threshold,
             global_config={
-                k: v.dict() for k, v in self.config.info_types_config.items()
+                k: v.model_dump() for k, v in self.config.info_types_config.items()
             },
             infotypes=self.config.info_types,
             minimum_values_threshold=self.config.minimum_values_threshold,

datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py CHANGED
@@ -82,7 +82,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
         ctx: PipelineContext,
         sink: Sink,
     ) -> PipelineRunListener:
-        reporter_config = DatahubIngestionRunSummaryProviderConfig.parse_obj(
+        reporter_config = DatahubIngestionRunSummaryProviderConfig.model_validate(
             config_dict or {}
         )
         if reporter_config.sink:

datahub/ingestion/reporting/file_reporter.py CHANGED
@@ -2,7 +2,7 @@ import json
 import logging
 from typing import Any, Dict
 
-from pydantic import validator
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
@@ -16,8 +16,9 @@ class FileReporterConfig(ConfigModel):
     filename: str
     format: str = "json"
 
-    @validator("format")
-    def only_json_supported(cls, v):
+    @field_validator("format", mode="after")
+    @classmethod
+    def only_json_supported(cls, v: str) -> str:
         if v and v.lower() != "json":
             raise ValueError(
                 f"Format {v} is not yet supported. Only json is supported at this time"
@@ -33,7 +34,7 @@ class FileReporter(PipelineRunListener):
         ctx: PipelineContext,
         sink: Sink,
     ) -> PipelineRunListener:
-        reporter_config = FileReporterConfig.parse_obj(config_dict)
+        reporter_config = FileReporterConfig.model_validate(config_dict)
         return cls(reporter_config)
 
     def __init__(self, reporter_config: FileReporterConfig) -> None: