patito-0.6.2-py3-none-any.whl → patito-0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
patito/validators.py CHANGED
@@ -2,14 +2,10 @@
 
 from __future__ import annotations
 
+from collections.abc import Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
-    Optional,
-    Sequence,
-    Type,
-    Union,
-    _UnionGenericAlias,
     cast,
 )
 
@@ -18,6 +14,7 @@ from pydantic.aliases import AliasGenerator
 from typing_extensions import get_args
 
 from patito._pydantic.dtypes import is_optional
+from patito._pydantic.dtypes.utils import unwrap_optional
 from patito.exceptions import (
     ColumnDTypeError,
     DataFrameValidationError,
@@ -57,29 +54,6 @@ VALID_POLARS_TYPES = {
 }
 
 
-def _dewrap_optional(type_annotation: Type[Any] | Any) -> Type:
-    """Return the inner, wrapped type of an Optional.
-
-    Is a no-op for non-Optional types.
-
-    Args:
-        type_annotation: The type annotation to be dewrapped.
-
-    Returns:
-        The input type, but with the outermost Optional removed.
-
-    """
-    return (
-        next(  # pragma: no cover
-            valid_type
-            for valid_type in get_args(type_annotation)
-            if valid_type is not type(None)  # noqa: E721
-        )
-        if is_optional(type_annotation)
-        else type_annotation
-    )
-
-
 def _transform_df(dataframe: pl.DataFrame, schema: type[Model]) -> pl.DataFrame:
     """Transform any properties of the dataframe according to the model.
 
@@ -109,8 +83,8 @@ def _transform_df(dataframe: pl.DataFrame, schema: type[Model]) -> pl.DataFrame:
 
 def _find_errors(  # noqa: C901
     dataframe: pl.DataFrame,
-    schema: Type[Model],
-    columns: Optional[Sequence[str]] = None,
+    schema: type[Model],
+    columns: Sequence[str] | None = None,
     allow_missing_columns: bool = False,
     allow_superfluous_columns: bool = False,
 ) -> list[ErrorWrapper]:
@@ -144,6 +118,10 @@ def _find_errors(  # noqa: C901
     if not allow_missing_columns:
         # Check if any columns are missing
         for missing_column in set(schema_subset) - set(dataframe.columns):
+            col_info = schema.column_infos.get(missing_column)
+            if col_info is not None and col_info.allow_missing:
+                continue
+
             errors.append(
                 ErrorWrapper(
                     MissingColumnsError("Missing column"),
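
The new `allow_missing` guard lets a model exempt individual columns from the missing-column check. A minimal usage sketch, assuming the 0.8.0 `pt.Field(allow_missing=True)` option is what populates the `ColumnInfo.allow_missing` flag consulted above:

import patito as pt
import polars as pl

class Product(pt.Model):
    name: str
    # Assumed 0.8.0 field option feeding ColumnInfo.allow_missing:
    stock: int = pt.Field(allow_missing=True)

# Passes even though the "stock" column is absent from the dataframe:
Product.validate(pl.DataFrame({"name": ["apple"]}))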
@@ -185,7 +163,7 @@ def _find_errors(  # noqa: C901
 
         # Retrieve the annotation of the list itself,
         # dewrapping any potential Optional[...]
-        list_type = _dewrap_optional(annotation)
+        list_type = unwrap_optional(annotation)
 
         # Check if the list items themselves should be considered nullable
         item_type = get_args(list_type)[0]
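
`unwrap_optional`, imported from `patito._pydantic.dtypes.utils`, takes over for the deleted `_dewrap_optional`. Assuming it keeps the same contract as the removed helper (strip one outer Optional, no-op otherwise), its behavior would be:

from typing import Optional

from patito._pydantic.dtypes.utils import unwrap_optional

# Mirrors the removed docstring: the outermost Optional is stripped ...
assert unwrap_optional(Optional[list[int]]) == list[int]
# ... and non-Optional types pass through unchanged.
assert unwrap_optional(int) is int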
@@ -197,6 +175,8 @@ def _find_errors(  # noqa: C901
             .select(column)
             # Remove those rows that do not contain lists at all
             .filter(pl.col(column).is_not_null())
+            # Remove empty lists
+            .filter(pl.col(column).list.len() > 0)
             # Convert lists of N items to N individual rows
             .explode(column)
             # Calculate how many nulls are present in lists
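
The added length filter matters because Polars explodes an empty list into a single null row, which the null count below would otherwise misread as a null list item. A small demonstration of the behavior being guarded against:

import polars as pl

df = pl.DataFrame({"a": [[], [1, 2]]})
# The empty list becomes one null row after explode, even though it
# contains no null items:
print(df.explode("a"))  # column "a" holds: null, 1, 2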
@@ -220,24 +200,31 @@ def _find_errors(  # noqa: C901
     valid_dtypes = schema.valid_dtypes
     dataframe_datatypes = dict(zip(dataframe.columns, dataframe.dtypes))
     for column_name, column_properties in schema._schema_properties().items():
+        # We rename to _tmp here to avoid overwriting the dataframe during filters below
+        # TODO! Really we should be passing *Series* around rather than the entire dataframe
+        dataframe_tmp = dataframe
         column_info = schema.column_infos[column_name]
-        if column_name not in dataframe.columns or column_name not in column_subset:
+        if column_name not in dataframe_tmp.columns or column_name not in column_subset:
             continue
 
         polars_type = dataframe_datatypes[column_name]
-        if polars_type not in valid_dtypes[column_name]:
-            errors.append(
-                ErrorWrapper(
-                    ColumnDTypeError(
-                        f"Polars dtype {polars_type} does not match model field type."
-                    ),
-                    loc=column_name,
+        if polars_type not in [
+            pl.Struct,
+            pl.List(pl.Struct),
+        ]:  # defer struct validation for recursive call to _find_errors later
+            if polars_type not in valid_dtypes[column_name]:
+                errors.append(
+                    ErrorWrapper(
+                        ColumnDTypeError(
+                            f"Polars dtype {polars_type} does not match model field type."
+                        ),
+                        loc=column_name,
+                    )
                 )
-            )
 
         # Test for when only specific values are accepted
         e = _find_enum_errors(
-            df=dataframe,
+            df=dataframe_tmp,
             column_name=column_name,
             props=column_properties,
             schema=schema,
@@ -247,7 +234,7 @@ def _find_errors(  # noqa: C901
 
         if column_info.unique:
             # Coalescing to 0 in the case of dataframe of height 0
-            num_duplicated = dataframe[column_name].is_duplicated().sum() or 0
+            num_duplicated = dataframe_tmp[column_name].is_duplicated().sum() or 0
             if num_duplicated > 0:
                 errors.append(
                     ErrorWrapper(
@@ -259,19 +246,31 @@
         # Intercept struct columns, and process errors separately
         if schema.dtypes[column_name] == pl.Struct:
             nested_schema = schema.model_fields[column_name].annotation
-
+            assert nested_schema is not None
             # Additional unpack required if structs column is optional
-            if type(nested_schema) == _UnionGenericAlias:
-                nested_schema = nested_schema.__args__[0]
-
-            # We need to filter out any null rows as the submodel won't know
-            # that all of a row's columns may be null
-            dataframe = dataframe.filter(pl.col(column_name).is_not_null())
-            if dataframe.is_empty():
+            if is_optional(nested_schema):
+                nested_schema = unwrap_optional(nested_schema)
+
+            # An optional struct means that we allow the struct entry to be
+            # null. It is the inner model that is responsible for determining
+            # whether its fields are optional or not. Since the struct is optional,
+            # we need to filter out any null rows as the inner model may disallow
+            # nulls on a particular field
+
+            # NB As of Polars 1.1, struct_col.is_null() cannot return True
+            # The following code has been added to accomodate this
+
+            struct_fields = dataframe_tmp[column_name].struct.fields
+            col_struct = pl.col(column_name).struct
+            only_non_null_expr = ~pl.all_horizontal(
+                [col_struct.field(name).is_null() for name in struct_fields]
+            )
+            dataframe_tmp = dataframe_tmp.filter(only_non_null_expr)
+            if dataframe_tmp.is_empty():
                 continue
 
             struct_errors = _find_errors(
-                dataframe=dataframe.select(column_name).unnest(column_name),
+                dataframe=dataframe_tmp.select(column_name).unnest(column_name),
                 schema=nested_schema,
             )
 
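A standalone sketch of the workaround adopted above: since row-level `is_null()` stopped reporting null structs (per the code comment), a struct row is treated as null exactly when all of its fields are null, recovered with `pl.all_horizontal`:

import polars as pl

df = pl.DataFrame({"s": [{"x": 1, "y": 2}, None]})
fields = df["s"].struct.fields
all_null = pl.all_horizontal(
    pl.col("s").struct.field(name).is_null() for name in fields
)
# Keeps only the rows where the struct value is actually present:
print(df.filter(~all_null))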
@@ -286,22 +285,36 @@
 
         # Intercept list of structs columns, and process errors separately
         elif schema.dtypes[column_name] == pl.List(pl.Struct):
-            nested_schema = schema.model_fields[column_name].annotation.__args__[0]
+            list_annotation = schema.model_fields[column_name].annotation
+            assert list_annotation is not None
 
-            # Additional unpack required if structs column is optional
-            if type(nested_schema) == _UnionGenericAlias:
-                nested_schema = nested_schema.__args__[0]
+            # Handle Optional[list[pl.Struct]]
+            if is_optional(list_annotation):
+                list_annotation = unwrap_optional(list_annotation)
 
-            # We need to filter out any null rows as the submodel won't know
-            # that all of a row's columns may be null
-            dataframe = dataframe.filter(pl.col(column_name).is_not_null())
-            if dataframe.is_empty():
+            dataframe_tmp = dataframe_tmp.filter(pl.col(column_name).is_not_null())
+            if dataframe_tmp.is_empty():
                 continue
 
-            list_struct_errors = _find_errors(
-                dataframe=dataframe.select(column_name)
+            # Unpack list schema
+            nested_schema = list_annotation.__args__[0]
+
+            dataframe_tmp = (
+                dataframe_tmp.select(column_name)
                 .explode(column_name)
-                .unnest(column_name),
+                .unnest(column_name)
+            )
+
+            # Handle list[Optional[pl.Struct]]
+            if is_optional(nested_schema):
+                nested_schema = unwrap_optional(nested_schema)
+
+                dataframe_tmp = dataframe_tmp.filter(pl.all().is_not_null())
+                if dataframe_tmp.is_empty():
+                    continue
+
+            list_struct_errors = _find_errors(
+                dataframe=dataframe_tmp,
                 schema=nested_schema,
             )
 
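The rewritten branch distinguishes `Optional[list[Struct]]` (null entries dropped before exploding) from `list[Optional[Struct]]` (null items dropped after exploding). A hedged sketch of a model exercising both shapes; `Point` and `Path` are illustrative names, and whether this exact nesting validates depends on patito's dtype resolution:

from typing import Optional

import patito as pt
import polars as pl

class Point(pt.Model):
    x: int

class Path(pt.Model):
    # Outer Optional: the whole list may be null.
    # Inner Optional: individual list items may be null.
    points: Optional[list[Optional[Point]]] = None

# A null list, a null item, and a concrete struct should all pass:
Path.validate(pl.DataFrame({"points": [None, [None, {"x": 1}]]}))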
@@ -344,7 +357,7 @@ def _find_errors(  # noqa: C901
         if checks:
             n_invalid_rows = 0
             for check in checks:
-                lazy_df = dataframe.lazy()
+                lazy_df = dataframe_tmp.lazy()
                 lazy_df = lazy_df.filter(
                     ~check
                 )  # get failing rows (nulls will evaluate to null on boolean check, we only want failures (false)))
@@ -370,11 +383,11 @@ def _find_errors(  # noqa: C901
             )
             if "_" in constraints.meta.root_names():
                 # An underscore is an alias for the current field
-                illegal_rows = dataframe.with_columns(
+                illegal_rows = dataframe_tmp.with_columns(
                     pl.col(column_name).alias("_")
                 ).filter(constraints)
             else:
-                illegal_rows = dataframe.filter(constraints)
+                illegal_rows = dataframe_tmp.filter(constraints)
             if illegal_rows.height > 0:
                 errors.append(
                     ErrorWrapper(
@@ -391,11 +404,23 @@ def _find_errors(  # noqa: C901
 
 
 def _find_enum_errors(
-    df: pl.DataFrame, column_name: str, props: dict[str, Any], schema: Type[Model]
+    df: pl.DataFrame, column_name: str, props: dict[str, Any], schema: type[Model]
 ) -> ErrorWrapper | None:
     if "enum" not in props:
         if "items" in props and "enum" in props["items"]:
             return _find_enum_errors(df, column_name, props["items"], schema)
+        for item in props.get("anyOf", []):
+            if "enum" in item:
+                return _find_enum_errors(df, column_name, item, schema)
+            if (
+                "$ref" in item
+            ):  # If the item is a reference to another definition pass it as the properties
+                return _find_enum_errors(
+                    df,
+                    column_name,
+                    schema.model_json_schema()["$defs"][item["$ref"]],
+                    schema,
+                )
         return None
     permissible_values = set(props["enum"])
     if column_name in schema.nullable_columns:
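
The new `anyOf` walk accounts for how pydantic v2 serializes optional enums: the permissible values live behind a `$ref` into `$defs` rather than directly on the property. For example:

from enum import Enum
from typing import Optional

from pydantic import BaseModel

class Color(Enum):
    RED = "red"
    BLUE = "blue"

class Example(BaseModel):
    color: Optional[Color] = None

# No "enum" key on the property itself; it is reached via anyOf -> $ref:
print(Example.model_json_schema()["properties"]["color"])
# {'anyOf': [{'$ref': '#/$defs/Color'}, {'type': 'null'}], 'default': None}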
@@ -414,12 +439,13 @@ def _find_enum_errors(
 
 
 def validate(
-    dataframe: Union["pd.DataFrame", pl.DataFrame],
-    schema: Type[Model],
-    columns: Optional[Sequence[str]] = None,
+    dataframe: pd.DataFrame | pl.DataFrame,
+    schema: type[Model],
+    columns: Sequence[str] | None = None,
     allow_missing_columns: bool = False,
     allow_superfluous_columns: bool = False,
-) -> None:
+    drop_superfluous_columns: bool = False,
+) -> pl.DataFrame:
     """Validate the given dataframe.
 
     Args:
@@ -429,17 +455,29 @@ def validate(
             of the dataframe will be validated.
         allow_missing_columns: If True, missing columns will not be considered an error.
         allow_superfluous_columns: If True, additional columns will not be considered an error.
+        drop_superfluous_columns: If True, drop any columns not specified in the schema before validation.
 
     Raises:
         DataFrameValidationError: If the given dataframe does not match the given schema.
 
     """
+    if drop_superfluous_columns and columns:
+        raise ValueError(
+            "Cannot specify both 'columns' and 'drop_superfluous_columns'."
+        )
+
     if _PANDAS_AVAILABLE and isinstance(dataframe, pd.DataFrame):
         polars_dataframe = pl.from_pandas(dataframe)
     else:
-        polars_dataframe = cast(pl.DataFrame, dataframe)
+        polars_dataframe = cast(pl.DataFrame, dataframe).clone()
 
     polars_dataframe = _transform_df(polars_dataframe, schema)
+
+    if drop_superfluous_columns:
+        # NOTE: dropping rather than selecting to get the correct error messages
+        to_drop = set(dataframe.columns) - set(schema.columns)
+        polars_dataframe = polars_dataframe.drop(to_drop)
+
     errors = _find_errors(
         dataframe=polars_dataframe,
         schema=schema,
@@ -449,3 +487,5 @@
     )
     if errors:
         raise DataFrameValidationError(errors=errors, model=schema)
+
+    return polars_dataframe
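
Taken together, `validate` now returns the cloned, transformed, and validated dataframe rather than `None`, and can drop unknown columns up front. A usage sketch, assuming `Model.validate` forwards the new keyword:

import patito as pt
import polars as pl

class Product(pt.Model):
    name: str
    price: float

df = pl.DataFrame({"name": ["apple"], "price": [1.5], "internal_id": [42]})

# Without the flag this would fail validation with a superfluous-column
# error; with it, the extra column is dropped before validation and the
# validated frame is returned:
validated = Product.validate(df, drop_superfluous_columns=True)
print(validated.columns)  # ['name', 'price']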
{patito-0.6.2.dist-info → patito-0.8.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: patito
-Version: 0.6.2
+Version: 0.8.0
 Summary: A dataframe modelling library built on top of polars and pydantic.
 Home-page: https://github.com/JakobGM/patito
 License: MIT
@@ -13,14 +13,16 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: caching
 Provides-Extra: docs
 Provides-Extra: pandas
 Requires-Dist: Sphinx (<7) ; extra == "docs"
 Requires-Dist: pandas ; extra == "pandas"
-Requires-Dist: polars (>=0.20.1)
+Requires-Dist: polars (>=1.10.0)
+Requires-Dist: pre-commit (>=3.8.0,<4.0.0)
 Requires-Dist: pyarrow (>=5.0.0) ; extra == "caching"
-Requires-Dist: pydantic (>=2.4.1)
+Requires-Dist: pydantic (>=2.7.0)
 Requires-Dist: sphinx-autobuild ; extra == "docs"
 Requires-Dist: sphinx-autodoc-typehints ; extra == "docs"
 Requires-Dist: sphinx-rtd-theme ; extra == "docs"
@@ -74,7 +76,7 @@ pip install patito
 
 ## Documentation
 
-The full documentation of Patio can be found [here](https://patito.readthedocs.io).
+The full documentation of Patito can be found [here](https://patito.readthedocs.io).
 
 ## 👮 Data validation
 
patito-0.8.0.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+patito/__init__.py,sha256=4qD13kfoa85_kyTCChm3xQcKKzIy3G8AZQp8T_bjcmo,844
+patito/_docs.py,sha256=9mfttyylWpqaOZv8xfDMEwCHHaY7GQwfyI7CDg7tWe8,162
+patito/_pydantic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+patito/_pydantic/column_info.py,sha256=RZdEdv41Z34t1CVewKlOSjnvgBF2bfriT_FxkDiaRBI,5442
+patito/_pydantic/dtypes/__init__.py,sha256=2vTvL4N4yMN0cbv2CSoa1OFwCswx6FhuQdsYhMaz_dU,578
+patito/_pydantic/dtypes/dtypes.py,sha256=GDZxEBsNzc3jOsEF_5qkClFeKwbGUML_3gwkqLMQYeM,9612
+patito/_pydantic/dtypes/utils.py,sha256=mjGOsrJ7R-WJ5KLpureaFBnYY8lGbqpuYDh_8LVJjnI,6674
+patito/_pydantic/repr.py,sha256=P7ojqTeNM4htzZgw2qMO6XzFqkiIXLwP-WUIEqNt7I0,4182
+patito/_pydantic/schema.py,sha256=BI2qAhNM29NxS366K9eRi8thgE2P3t8GFt1HzwlWxos,3603
+patito/exceptions.py,sha256=XK6UF_UojeOR45TJnZqS19SHZAUIvu-nswqf1tnFJ08,6034
+patito/polars.py,sha256=nd73Hzpji6rP5wpbYI_FtGGPoQVQ2pvij8GXlx_MrPc,37738
+patito/pydantic.py,sha256=zo9321xdRLzFZdL35xr8pQaQp07VOhLGPJfCwrvPYyA,49473
+patito/validators.py,sha256=WZqtXVH3gHSdKJXqIMM9j90CPfghihqLpqVJJeoVwBw,18494
+patito-0.8.0.dist-info/LICENSE,sha256=3bc4YyuF0e5nd59E3CsR8QM1Ua7pqKfC9DD1LVBVMs4,1139
+patito-0.8.0.dist-info/METADATA,sha256=Dz-oHA5zk4d0YVCsmbSJpzP2jR22lLX4mkSv1Wz_CJQ,14042
+patito-0.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+patito-0.8.0.dist-info/RECORD,,
{patito-0.6.2.dist-info → patito-0.8.0.dist-info}/WHEEL RENAMED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.6.1
+Generator: poetry-core 1.9.0
 Root-Is-Purelib: true
 Tag: py3-none-any
patito-0.6.2.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
-patito/__init__.py,sha256=4qD13kfoa85_kyTCChm3xQcKKzIy3G8AZQp8T_bjcmo,844
-patito/_docs.py,sha256=9mfttyylWpqaOZv8xfDMEwCHHaY7GQwfyI7CDg7tWe8,162
-patito/_pydantic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-patito/_pydantic/column_info.py,sha256=zy3z0gCdQZhNA3_eQ9mEf3Di-gOR8Gt4vmS3v2iULkc,3536
-patito/_pydantic/dtypes/__init__.py,sha256=2vTvL4N4yMN0cbv2CSoa1OFwCswx6FhuQdsYhMaz_dU,578
-patito/_pydantic/dtypes/dtypes.py,sha256=nHtgyI0LsvA_hEIELUxtS4JnDwutqem7iT6nxMDLJxc,9510
-patito/_pydantic/dtypes/utils.py,sha256=6g2mVVSYCs0LSqiPlc4D2Wm3X2gm8sKJnXZYcthfabY,7017
-patito/_pydantic/repr.py,sha256=l9WLjwJ85nJwZCxLIwHih7UuMVVgz17W5th_UD7XZAM,4341
-patito/_pydantic/schema.py,sha256=1XLByZ1jJVP7PUNTkoSPDo0D_hy8QncNLjXKV2N0XDE,3622
-patito/exceptions.py,sha256=VfkkpLblu2Go4QnfWwew7g1NJ_gmynv28p-eGH84tLs,6060
-patito/polars.py,sha256=iAnMFfVyJfSdHantESrIdaX6tZDWj71jyWBery325ac,35333
-patito/pydantic.py,sha256=1gyPfo8-68sdy26yC8c7CQhk_9Mmr0KRsyuS4g54Ddw,48685
-patito/validators.py,sha256=d7lu3MBqaaLLvBVMd5BgarLYpGYHMeJEuTSGAoYqDf0,16231
-patito-0.6.2.dist-info/LICENSE,sha256=3bc4YyuF0e5nd59E3CsR8QM1Ua7pqKfC9DD1LVBVMs4,1139
-patito-0.6.2.dist-info/METADATA,sha256=vUijDkEO0zT5uxED3sN3fTvgsFEIojFYFcENJ1u9_cA,13947
-patito-0.6.2.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-patito-0.6.2.dist-info/RECORD,,