PyPI - patito - Versions diffs - 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

patito 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

patito/__init__.py +1 -0
patito/_docs.py +1 -0
patito/_pydantic/__init__.py +0 -1
patito/_pydantic/column_info.py +21 -6
patito/_pydantic/dtypes/dtypes.py +20 -26
patito/_pydantic/dtypes/utils.py +7 -8
patito/_pydantic/repr.py +2 -2
patito/_pydantic/schema.py +2 -4
patito/exceptions.py +13 -0
patito/polars.py +41 -69
patito/pydantic.py +52 -106
patito/validators.py +142 -33
{patito-0.6.1.dist-info → patito-0.7.0.dist-info}/METADATA +11 -22
patito-0.7.0.dist-info/RECORD +17 -0
patito/xdg.py +0 -24
patito-0.6.1.dist-info/RECORD +0 -18
{patito-0.6.1.dist-info → patito-0.7.0.dist-info}/LICENSE +0 -0
{patito-0.6.1.dist-info → patito-0.7.0.dist-info}/WHEEL +0 -0

patito/validators.py CHANGED Viewed

@@ -1,9 +1,20 @@
 """Module for validating datastructures with respect to model specifications."""
 from __future__ import annotations
-from typing import TYPE_CHECKING, Optional, Sequence, Type, Union, cast, Any
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Optional,
+    Sequence,
+    Type,
+    Union,
+    _UnionGenericAlias,
+    cast,
+)
 import polars as pl
+from pydantic.aliases import AliasGenerator
 from typing_extensions import get_args
 from patito._pydantic.dtypes import is_optional
@@ -18,7 +29,7 @@ from patito.exceptions import (
 )
 try:
-    import pandas as pd
+    import pandas as pd  # type: ignore
     _PANDAS_AVAILABLE = True
 except ImportError:
@@ -52,11 +63,9 @@ def _dewrap_optional(type_annotation: Type[Any] | Any) -> Type:
     Is a no-op for non-Optional types.
     Args:
-    ----
         type_annotation: The type annotation to be dewrapped.
     Returns:
-    -------
         The input type, but with the outermost Optional removed.
     """
@@ -71,6 +80,33 @@ def _dewrap_optional(type_annotation: Type[Any] | Any) -> Type:
     )
+def _transform_df(dataframe: pl.DataFrame, schema: type[Model]) -> pl.DataFrame:
+    """Transform any properties of the dataframe according to the model.
+    Currently only supports using AliasGenerator to transform column names to match a model.
+    Args:
+        dataframe: Polars DataFrame to be validated.
+        schema: Patito model which specifies how the dataframe should be structured.
+    """
+    # Check if an alias generator is present in model_config
+    if alias_gen := schema.model_config.get("alias_generator"):
+        if isinstance(alias_gen, AliasGenerator):
+            alias_func = alias_gen.validation_alias or alias_gen.alias
+            assert (
+                alias_func is not None
+            ), "An AliasGenerator must contain a transforming function"
+        else:  # alias_gen is a function
+            alias_func = alias_gen
+        new_cols: list[str] = [
+            alias_func(field_name) for field_name in dataframe.columns
+        ]  # type: ignore
+        dataframe.columns = new_cols
+    return dataframe
 def _find_errors(  # noqa: C901
     dataframe: pl.DataFrame,
     schema: Type[Model],
@@ -81,7 +117,6 @@ def _find_errors(  # noqa: C901
     """Validate the given dataframe.
     Args:
-    ----
         dataframe: Polars DataFrame to be validated.
         schema: Patito model which specifies how the dataframe should be structured.
         columns: If specified, only validate the given columns. Missing columns will
@@ -92,7 +127,6 @@ def _find_errors(  # noqa: C901
         allow_superfluous_columns: If True, additional columns will not be considered an error.
     Returns:
-    -------
         A list of patito.exception.ErrorWrapper instances. The specific validation
         error can be retrieved from the "exc" attribute on each error wrapper instance.
@@ -202,21 +236,14 @@ def _find_errors(  # noqa: C901
             )
         # Test for when only specific values are accepted
-        if "enum" in column_properties:
-            permissible_values = set(column_properties["enum"])
-            if column_name in schema.nullable_columns:
-                permissible_values.add(None)
-            actual_values = set(dataframe[column_name].unique())
-            impermissible_values = actual_values - permissible_values
-            if impermissible_values:
-                errors.append(
-                    ErrorWrapper(
-                        RowValueError(
-                            f"Rows with invalid values: {impermissible_values}."
-                        ),
-                        loc=column_name,
-                    )
-                )
+        e = _find_enum_errors(
+            df=dataframe,
+            column_name=column_name,
+            props=column_properties,
+            schema=schema,
+        )
+        if e is not None:
+            errors.append(e)
         if column_info.unique:
             # Coalescing to 0 in the case of dataframe of height 0
@@ -229,18 +256,76 @@ def _find_errors(  # noqa: C901
                     )
                 )
+        # Intercept struct columns, and process errors separately
+        if schema.dtypes[column_name] == pl.Struct:
+            nested_schema = schema.model_fields[column_name].annotation
+            # Additional unpack required if structs column is optional
+            if type(nested_schema) == _UnionGenericAlias:
+                nested_schema = nested_schema.__args__[0]
+                # We need to filter out any null rows as the submodel won't know
+                # that all of a row's columns may be null
+                dataframe = dataframe.filter(pl.col(column_name).is_not_null())
+                if dataframe.is_empty():
+                    continue
+            struct_errors = _find_errors(
+                dataframe=dataframe.select(column_name).unnest(column_name),
+                schema=nested_schema,
+            )
+            # Format nested errors
+            for error in struct_errors:
+                error._loc = f"{column_name}.{error._loc}"
+            errors.extend(struct_errors)
+            # No need to do any more checks
+            continue
+        # Intercept list of structs columns, and process errors separately
+        elif schema.dtypes[column_name] == pl.List(pl.Struct):
+            nested_schema = schema.model_fields[column_name].annotation.__args__[0]
+            # Additional unpack required if structs column is optional
+            if type(nested_schema) == _UnionGenericAlias:
+                nested_schema = nested_schema.__args__[0]
+                # We need to filter out any null rows as the submodel won't know
+                # that all of a row's columns may be null
+                dataframe = dataframe.filter(pl.col(column_name).is_not_null())
+                if dataframe.is_empty():
+                    continue
+            list_struct_errors = _find_errors(
+                dataframe=dataframe.select(column_name)
+                .explode(column_name)
+                .unnest(column_name),
+                schema=nested_schema,
+            )
+            # Format nested errors
+            for error in list_struct_errors:
+                error._loc = f"{column_name}.{error._loc}"
+            errors.extend(list_struct_errors)
+            # No need to do any more checks
+            continue
         # Check for bounded value fields
         col = pl.col(column_name)
         filters = {
-            "maximum": lambda v: col <= v,
-            "exclusiveMaximum": lambda v: col < v,
-            "minimum": lambda v: col >= v,
-            "exclusiveMinimum": lambda v: col > v,
-            "multipleOf": lambda v: (col == 0) | ((col % v) == 0),
-            "const": lambda v: col == v,
-            "pattern": lambda v: col.str.contains(v),
-            "minLength": lambda v: col.str.len_chars() >= v,
-            "maxLength": lambda v: col.str.len_chars() <= v,
+            "maximum": lambda v, col=col: col <= v,
+            "exclusiveMaximum": lambda v, col=col: col < v,
+            "minimum": lambda v, col=col: col >= v,
+            "exclusiveMinimum": lambda v, col=col: col > v,
+            "multipleOf": lambda v, col=col: (col == 0) | ((col % v) == 0),
+            "const": lambda v, col=col: col == v,
+            "pattern": lambda v, col=col: col.str.contains(v),
+            "minLength": lambda v, col=col: col.str.len_chars() >= v,
+            "maxLength": lambda v, col=col: col.str.len_chars() <= v,
         }
         if "anyOf" in column_properties:
             checks = [
@@ -280,7 +365,7 @@ def _find_errors(  # noqa: C901
             custom_constraints = column_info.constraints
             if isinstance(custom_constraints, pl.Expr):
                 custom_constraints = [custom_constraints]
-            constraints = pl.all_horizontal(
+            constraints = pl.any_horizontal(
                 [constraint.not_() for constraint in custom_constraints]
             )
             if "_" in constraints.meta.root_names():
@@ -305,6 +390,29 @@ def _find_errors(  # noqa: C901
     return errors
+def _find_enum_errors(
+    df: pl.DataFrame, column_name: str, props: dict[str, Any], schema: Type[Model]
+) -> ErrorWrapper | None:
+    if "enum" not in props:
+        if "items" in props and "enum" in props["items"]:
+            return _find_enum_errors(df, column_name, props["items"], schema)
+        return None
+    permissible_values = set(props["enum"])
+    if column_name in schema.nullable_columns:
+        permissible_values.add(None)
+    if isinstance(df[column_name].dtype, pl.List):
+        actual_values = set(df[column_name].explode().unique())
+    else:
+        actual_values = set(df[column_name].unique())
+    impermissible_values = actual_values - permissible_values
+    if impermissible_values:
+        return ErrorWrapper(
+            RowValueError(f"Rows with invalid values: {impermissible_values}."),
+            loc=column_name,
+        )
+    return None
 def validate(
     dataframe: Union["pd.DataFrame", pl.DataFrame],
     schema: Type[Model],
@@ -315,14 +423,14 @@ def validate(
     """Validate the given dataframe.
     Args:
-    ----
         dataframe: Polars DataFrame to be validated.
         schema: Patito model which specifies how the dataframe should be structured.
+        columns: Optional list of columns to validate. If not provided, all columns
+            of the dataframe will be validated.
         allow_missing_columns: If True, missing columns will not be considered an error.
         allow_superfluous_columns: If True, additional columns will not be considered an error.
     Raises:
-    ------
         DataFrameValidationError: If the given dataframe does not match the given schema.
     """
@@ -331,6 +439,7 @@ def validate(
     else:
         polars_dataframe = cast(pl.DataFrame, dataframe)
+    polars_dataframe = _transform_df(polars_dataframe, schema)
     errors = _find_errors(
         dataframe=polars_dataframe,
         schema=schema,

{patito-0.6.1.dist-info → patito-0.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,13 +1,13 @@
 Metadata-Version: 2.1
 Name: patito
-Version: 0.6.1
+Version: 0.7.0
 Summary: A dataframe modelling library built on top of polars and pydantic.
-Home-page: https://github.com/kolonialno/patito
+Home-page: https://github.com/JakobGM/patito
 License: MIT
 Keywords: validation,dataframe
 Author: Jakob Gerhard Martinussen
 Author-email: jakobgm@gmail.com
-Requires-Python: >=3.9,<4.0
+Requires-Python: >=3.9
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
@@ -17,10 +17,10 @@ Provides-Extra: caching
 Provides-Extra: docs
 Provides-Extra: pandas
 Requires-Dist: Sphinx (<7) ; extra == "docs"
-Requires-Dist: pandas ; (python_version >= "3.9" and python_version < "4.0") and (extra == "pandas")
-Requires-Dist: polars (>=0.20.1)
-Requires-Dist: pyarrow (>=5.0.0) ; (python_version >= "3.9" and python_version < "4.0") and (extra == "caching")
-Requires-Dist: pydantic (>=2.0.0)
+Requires-Dist: pandas ; extra == "pandas"
+Requires-Dist: polars (>=1.0.0)
+Requires-Dist: pyarrow (>=5.0.0) ; extra == "caching"
+Requires-Dist: pydantic (>=2.7.0)
 Requires-Dist: sphinx-autobuild ; extra == "docs"
 Requires-Dist: sphinx-autodoc-typehints ; extra == "docs"
 Requires-Dist: sphinx-rtd-theme ; extra == "docs"
@@ -28,7 +28,7 @@ Requires-Dist: sphinx-toolbox ; extra == "docs"
 Requires-Dist: sphinxcontrib-mermaid ; extra == "docs"
 Requires-Dist: typing-extensions
 Project-URL: Documentation, https://patito.readthedocs.io
-Project-URL: Repository, https://github.com/kolonialno/patito
+Project-URL: Repository, https://github.com/JakobGM/patito
 Description-Content-Type: text/markdown
 # <center><img height="30px" src="https://em-content.zobj.net/thumbs/120/samsung/78/duck_1f986.png"> Patito<center>
@@ -63,7 +63,6 @@ These schema can be used for:
 🧪 Easy generation of valid mock data frames for tests.\
 🐍 Retrieve and represent singular rows in an object-oriented manner.\
 🧠 Provide a single source of truth for the core data models in your code base. \
-🦆 Integration with DuckDB for running flexible SQL queries.
 Patito has first-class support for [polars]("https://github.com/pola-rs/polars"), a _"blazingly fast DataFrames library written in Rust"_.
@@ -73,16 +72,6 @@ Patito has first-class support for [polars]("https://github.com/pola-rs/polars")
 pip install patito
 ```
-#### DuckDB Integration
-Patito can also integrate with [DuckDB](https://duckdb.org/).
-In order to enable this integration you must explicitly specify it during installation:
-```sh
-pip install 'patito[duckdb]'
-```
 ## Documentation
 The full documentation of Patio can be found [here](https://patito.readthedocs.io).
@@ -93,7 +82,7 @@ Patito allows you to specify the type of each column in your dataframe by creati
 ```py
 # models.py
-from typing import Literal, Optional
+from typing import Literal
 import patito as pt
@@ -118,7 +107,7 @@ df = pl.DataFrame(
 )
 try:
     Product.validate(df)
-except pt.ValidationError as exc:
+except pt.exceptions.DataFrameValidationError as exc:
     print(exc)
 # 3 validation errors for Product
 # is_for_sale
@@ -164,7 +153,7 @@ def num_products_for_sale(products: pl.DataFrame) -> int:
     return products.filter(pl.col("is_for_sale")).height
 ```
-The following test would fail with a `patito.ValidationError`:
+The following test would fail with a `patito.exceptions.DataFrameValidationError`:
 ```py
 def test_num_products_for_sale():

patito-0.7.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,17 @@
+patito/__init__.py,sha256=4qD13kfoa85_kyTCChm3xQcKKzIy3G8AZQp8T_bjcmo,844
+patito/_docs.py,sha256=9mfttyylWpqaOZv8xfDMEwCHHaY7GQwfyI7CDg7tWe8,162
+patito/_pydantic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+patito/_pydantic/column_info.py,sha256=MMsbMAif0h71-qZYGh5Lcq4bcU_87hmYWwam4zDPFDg,3545
+patito/_pydantic/dtypes/__init__.py,sha256=2vTvL4N4yMN0cbv2CSoa1OFwCswx6FhuQdsYhMaz_dU,578
+patito/_pydantic/dtypes/dtypes.py,sha256=alappjjAYpQ_YZTMwRx9TeDqKsCo4cmiM2HVxrCph2g,9610
+patito/_pydantic/dtypes/utils.py,sha256=idWZORrs3FCTd6FtI8h9A4wZRVTgfUMzcCb7JqLVyiQ,7001
+patito/_pydantic/repr.py,sha256=l9WLjwJ85nJwZCxLIwHih7UuMVVgz17W5th_UD7XZAM,4341
+patito/_pydantic/schema.py,sha256=1XLByZ1jJVP7PUNTkoSPDo0D_hy8QncNLjXKV2N0XDE,3622
+patito/exceptions.py,sha256=VfkkpLblu2Go4QnfWwew7g1NJ_gmynv28p-eGH84tLs,6060
+patito/polars.py,sha256=pv5W_1b-E8523VbYdFfsVxR1lCqE2n2vwlZQ3KdkReA,35436
+patito/pydantic.py,sha256=REaMK0vUwpPO3t-ktP_5PNsiUYcoAAFhMfgFIjaDA5A,48672
+patito/validators.py,sha256=d7lu3MBqaaLLvBVMd5BgarLYpGYHMeJEuTSGAoYqDf0,16231
+patito-0.7.0.dist-info/LICENSE,sha256=3bc4YyuF0e5nd59E3CsR8QM1Ua7pqKfC9DD1LVBVMs4,1139
+patito-0.7.0.dist-info/METADATA,sha256=pLAXcJKh7eFdulpqQWrf5q2r4FqvOwNJlRP3AoZAYlw,13946
+patito-0.7.0.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+patito-0.7.0.dist-info/RECORD,,

patito/xdg.py DELETED Viewed

@@ -1,24 +0,0 @@
-"""Module implementing the XDG directory standard."""
-import os
-from pathlib import Path
-from typing import Optional
-def cache_home(application: Optional[str] = None) -> Path:
-    """Return path to directory containing user-specific non-essential data files.
-    Args:
-    ----
-        application: An optional name of an application for which to return an
-            application-specific cache directory for.
-    Returns:
-    -------
-        A path object pointing to a directory to store cache files.
-    """
-    path = Path(os.environ.get("XDG_CACHE_HOME", "~/.cache")).resolve()
-    if application:
-        path = path / application
-    path.mkdir(exist_ok=True, parents=True)
-    return path

patito-0.6.1.dist-info/RECORD DELETED Viewed

@@ -1,18 +0,0 @@
-patito/__init__.py,sha256=pW3q3tt3gR7JbEdRZ9OZtSoLcyUmrWTtXX6ulvJrwdA,843
-patito/_docs.py,sha256=bobkmo8-RRdz80_KY53y_i1Gcp1WWTH5-D5ZHGidpok,161
-patito/_pydantic/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-patito/_pydantic/column_info.py,sha256=ifGTRkyst2GkErpK4hAnXoFy4XkXjFEymjklYMDwta8,2956
-patito/_pydantic/dtypes/__init__.py,sha256=2vTvL4N4yMN0cbv2CSoa1OFwCswx6FhuQdsYhMaz_dU,578
-patito/_pydantic/dtypes/dtypes.py,sha256=54s4lhH76QPrk4kxV-twa0gxd0e4A9MFysvKU76W2uo,9743
-patito/_pydantic/dtypes/utils.py,sha256=GDIeqYBGyznYkj4m6sP4csmHpVqWXCf_bU7H1x2fCn4,7036
-patito/_pydantic/repr.py,sha256=1UlDlQD5l0Q7n8cGFcd3K8zu6Cb-eWQYeGHwsJWLxhA,4339
-patito/_pydantic/schema.py,sha256=i_P-sBGQf_u5AbmWN1RCu5awe-LLH5JAORY_QCydOrI,3643
-patito/exceptions.py,sha256=wEBFdo7OVhbSOc3zwd23OyNHck5tPid7FLKSr2aRTKo,5637
-patito/polars.py,sha256=Jq9wemYtO58r-0eSQNgKdeva-BsUhrENcliCv9FtHMA,35352
-patito/pydantic.py,sha256=E8ktj99-Q-saGq4Tei2BAn2zm5gC83456yzUxSLyUwQ,48696
-patito/validators.py,sha256=6mzSNTMS8FiHjPoaAo9fTyl2yTiWtot5mRREvnd9_UU,12144
-patito/xdg.py,sha256=XS-dBWjeRTV_vSveplK6CIMe0lVkSzr1F1nNM5Hb6L0,703
-patito-0.6.1.dist-info/LICENSE,sha256=3bc4YyuF0e5nd59E3CsR8QM1Ua7pqKfC9DD1LVBVMs4,1139
-patito-0.6.1.dist-info/METADATA,sha256=UwEG_kRvluYX7lHRZouzP4lbGRA9TMMEuHBjwZv4bSI,14326
-patito-0.6.1.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-patito-0.6.1.dist-info/RECORD,,

{patito-0.6.1.dist-info → patito-0.7.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{patito-0.6.1.dist-info → patito-0.7.0.dist-info}/WHEEL RENAMED Viewed

File without changes

patito 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

patito 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl