PyPI - patito - Versions diffs - 0.8.0__tar.gz → 0.8.3__tar.gz - Mend

patito 0.8.0.tar.gz → 0.8.3.tar.gz

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (17) hide show

{patito-0.8.0 → patito-0.8.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: patito
-Version: 0.8.0
+Version: 0.8.3
 Summary: A dataframe modelling library built on top of polars and pydantic.
 Home-page: https://github.com/JakobGM/patito
 License: MIT
@@ -20,7 +20,6 @@ Provides-Extra: pandas
 Requires-Dist: Sphinx (<7) ; extra == "docs"
 Requires-Dist: pandas ; extra == "pandas"
 Requires-Dist: polars (>=1.10.0)
-Requires-Dist: pre-commit (>=3.8.0,<4.0.0)
 Requires-Dist: pyarrow (>=5.0.0) ; extra == "caching"
 Requires-Dist: pydantic (>=2.7.0)
 Requires-Dist: sphinx-autobuild ; extra == "docs"

{patito-0.8.0 → patito-0.8.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "patito"
-version = "0.8.0"
+version = "0.8.3"
 description = "A dataframe modelling library built on top of polars and pydantic."
 authors = ["Jakob Gerhard Martinussen <jakobgm@gmail.com>", "Thomas Aarholt <thomasaarholt@gmail.com>"]
 license = "MIT"
@@ -25,7 +25,6 @@ sphinx-autobuild = {version = "*", optional = true}
 sphinx-autodoc-typehints = {version = "*", optional = true}
 sphinx-toolbox = {version = "*", optional = true}
 sphinxcontrib-mermaid = {version = "*", optional = true}
-pre-commit = "^3.8.0"
 [tool.poetry.extras]
 # The pyarrow.parquet module is required for writing parquet caches to disk
@@ -42,6 +41,7 @@ docs = [
 [tool.poetry.group.dev.dependencies]
 ruff = ">=0.2.1"
+pre-commit = "^3.8.0"
 coverage = {version = "*", extras = ["toml"]}
 pyright = ">=1.1.239"
 pytest = ">=7.1.2"
@@ -133,6 +133,7 @@ extend-exclude= ["tests/__init__.py"]
 [tool.ruff.lint]
 select = ["E4", "E7", "E9", "F", "I", "B", "D", "UP"]
+ignore = ["UP007"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"

{patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/column_info.py RENAMED Viewed

@@ -97,20 +97,20 @@ class ColumnInfo(BaseModel, arbitrary_types_allowed=True):
     """
-    allow_missing: Optional[bool] = None  # noqa: UP007
+    allow_missing: Optional[bool] = None
     dtype: Annotated[
-        Optional[Union[DataTypeClass, DataType]],  # noqa: UP007
+        Optional[Union[DataTypeClass, DataType]],
         BeforeValidator(dtype_deserializer),
     ] = None
     constraints: Annotated[
-        Optional[Union[pl.Expr, list[pl.Expr]]],  # noqa: UP007
+        Optional[Union[pl.Expr, list[pl.Expr]]],
         BeforeValidator(expr_deserializer),
     ] = None
     derived_from: Annotated[
-        Optional[Union[str, pl.Expr]],  # noqa: UP007
+        Optional[Union[str, pl.Expr]],
         BeforeValidator(expr_or_col_name_deserializer),
     ] = None
-    unique: Optional[bool] = None  # noqa : UP007
+    unique: Optional[bool] = None
     def __repr__(self) -> str:
         """Print only Field attributes whose values are not default (mainly None)."""

{patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/dtypes/dtypes.py RENAMED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 from collections.abc import Mapping
 from functools import cache, reduce
-from operator import and_
+from operator import or_
 from typing import TYPE_CHECKING, Any
 import polars as pl
@@ -115,7 +115,8 @@ def validate_annotation(
 class DtypeResolver:
     def __init__(self, annotation: Any | None):
         self.annotation = annotation
-        self.schema = TypeAdapter(annotation).json_schema()
+        # mode='serialization' allows nested models with structs, see #86
+        self.schema = TypeAdapter(annotation).json_schema(mode="serialization")
         self.defs = self.schema.get("$defs", {})
     def valid_polars_dtypes(self) -> DataTypeGroup:
@@ -143,7 +144,7 @@ class DtypeResolver:
             valid_type_sets.append(
                 self._pydantic_subschema_to_valid_polars_types(schema)
             )
-        return reduce(and_, valid_type_sets) if valid_type_sets else DataTypeGroup([])
+        return reduce(or_, valid_type_sets) if valid_type_sets else DataTypeGroup([])
     def _pydantic_subschema_to_valid_polars_types(
         self,
@@ -159,6 +160,7 @@ class DtypeResolver:
                     self.defs[props["$ref"].split("/")[-1]]
                 )
             return DataTypeGroup([])
         pyd_type = props.get("type")
         if pyd_type == "array":
             if "items" not in props:
@@ -169,28 +171,27 @@ class DtypeResolver:
             return DataTypeGroup(
                 [pl.List(dtype) for dtype in item_dtypes], match_base_type=False
             )
         elif pyd_type == "object":
             if "properties" not in props:
                 return DataTypeGroup([])
             object_props = props["properties"]
+            struct_fields: list[pl.Field] = []
+            for name, sub_props in object_props.items():
+                dtype = self._default_polars_dtype_for_schema(sub_props)
+                assert dtype is not None
+                struct_fields.append(pl.Field(name, dtype))
             return DataTypeGroup(
-                [
-                    pl.Struct(
-                        [
-                            pl.Field(
-                                name, self._default_polars_dtype_for_schema(sub_props)
-                            )
-                            for name, sub_props in object_props.items()
-                        ]
-                    )
-                ],
+                [pl.Struct(struct_fields)],
                 match_base_type=False,
             )  # for structs, return only the default dtype set to avoid combinatoric issues
         return _pyd_type_to_valid_dtypes(
             PydanticBaseType(pyd_type), props.get("format"), props.get("enum")
         )
-    def _default_polars_dtype_for_schema(self, schema: dict) -> DataType | None:
+    def _default_polars_dtype_for_schema(
+        self, schema: dict[str, Any]
+    ) -> DataType | None:
         if "anyOf" in schema:
             if len(schema["anyOf"]) == 2:  # look for optionals first
                 schema = _without_optional(schema)
@@ -206,13 +207,14 @@ class DtypeResolver:
     def _pydantic_subschema_to_default_dtype(
         self,
-        props: dict,
+        props: dict[str, Any],
     ) -> DataType | None:
         if "column_info" in props:  # user has specified in patito model
             ci = ColumnInfo.model_validate_json(props["column_info"])
             if ci.dtype is not None:
                 dtype = ci.dtype() if isinstance(ci.dtype, DataTypeClass) else ci.dtype
                 return dtype
         if "type" not in props:
             if "enum" in props:
                 raise TypeError("Mixed type enums not supported by patito.")
@@ -223,10 +225,12 @@ class DtypeResolver:
                     self.defs[props["$ref"].split("/")[-1]]
                 )
             return None
         pyd_type = props.get("type")
         if pyd_type == "numeric":
             pyd_type = "number"
-        if pyd_type == "array":
+        elif pyd_type == "array":
             if "items" not in props:
                 raise NotImplementedError(
                     "Unexpected error processing pydantic schema. Please file an issue."
@@ -236,18 +240,21 @@ class DtypeResolver:
             if inner_default_type is None:
                 return None
             return pl.List(inner_default_type)
-        elif pyd_type == "object":
+        elif pyd_type == "object":  # these are structs
             if "properties" not in props:
                 raise NotImplementedError(
                     "dictionaries not currently supported by patito"
                 )
-            object_props = props["properties"]
-            return pl.Struct(
-                [
-                    pl.Field(name, self._default_polars_dtype_for_schema(sub_props))
-                    for name, sub_props in object_props.items()
-                ]
-            )
+            object_props: dict[str, dict[str, str]] = props["properties"]
+            struct_fields: list[pl.Field] = []
+            for name, sub_props in object_props.items():
+                dtype = self._default_polars_dtype_for_schema(sub_props)
+                assert dtype is not None
+                struct_fields.append(pl.Field(name, dtype))
+            return pl.Struct(struct_fields)
         return _pyd_type_to_default_dtype(
             PydanticBaseType(pyd_type), props.get("format"), props.get("enum")
         )

{patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/dtypes/utils.py RENAMED Viewed

@@ -124,7 +124,7 @@ def _pyd_type_to_valid_dtypes(
         _validate_enum_values(pyd_type, enum)
         return DataTypeGroup([pl.Enum(enum), pl.String], match_base_type=False)
     if pyd_type.value == "integer":
-        return DataTypeGroup(INTEGER_DTYPES | FLOAT_DTYPES)
+        return DataTypeGroup(INTEGER_DTYPES)
     elif pyd_type.value == "number":
         return (
             FLOAT_DTYPES

{patito-0.8.0 → patito-0.8.3}/src/patito/polars.py RENAMED Viewed

@@ -54,33 +54,63 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
     model: type[ModelType]
-    @classmethod
-    def _construct_lazyframe_model_class(
-        cls: type[LDF], model: type[ModelType] | None
-    ) -> type[LazyFrame[ModelType]]:
-        """Return custom LazyFrame sub-class where LazyFrame.model is set.
+    def set_model(self, model: type[OtherModelType]) -> LazyFrame[OtherModelType]:
+        """Associate a given patito ``Model`` with the dataframe.
-        Can be used to construct a LazyFrame class where
-        DataFrame.set_model(model) is implicitly invoked at collection.
+        The model schema is used by methods that depend on a model being associated with
+        the given dataframe such as :ref:`DataFrame.validate() <DataFrame.validate>`
+        and :ref:`DataFrame.get() <DataFrame.get>`.
+        ``DataFrame(...).set_model(Model)`` is equivalent with ``Model.DataFrame(...)``.
         Args:
-            model: A patito model which should be used to validate the final dataframe.
-                If None is provided, the regular LazyFrame class will be returned.
+            model (Model): Sub-class of ``patito.Model`` declaring the schema of the
+                dataframe.
         Returns:
-            A custom LazyFrame model class where LazyFrame.model has been correctly
-                "hard-coded" to the given model.
+            DataFrame[Model]: Returns the same dataframe, but with an attached model
+            that is required for certain model-specific dataframe methods to work.
-        """
-        if model is None:
-            return cls
+        Examples:
+            >>> from typing_extensions import Literal
+            >>> import patito as pt
+            >>> import polars as pl
+            >>> class SchoolClass(pt.Model):
+            ...     year: int = pt.Field(dtype=pl.UInt16)
+            ...     letter: Literal["A", "B"] = pt.Field(dtype=pl.Categorical)
+            ...
+            >>> classes = pt.DataFrame(
+            ...     {"year": [1, 1, 2, 2], "letter": list("ABAB")}
+            ... ).set_model(SchoolClass)
+            >>> classes
+            shape: (4, 2)
+            ┌──────┬────────┐
+            │ year ┆ letter │
+            │ ---  ┆ ---    │
+            │ i64  ┆ str    │
+            ╞══════╪════════╡
+            │ 1    ┆ A      │
+            │ 1    ┆ B      │
+            │ 2    ┆ A      │
+            │ 2    ┆ B      │
+            └──────┴────────┘
+            >>> casted_classes = classes.cast()
+            >>> casted_classes
+            shape: (4, 2)
+            ┌──────┬────────┐
+            │ year ┆ letter │
+            │ ---  ┆ ---    │
+            │ u16  ┆ cat    │
+            ╞══════╪════════╡
+            │ 1    ┆ A      │
+            │ 1    ┆ B      │
+            │ 2    ┆ A      │
+            │ 2    ┆ B      │
+            └──────┴────────┘
+            >>> casted_classes.validate()
-        new_class = type(
-            f"{model.__name__}LazyFrame",
-            (cls,),
-            {"model": model},
-        )
-        return new_class
+        """
+        return model.LazyFrame._from_pyldf(self._ldf)  # type: ignore
     def collect(
         self,
@@ -93,12 +123,11 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
         parameters.
         """
         background = kwargs.pop("background", False)
-        df = super().collect(*args, background=background, **kwargs)
+        df: pl.DataFrame = super().collect(*args, background=background, **kwargs)
+        df = DataFrame(df)
         if getattr(self, "model", False):
-            cls = DataFrame._construct_dataframe_model_class(model=self.model)
-        else:
-            cls = DataFrame
-        return cls._from_pydf(df._df)
+            df = df.set_model(self.model)
+        return df
     def derive(self: LDF, columns: list[str] | None = None) -> LDF:
         """Populate columns which have ``pt.Field(derived_from=...)`` definitions.
@@ -213,7 +242,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
                         f"TODO figure out how this AliasPath behaves ({va})"
                     )
                 return (
-                    pl.col(va.path[0]).list.get(va.path[1], null_on_oob=True)
+                    pl.col(str(va.path[0])).list.get(va.path[1], null_on_oob=True)
                     if va.path[0] in self.collect_schema()
                     else None
                 )
@@ -307,7 +336,10 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
     @classmethod
     def from_existing(cls: type[LDF], lf: pl.LazyFrame) -> LDF:
         """Construct a patito.DataFrame object from an existing polars.DataFrame object."""
-        return cls.model.LazyFrame._from_pyldf(lf._ldf).cast()
+        if getattr(cls, "model", False):
+            return cls.model.LazyFrame._from_pyldf(super().lazy()._ldf)  # type: ignore
+        return LazyFrame._from_pyldf(lf._ldf)  # type: ignore
 class DataFrame(pl.DataFrame, Generic[ModelType]):
@@ -341,30 +373,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     model: type[ModelType]
-    @classmethod
-    def _construct_dataframe_model_class(
-        cls: type[DF], model: type[OtherModelType]
-    ) -> type[DataFrame[OtherModelType]]:
-        """Return custom DataFrame sub-class where DataFrame.model is set.
-        Can be used to construct a DataFrame class where
-        DataFrame.set_model(model) is implicitly invoked at instantiation.
-        Args:
-            model: A patito model which should be used to validate the dataframe.
-        Returns:
-            A custom DataFrame model class where DataFrame._model has been correctly
-                "hard-coded" to the given model.
-        """
-        new_class = type(
-            f"{model.model_json_schema()['title']}DataFrame",
-            (cls,),
-            {"model": model},
-        )
-        return new_class
     def lazy(self: DataFrame[ModelType]) -> LazyFrame[ModelType]:
         """Convert DataFrame into LazyFrame.
@@ -374,15 +382,12 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
             A new LazyFrame object.
         """
-        lazyframe_class: LazyFrame[ModelType] = (
-            LazyFrame._construct_lazyframe_model_class(
-                model=getattr(self, "model", None)
-            )
-        )  # type: ignore
-        ldf = lazyframe_class._from_pyldf(super().lazy()._ldf)
-        return ldf
+        if getattr(self, "model", False):
+            return self.model.LazyFrame._from_pyldf(super().lazy()._ldf)  # type: ignore
+        return LazyFrame._from_pyldf(super().lazy()._ldf)  # type: ignore
-    def set_model(self, model):  # type: ignore[no-untyped-def] # noqa: ANN001, ANN201
+    def set_model(self, model: type[OtherModelType]) -> DataFrame[OtherModelType]:
         """Associate a given patito ``Model`` with the dataframe.
         The model schema is used by methods that depend on a model being associated with
@@ -438,11 +443,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
             >>> casted_classes.validate()
         """
-        cls = self._construct_dataframe_model_class(model=model)
-        return cast(
-            DataFrame[model],
-            cls._from_pydf(self._df),
-        )
+        return model.DataFrame(self._df)
     def unalias(self: DF) -> DF:
         """Un-aliases column names using information from pydantic validation_alias.
@@ -503,7 +504,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     def drop(
         self: DF,
         columns: str | Collection[str] | None = None,
-        *more_columns: str,
     ) -> DF:
         """Drop one or more columns from the dataframe.
@@ -515,7 +515,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
             columns: A single column string name, or list of strings, indicating
                 which columns to drop. If not specified, all columns *not*
                 specified by the associated dataframe model will be dropped.
-            more_columns: Additional named columns to drop.
         Returns:
             DataFrame[Model]: New dataframe without the specified columns.
@@ -538,7 +537,9 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
         """
         if columns is not None:
-            return self._from_pydf(super().drop(columns)._df)
+            # I get a single null row if I try to use super() here, so go via
+            # pl.DataFrame instead.
+            return self._from_pydf(pl.DataFrame(self._df).drop(columns)._df)
         else:
             return self.drop(list(set(self.columns) - set(self.model.columns)))
@@ -705,7 +706,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
                 )
                 for column, default_value in self.model.defaults.items()
             ]
-        ).set_model(self.model)
+        ).set_model(self.model)  # type: ignore
     def get(self, predicate: pl.Expr | None = None) -> ModelType:
         """Fetch the single row that matches the given polars predicate.
@@ -815,7 +816,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
             >>> class Product(pt.Model):
             ...     product_id: int = pt.Field(unique=True)
             ...     price: float
-            ...
             >>> df = pt.DataFrame({"product_id": [1, 2], "price": [10., 20.]})
             >>> df = df.set_model(Product)
             >>> for product in df.iter_models():
@@ -833,10 +834,23 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
         df = self.validate(drop_superfluous_columns=True) if validate_df else self
-        def _iter_models(_df: DF) -> Iterator[ModelType]:
-            for idx in range(_df.height):
-                yield self.model.from_row(_df[idx], validate=validate_model)
+        def _iter_models_with_validate(
+            _df: DataFrame[ModelType],
+        ) -> Iterator[ModelType]:
+            for row in _df.iter_rows(named=True):
+                yield self.model(**row)
+        def _iter_models_without_validate(
+            _df: DataFrame[ModelType],
+        ) -> Iterator[ModelType]:
+            for row in _df.iter_rows(named=True):
+                yield self.model.model_construct(**row)
+        _iter_models = (
+            _iter_models_with_validate
+            if validate_model
+            else _iter_models_without_validate
+        )
         return ModelGenerator(_iter_models(df))
     def _pydantic_model(self) -> type[Model]:

{patito-0.8.0 → patito-0.8.3}/src/patito/pydantic.py RENAMED Viewed

@@ -76,27 +76,31 @@ class ModelMetaclass(PydanticModelMetaclass):
         """
         super().__init__(name, bases, clsdict, **kwargs)
-        # Add a custom subclass of patito.DataFrame to the model class,
-        # where .set_model() has been implicitly set.
-        cls.DataFrame = DataFrame._construct_dataframe_model_class(
-            model=cls,  # type: ignore
+        NewDataFrame = type(
+            f"{cls.__name__}DataFrame",
+            (DataFrame,),
+            {"model": cls},
         )
-        # Similarly for LazyFrame
-        cls.LazyFrame = LazyFrame._construct_lazyframe_model_class(
-            model=cls,  # type: ignore
+        cls.DataFrame: type[DataFrame[cls]] = NewDataFrame  # type: ignore
+        NewLazyFrame = type(
+            f"{cls.__name__}LazyFrame",
+            (LazyFrame,),
+            {"model": cls},
         )
+        cls.LazyFrame: type[LazyFrame[cls]] = NewLazyFrame  # type: ignore
     def __hash__(self) -> int:
         """Return hash of the model class."""
         return super().__hash__()
     @property
-    def column_infos(cls: type[ModelType]) -> Mapping[str, ColumnInfo]:
+    def column_infos(cls: type[Model]) -> Mapping[str, ColumnInfo]:
         """Return column information for the model."""
         return column_infos_for_model(cls)
     @property
-    def model_schema(cls: type[ModelType]) -> Mapping[str, Mapping[str, Any]]:
+    def model_schema(cls: type[Model]) -> Mapping[str, Mapping[str, Any]]:
         """Return schema properties where definition references have been resolved.
         Returns:
@@ -112,7 +116,7 @@ class ModelMetaclass(PydanticModelMetaclass):
         return schema_for_model(cls)
     @property
-    def columns(cls: type[ModelType]) -> list[str]:
+    def columns(cls: type[Model]) -> list[str]:
         """Return the name of the dataframe columns specified by the fields of the model.
         Returns:
@@ -131,7 +135,7 @@ class ModelMetaclass(PydanticModelMetaclass):
         return list(cls.model_fields.keys())
     @property
-    def dtypes(cls: type[ModelType]) -> dict[str, DataTypeClass | DataType]:
+    def dtypes(cls: type[Model]) -> dict[str, DataTypeClass | DataType]:
         """Return the polars dtypes of the dataframe.
         Unless Field(dtype=...) is specified, the highest signed column dtype
@@ -155,7 +159,7 @@ class ModelMetaclass(PydanticModelMetaclass):
     @property
     def valid_dtypes(
-        cls: type[ModelType],
+        cls: type[Model],
     ) -> Mapping[str, frozenset[DataTypeClass | DataType]]:
         """Return a list of polars dtypes which Patito considers valid for each field.
@@ -172,7 +176,7 @@ class ModelMetaclass(PydanticModelMetaclass):
         return valid_dtypes_for_model(cls)
     @property
-    def defaults(cls: type[ModelType]) -> dict[str, Any]:
+    def defaults(cls: type[Model]) -> dict[str, Any]:
         """Return default field values specified on the model.
         Returns:
@@ -197,7 +201,7 @@ class ModelMetaclass(PydanticModelMetaclass):
         }
     @property
-    def non_nullable_columns(cls: type[ModelType]) -> set[str]:
+    def non_nullable_columns(cls: type[Model]) -> set[str]:
         """Return names of those columns that are non-nullable in the schema.
         Returns:
@@ -226,7 +230,7 @@ class ModelMetaclass(PydanticModelMetaclass):
         )
     @property
-    def nullable_columns(cls: type[ModelType]) -> set[str]:
+    def nullable_columns(cls: type[Model]) -> set[str]:
         """Return names of those columns that are nullable in the schema.
         Returns:
@@ -248,7 +252,7 @@ class ModelMetaclass(PydanticModelMetaclass):
         return set(cls.columns) - cls.non_nullable_columns
     @property
-    def unique_columns(cls: type[ModelType]) -> set[str]:
+    def unique_columns(cls: type[Model]) -> set[str]:
         """Return columns with uniqueness constraint.
         Returns:
@@ -271,7 +275,7 @@ class ModelMetaclass(PydanticModelMetaclass):
         return {column for column in cls.columns if infos[column].unique}
     @property
-    def derived_columns(cls: type[ModelType]) -> set[str]:
+    def derived_columns(cls: type[Model]) -> set[str]:
         """Return set of columns which are derived from other columns."""
         infos = cls.column_infos
         return {

{patito-0.8.0 → patito-0.8.3}/src/patito/validators.py RENAMED Viewed

@@ -301,6 +301,7 @@ def _find_errors(  # noqa: C901
             dataframe_tmp = (
                 dataframe_tmp.select(column_name)
+                .filter(pl.col(column_name).list.len() > 0)
                 .explode(column_name)
                 .unnest(column_name)
             )