PyPI - patito - Versions diffs - 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

patito 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

patito/_pydantic/column_info.py +101 -46
patito/_pydantic/dtypes/dtypes.py +19 -15
patito/_pydantic/dtypes/utils.py +32 -28
patito/_pydantic/repr.py +7 -15
patito/_pydantic/schema.py +10 -9
patito/exceptions.py +11 -16
patito/polars.py +124 -65
patito/pydantic.py +98 -89
patito/validators.py +111 -71
{patito-0.6.2.dist-info → patito-0.8.0.dist-info}/METADATA +6 -4
patito-0.8.0.dist-info/RECORD +17 -0
{patito-0.6.2.dist-info → patito-0.8.0.dist-info}/WHEEL +1 -1
patito-0.6.2.dist-info/RECORD +0 -17
{patito-0.6.2.dist-info → patito-0.8.0.dist-info}/LICENSE +0 -0

patito/polars.py CHANGED Viewed

@@ -2,25 +2,18 @@
 from __future__ import annotations
+from collections.abc import Collection, Iterable, Iterator, Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
-    Collection,
-    Dict,
     Generic,
-    Iterable,
     Literal,
-    Optional,
-    Sequence,
-    Tuple,
-    Type,
     TypeVar,
-    Union,
     cast,
 )
 import polars as pl
-from polars.type_aliases import IntoExpr
+from polars._typing import IntoExpr
 from pydantic import AliasChoices, AliasPath, create_model
 from patito._pydantic.column_info import ColumnInfo
@@ -31,22 +24,40 @@ if TYPE_CHECKING:
     from patito.pydantic import Model
 DF = TypeVar("DF", bound="DataFrame")
 LDF = TypeVar("LDF", bound="LazyFrame")
 ModelType = TypeVar("ModelType", bound="Model")
 OtherModelType = TypeVar("OtherModelType", bound="Model")
+T = TypeVar("T")
+class ModelGenerator(Iterator[ModelType], Generic[ModelType]):
+    """An iterator that can be converted to a list."""
+    def __init__(self, iterator: Iterator[ModelType]) -> None:
+        """Construct a ModelGenerator from an iterator."""
+        self._iterator = iterator
+    def to_list(self) -> list[ModelType]:
+        """Convert iterator to list."""
+        return list(self)
+    def __next__(self) -> ModelType:  # noqa: D105
+        return next(self._iterator)
+    def __iter__(self) -> Iterator[ModelType]:  # noqa: D105
+        return self
 class LazyFrame(pl.LazyFrame, Generic[ModelType]):
     """LazyFrame class associated to DataFrame."""
-    model: Type[ModelType]
+    model: type[ModelType]
     @classmethod
     def _construct_lazyframe_model_class(
-        cls: Type[LDF], model: Optional[Type[ModelType]]
-    ) -> Type[LazyFrame[ModelType]]:
+        cls: type[LDF], model: type[ModelType] | None
+    ) -> type[LazyFrame[ModelType]]:
         """Return custom LazyFrame sub-class where LazyFrame.model is set.
         Can be used to construct a LazyFrame class where
@@ -75,7 +86,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
         self,
         *args,
         **kwargs,
-    ) -> "DataFrame[ModelType]":  # noqa: DAR101, DAR201
+    ) -> DataFrame[ModelType]:  # noqa: DAR101, DAR201
         """Collect into a DataFrame.
         See documentation of polars.DataFrame.collect for full description of
@@ -130,7 +141,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
         """
         derived_columns = []
         props = self.model._schema_properties()
-        original_columns = set(self.columns)
+        original_columns = set(self.collect_schema())
         to_derive = self.model.derived_columns if columns is None else columns
         for column_name in to_derive:
             if column_name not in derived_columns:
@@ -148,33 +159,35 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
     def _derive_column(
         self,
-        df: LDF,
+        lf: LDF,
         column_name: str,
-        column_infos: Dict[str, ColumnInfo],
-    ) -> Tuple[LDF, Sequence[str]]:
+        column_infos: dict[str, ColumnInfo],
+    ) -> tuple[LDF, Sequence[str]]:
         if (
             column_infos.get(column_name, None) is None
             or column_infos[column_name].derived_from is None
         ):
-            return df, []
+            return lf, []
         derived_from = column_infos[column_name].derived_from
         dtype = self.model.dtypes[column_name]
         derived_columns = []
         if isinstance(derived_from, str):
-            df = df.with_columns(pl.col(derived_from).cast(dtype).alias(column_name))
+            lf = lf.with_columns(pl.col(derived_from).cast(dtype).alias(column_name))
         elif isinstance(derived_from, pl.Expr):
             root_cols = derived_from.meta.root_names()
             while root_cols:
                 root_col = root_cols.pop()
-                df, _derived_columns = self._derive_column(df, root_col, column_infos)
+                lf, _derived_columns = self._derive_column(lf, root_col, column_infos)
                 derived_columns.extend(_derived_columns)
-            df = df.with_columns(derived_from.cast(dtype).alias(column_name))
+            lf = lf.with_columns(derived_from.cast(dtype).alias(column_name))
         else:
             raise TypeError(
                 "Can not derive dataframe column from type " f"{type(derived_from)}."
             )
         derived_columns.append(column_name)
-        return df, derived_columns
+        return lf, derived_columns
     def unalias(self: LDF) -> LDF:
         """Un-aliases column names using information from pydantic validation_alias.
@@ -191,21 +204,21 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
             return self
         exprs = []
-        def to_expr(va: str | AliasPath | AliasChoices) -> Optional[pl.Expr]:
+        def to_expr(va: str | AliasPath | AliasChoices) -> pl.Expr | None:
             if isinstance(va, str):
-                return pl.col(va) if va in self.columns else None
+                return pl.col(va) if va in self.collect_schema() else None
             elif isinstance(va, AliasPath):
                 if len(va.path) != 2 or not isinstance(va.path[1], int):
                     raise NotImplementedError(
                         f"TODO figure out how this AliasPath behaves ({va})"
                     )
                 return (
-                    pl.col(va.path[0]).list.get(va.path[1])
-                    if va.path[0] in self.columns
+                    pl.col(va.path[0]).list.get(va.path[1], null_on_oob=True)
+                    if va.path[0] in self.collect_schema()
                     else None
                 )
             elif isinstance(va, AliasChoices):
-                local_expr: Optional[pl.Expr] = None
+                local_expr: pl.Expr | None = None
                 for choice in va.choices:
                     if (part := to_expr(choice)) is not None:
                         local_expr = (
@@ -224,7 +237,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
                 exprs.append(pl.col(name))
             else:
                 expr = to_expr(field_info.validation_alias)
-                if name in self.columns:
+                if name in self.collect_schema().names():
                     if expr is None:
                         exprs.append(pl.col(name))
                     else:
@@ -235,7 +248,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
         return self.select(exprs)
     def cast(
-        self: LDF, strict: bool = False, columns: Optional[Sequence[str]] = None
+        self: LDF, strict: bool = False, columns: Sequence[str] | None = None
     ) -> LDF:
         """Cast columns to `dtypes` specified by the associated Patito model.
@@ -278,9 +291,9 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
         properties = self.model._schema_properties()
         valid_dtypes = self.model.valid_dtypes
         default_dtypes = self.model.dtypes
-        columns = columns or self.columns
+        columns = columns or self.collect_schema().names()
         exprs = []
-        for column, current_dtype in zip(self.columns, self.dtypes):
+        for column, current_dtype in self.collect_schema().items():
             if (column not in columns) or (column not in properties):
                 exprs.append(pl.col(column))
             elif "dtype" in properties[column]:
@@ -292,7 +305,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
         return self.with_columns(exprs)
     @classmethod
-    def from_existing(cls: Type[LDF], lf: pl.LazyFrame) -> LDF:
+    def from_existing(cls: type[LDF], lf: pl.LazyFrame) -> LDF:
         """Construct a patito.DataFrame object from an existing polars.DataFrame object."""
         return cls.model.LazyFrame._from_pyldf(lf._ldf).cast()
@@ -326,12 +339,12 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     :ref:`Product.validate <DataFrame.validate>`.
     """
-    model: Type[ModelType]
+    model: type[ModelType]
     @classmethod
     def _construct_dataframe_model_class(
-        cls: Type[DF], model: Type[OtherModelType]
-    ) -> Type[DataFrame[OtherModelType]]:
+        cls: type[DF], model: type[OtherModelType]
+    ) -> type[DataFrame[OtherModelType]]:
         """Return custom DataFrame sub-class where DataFrame.model is set.
         Can be used to construct a DataFrame class where
@@ -445,7 +458,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
         return self.lazy().unalias().collect()
     def cast(
-        self: DF, strict: bool = False, columns: Optional[Sequence[str]] = None
+        self: DF, strict: bool = False, columns: Sequence[str] | None = None
     ) -> DF:
         """Cast columns to `dtypes` specified by the associated Patito model.
@@ -489,7 +502,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     def drop(
         self: DF,
-        columns: Optional[Union[str, Collection[str]]] = None,
+        columns: str | Collection[str] | None = None,
         *more_columns: str,
     ) -> DF:
         """Drop one or more columns from the dataframe.
@@ -529,23 +542,23 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
         else:
             return self.drop(list(set(self.columns) - set(self.model.columns)))
-    def validate(self, columns: Optional[Sequence[str]] = None, **kwargs: Any):
+    def validate(self, columns: Sequence[str] | None = None, **kwargs: Any):
         """Validate the schema and content of the dataframe.
         You must invoke ``.set_model()`` before invoking ``.validate()`` in order
         to specify how the dataframe should be validated.
         Returns:
-            DataFrame[Model]: The original dataframe, if correctly validated.
+            DataFrame[Model]: The original patito dataframe, if correctly validated.
         Raises:
+            patito.exceptions.DataFrameValidationError: If the dataframe does not match the
+                specified schema.
             TypeError: If ``DataFrame.set_model()`` has not been invoked prior to
                 validation. Note that ``patito.Model.DataFrame`` automatically invokes
                 ``DataFrame.set_model()`` for you.
-            patito.exceptions.DataFrameValidationError: If the dataframe does not match the
-                specified schema.
         Examples:
             >>> import patito as pt
@@ -623,13 +636,12 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     def fill_null(
         self: DF,
-        value: Optional[Any] = None,
-        strategy: Optional[
-            Literal[
-                "forward", "backward", "min", "max", "mean", "zero", "one", "defaults"
-            ]
-        ] = None,
-        limit: Optional[int] = None,
+        value: Any | None = None,
+        strategy: Literal[
+            "forward", "backward", "min", "max", "mean", "zero", "one", "defaults"
+        ]
+        | None = None,
+        limit: int | None = None,
         matches_supertype: bool = True,
     ) -> DF:
         """Fill null values using a filling strategy, literal, or ``Expr``.
@@ -689,14 +701,13 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
                         pl.lit(default_value, self.model.dtypes[column])
                     )
                     if column in self.columns
-                    else pl.Series(column, [default_value], self.model.dtypes[column])
-                )  # NOTE: hack to get around polars bug https://github.com/pola-rs/polars/issues/13602
-                # else pl.lit(default_value, self.model.dtypes[column]).alias(column)
+                    else pl.lit(default_value, self.model.dtypes[column]).alias(column)
+                )
                 for column, default_value in self.model.defaults.items()
             ]
         ).set_model(self.model)
-    def get(self, predicate: Optional[pl.Expr] = None) -> ModelType:
+    def get(self, predicate: pl.Expr | None = None) -> ModelType:
         """Fetch the single row that matches the given polars predicate.
         If you expect a data frame to already consist of one single row,
@@ -778,7 +789,57 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
         else:
             return self._pydantic_model().from_row(row)  # type: ignore
-    def _pydantic_model(self) -> Type[Model]:
+    def iter_models(
+        self, validate_df: bool = True, validate_model: bool = False
+    ) -> ModelGenerator[ModelType]:
+        """Iterate over all rows in the dataframe as pydantic models.
+        Args:
+            validate_df: If set to ``True``, the dataframe will be validated before
+                making models out of each row. If set to ``False``, beware that columns
+                need to be the exact same as the model fields.
+            validate_model: If set to ``True``, each model will be validated when
+                constructing. Disabled by default since df validation should cover this case.
+        Yields:
+            Model: A pydantic-derived model representing the given row. .to_list() can be
+                used to convert the iterator to a list.
+        Raises:
+            TypeError: If ``DataFrame.set_model()`` has not been invoked prior to
+                iteration.
+        Example:
+            >>> import patito as pt
+            >>> import polars as pl
+            >>> class Product(pt.Model):
+            ...     product_id: int = pt.Field(unique=True)
+            ...     price: float
+            ...
+            >>> df = pt.DataFrame({"product_id": [1, 2], "price": [10., 20.]})
+            >>> df = df.set_model(Product)
+            >>> for product in df.iter_models():
+            ...     print(product)
+            ...
+            Product(product_id=1, price=10.0)
+            Product(product_id=2, price=20.0)
+        """
+        if not hasattr(self, "model"):
+            raise TypeError(
+                f"You must invoke {self.__class__.__name__}.set_model() "
+                f"before invoking {self.__class__.__name__}.iter_models()."
+            )
+        df = self.validate(drop_superfluous_columns=True) if validate_df else self
+        def _iter_models(_df: DF) -> Iterator[ModelType]:
+            for idx in range(_df.height):
+                yield self.model.from_row(_df[idx], validate=validate_model)
+        return ModelGenerator(_iter_models(df))
+    def _pydantic_model(self) -> type[Model]:
         """Dynamically construct patito model compliant with dataframe.
         Returns:
@@ -790,7 +851,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
         pydantic_annotations = {column: (Any, ...) for column in self.columns}
         return cast(
-            Type[Model],
+            type[Model],
             create_model(  # type: ignore
                 "UntypedRow",
                 __base__=Model,
@@ -804,7 +865,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     @classmethod
     def read_csv(  # type: ignore[no-untyped-def]
-        cls: Type[DF],
+        cls: type[DF],
         *args,  # noqa: ANN002
         **kwargs,  # noqa: ANN003
     ) -> DF:
@@ -865,7 +926,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
             # └─────┴─────┘
         """
-        kwargs.setdefault("dtypes", cls.model.dtypes)
+        kwargs.setdefault("schema_overrides", cls.model.dtypes)
         has_header = kwargs.get("has_header", True)
         if not has_header and "columns" not in kwargs:
             kwargs.setdefault("new_columns", cls.model.columns)
@@ -877,9 +938,9 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
                 field_name: alias_func(field_name)
                 for field_name in cls.model.model_fields
             }
-            kwargs["dtypes"] = {
+            kwargs["schema_overrides"] = {
                 fields_to_cols.get(field, field): dtype
-                for field, dtype in kwargs["dtypes"].items()
+                for field, dtype in kwargs["schema_overrides"].items()
             }
             # TODO: other forms of alias setting like in Field
         df = cls.model.DataFrame._from_pydf(pl.read_csv(*args, **kwargs)._df)
@@ -888,15 +949,13 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     # --- Type annotation overrides ---
     def filter(  # noqa: D102
         self: DF,
-        predicate: Union[
-            pl.Expr, str, pl.Series, list[bool], np.ndarray[Any, Any], bool
-        ],
+        predicate: pl.Expr | str | pl.Series | list[bool] | np.ndarray[Any, Any] | bool,
     ) -> DF:
         return cast(DF, super().filter(predicate))
     def select(  # noqa: D102
         self: DF,
-        *exprs: Union[IntoExpr, Iterable[IntoExpr]],
+        *exprs: IntoExpr | Iterable[IntoExpr],
         **named_exprs: IntoExpr,
     ) -> DF:
         return cast(  # pyright: ignore[redundant-cast]
@@ -905,7 +964,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     def with_columns(  # noqa: D102
         self: DF,
-        *exprs: Union[IntoExpr, Iterable[IntoExpr]],
+        *exprs: IntoExpr | Iterable[IntoExpr],
         **named_exprs: IntoExpr,
     ) -> DF:
         return cast(DF, super().with_columns(*exprs, **named_exprs))

patito 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

patito 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl