PyPI - dataframely - Versions diffs - 2.8.2__tar.gz → 2.9.0__tar.gz - Mend

dataframely 2.8.2.tar.gz → 2.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228)

{dataframely-2.8.2 → dataframely-2.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataframely
-Version: 2.8.2
+Version: 2.9.0
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/_base.py RENAMED Viewed

@@ -5,14 +5,15 @@ from __future__ import annotations
 import inspect
 import sys
+import warnings
 from abc import ABC, abstractmethod
 from collections import Counter
 from collections.abc import Callable, Mapping, Sequence
-from typing import Any, TypeAlias, cast
+from typing import Annotated, Any, TypeAlias, cast
 import polars as pl
-from dataframely._compat import pa, sa, sa_TypeEngine
+from dataframely._compat import pa, pydantic, sa, sa_TypeEngine
 from dataframely._polars import PolarsDataType
 from dataframely.random import Generator
@@ -222,6 +223,50 @@ class Column(ABC):
     def pyarrow_dtype(self) -> pa.DataType:
         """The :mod:`pyarrow` dtype equivalent of this column data type."""
+    # ----------------------------------- PYDANTIC ----------------------------------- #
+    def pydantic_field(self) -> Any:
+        """Obtain a pydantic field type for this column definition.
+        Returns:
+            A pydantic-compatible type annotation that includes structured constraints
+            (such as `min`, `max`, ...).
+        Warning:
+            Custom checks are not translated to pydantic validators.
+        """
+        if self.check is not None:
+            warnings.warn(
+                f"Custom checks for column '{self.name or self.__class__.__name__}' "
+                "are not translated to pydantic constraints."
+            )
+        python_type = self._python_type
+        if self.nullable:
+            python_type = python_type | None
+        field_kwargs = self._pydantic_field_kwargs()
+        if field_kwargs:
+            return Annotated[python_type, pydantic.Field(**field_kwargs)]
+        return python_type
+    @property
+    @abstractmethod
+    def _python_type(self) -> Any:
+        """The native Python type corresponding to this column definition."""
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        """Return kwargs for pydantic.Field initialization.
+        This method should be extended by subclasses and mixins to add their
+        specific constraints. Subclasses should call super() and extend the
+        returned dictionary.
+        Returns:
+            A dictionary of kwargs to pass to pydantic.Field.
+        """
+        return {}
     # ------------------------------------ HELPER ------------------------------------ #
     @property
@@ -234,6 +279,77 @@ class Column(ABC):
         """Obtain a Polars column expression for the column."""
         return pl.col(self.name)
+    def with_properties(self, **kwargs: Any) -> Self:
+        """Copy the current column definition while updating the provided properties.
+        All other properties from the original column are preserved.
+        Args:
+            **kwargs: Properties to update on the new column instance. The set of allowed properties depends on the type of the column.
+        Returns:
+            A new column instance with updated properties.
+        """
+        new_kwargs = {
+            k: getattr(self, k) for k in inspect.signature(self.__class__).parameters
+        } | kwargs
+        return self.__class__(**new_kwargs)
+    def with_nullable(self, nullable: bool) -> Self:
+        """Return a new column definition with specified nullability.
+        Args:
+            nullable: Whether the new column may contain null values.
+        Returns:
+            A new column instance with updated nullability.
+        """
+        return self.with_properties(nullable=nullable)
+    def with_alias(self, alias: str) -> Self:
+        """Return a new column definition with a specified alias.
+        Args:
+            alias: The alias to use for the column name.
+        Returns:
+            A new column instance with the specified alias.
+        """
+        return self.with_properties(alias=alias)
+    def with_check(self, check: Check) -> Self:
+        """Return a new column definition with a specified check.
+        Args:
+            check: A custom validation rule or rules for the column.
+        Returns:
+            A new column instance with the specified check.
+        """
+        return self.with_properties(check=check)
+    def with_primary_key(self, primary_key: bool) -> Self:
+        """Return a new column definition with a specified primary key status.
+        Args:
+            primary_key: Whether the column should be part of the primary key.
+        Returns:
+            A new column instance with updated primary key status.
+        """
+        return self.with_properties(primary_key=primary_key)
+    def with_metadata(self, metadata: dict[str, Any]) -> Self:
+        """Return a new column definition with specified metadata.
+        Args:
+            metadata: A dictionary of metadata to attach to the column.
+        Returns:
+            A new column instance with the specified metadata.
+        """
+        return self.with_properties(metadata=metadata)
     # ----------------------------------- SAMPLING ----------------------------------- #
     def sample(self, generator: Generator, n: int = 1) -> pl.Series:

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/_mixins.py RENAMED Viewed

@@ -80,6 +80,18 @@ class OrdinalMixin(Generic[T], Base):
             result["max_exclusive"] = expr < self.max_exclusive  # type: ignore
         return result
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        kwargs = super()._pydantic_field_kwargs()
+        if self.min is not None:
+            kwargs["ge"] = self.min
+        if self.min_exclusive is not None:
+            kwargs["gt"] = self.min_exclusive
+        if self.max is not None:
+            kwargs["le"] = self.max
+        if self.max_exclusive is not None:
+            kwargs["lt"] = self.max_exclusive
+        return kwargs
 # ------------------------------------ IS IN MIXIN ----------------------------------- #

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/any.py RENAMED Viewed

@@ -3,6 +3,8 @@
 from __future__ import annotations
+from typing import Any as AnyType
 import polars as pl
 from dataframely._compat import pa, sa, sa_mssql, sa_TypeEngine
@@ -77,5 +79,9 @@ class Any(Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.null()
+    @property
+    def _python_type(self) -> AnyType:
+        return AnyType
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         return pl.repeat(None, n, dtype=pl.Null, eager=True)

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/array.py RENAMED Viewed

@@ -5,6 +5,7 @@ from __future__ import annotations
 import math
 import sys
+import warnings
 from collections.abc import Sequence
 from typing import Any, Literal, cast
@@ -121,6 +122,23 @@ class Array(Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return self._pyarrow_field_of_shape(self.shape).type
+    @property
+    def _python_type(self) -> Any:
+        inner_type = self.inner.pydantic_field()
+        return list[inner_type]  # type: ignore
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        if len(self.shape) != 1:
+            warnings.warn(
+                "Multi-dimensional arrays are flattened for pydantic validation."
+            )
+        return {
+            **super()._pydantic_field_kwargs(),
+            "min_length": math.prod(self.shape),
+            "max_length": math.prod(self.shape),
+        }
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         # Sample the inner elements in a flat series
         n_elements = n * math.prod(self.shape)

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/binary.py RENAMED Viewed

@@ -3,6 +3,8 @@
 from __future__ import annotations
+from typing import Any
 import polars as pl
 from dataframely._compat import pa, sa, sa_TypeEngine
@@ -31,6 +33,10 @@ class Binary(Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.large_binary()
+    @property
+    def _python_type(self) -> Any:
+        return bytes
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         return generator.sample_binary(
             n,

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/bool.py RENAMED Viewed

@@ -3,6 +3,8 @@
 from __future__ import annotations
+from typing import Any
 import polars as pl
 from dataframely._compat import pa, sa, sa_TypeEngine
@@ -27,5 +29,9 @@ class Bool(Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.bool_()
+    @property
+    def _python_type(self) -> Any:
+        return bool
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         return generator.sample_bool(n, null_probability=self._null_probability)

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/categorical.py RENAMED Viewed

@@ -71,6 +71,10 @@ class Categorical(Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.dictionary(pa.uint32(), pa.large_string())
+    @property
+    def _python_type(self) -> Any:
+        return str
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         # We simply sample low-cardinality strings here
         return generator.sample_string(

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/datetime.py RENAMED Viewed

@@ -4,6 +4,7 @@
 from __future__ import annotations
 import datetime as dt
+import warnings
 from typing import Any, cast
 import polars as pl
@@ -132,6 +133,16 @@ class Date(OrdinalMixin[dt.date], Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.date32()
+    @property
+    def _python_type(self) -> Any:
+        return dt.date
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        if self.resolution is not None:
+            warnings.warn("Date resolution is not translated to a pydantic constraint.")
+        return super()._pydantic_field_kwargs()
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         return generator.sample_date(
             n,
@@ -261,6 +272,16 @@ class Time(OrdinalMixin[dt.time], Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.time64("ns")
+    @property
+    def _python_type(self) -> Any:
+        return dt.time
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        if self.resolution is not None:
+            warnings.warn("Time resolution is not translated to a pydantic constraint.")
+        return super()._pydantic_field_kwargs()
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         return generator.sample_time(
             n,
@@ -394,6 +415,22 @@ class Datetime(OrdinalMixin[dt.datetime], Column):
         )
         return pa.timestamp(self.time_unit, time_zone)
+    @property
+    def _python_type(self) -> Any:
+        return dt.datetime
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        if self.resolution is not None:
+            warnings.warn(
+                "Datetime resolution is not translated to a pydantic constraint."
+            )
+        if self.time_zone is not None:
+            warnings.warn(
+                "Datetime time zone is not translated to a pydantic constraint."
+            )
+        return super()._pydantic_field_kwargs()
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         return generator.sample_datetime(
             n,
@@ -531,6 +568,18 @@ class Duration(OrdinalMixin[dt.timedelta], Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.duration(self.time_unit)
+    @property
+    def _python_type(self) -> Any:
+        return dt.timedelta
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        if self.resolution is not None:
+            warnings.warn(
+                "Duration resolution is not translated to a pydantic constraint."
+            )
+        return super()._pydantic_field_kwargs()
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         # NOTE: If no duration is specified, we default to 100 years
         return generator.sample_duration(

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/decimal.py RENAMED Viewed

@@ -128,6 +128,16 @@ class Decimal(OrdinalMixin[decimal.Decimal], Column):
         # We do not use decimal256 since its values cannot be represented in SQL Server.
         return pa.decimal128(self.precision or 38, self.scale)
+    @property
+    def _python_type(self) -> Any:
+        return decimal.Decimal
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        return {
+            **super()._pydantic_field_kwargs(),
+            "decimal_places": self.scale,
+        }
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         # NOTE: Default precision to 38 for sampling, just like for SQL and Pyarrow
         precision = self.precision or 38

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/enum.py RENAMED Viewed

@@ -6,7 +6,7 @@ from __future__ import annotations
 import enum
 from collections.abc import Iterable
 from inspect import isclass
-from typing import Any
+from typing import Any, Literal
 import polars as pl
@@ -95,6 +95,10 @@ class Enum(Column):
             dtype = pa.uint32()
         return pa.dictionary(dtype, pa.large_string())
+    @property
+    def _python_type(self) -> Any:
+        return Literal[tuple(self.categories)]
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         return generator.sample_choice(
             n,

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/float.py RENAMED Viewed

@@ -5,6 +5,7 @@ from __future__ import annotations
 import math
 import sys
+import warnings
 from abc import abstractmethod
 from typing import Any
@@ -101,6 +102,26 @@ class _BaseFloat(OrdinalMixin[float], Column):
     def min_value(self) -> float:
         """Minimum value of the column's type."""
+    @property
+    def _python_type(self) -> Any:
+        return float
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        if self.allow_inf != self.allow_nan:
+            warnings.warn(
+                "Unequal settings of `allow_inf` and `allow_nan` cannot be translated to "
+                "pydantic constraints."
+            )
+        kwargs = super()._pydantic_field_kwargs()
+        if self.allow_inf == self.allow_nan:
+            kwargs["allow_inf_nan"] = self.allow_inf
+        if "le" not in kwargs:
+            kwargs["le"] = self.max_value
+        if "ge" not in kwargs:
+            kwargs["ge"] = self.min_value
+        return kwargs
     @property
     def _nan_probability(self) -> float:
         """Private utility for the null probability used during sampling."""

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/integer.py RENAMED Viewed

@@ -5,7 +5,7 @@ from __future__ import annotations
 from abc import abstractmethod
 from collections.abc import Sequence
-from typing import Any
+from typing import Any, Literal
 import polars as pl
 from polars.datatypes.group import INTEGER_DTYPES
@@ -114,6 +114,20 @@ class _BaseInteger(IsInMixin[int], OrdinalMixin[int], Column):
         """Minimum value of the column's type."""
         return 0 if self.is_unsigned else -(2 ** (self.num_bytes * 8 - 1))
+    @property
+    def _python_type(self) -> Any:
+        if self.is_in is not None:
+            return Literal[tuple(self.is_in)]
+        return int
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        kwargs = super()._pydantic_field_kwargs()
+        if "le" not in kwargs:
+            kwargs["le"] = self.max_value
+        if "ge" not in kwargs:
+            kwargs["ge"] = self.min_value
+        return kwargs
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         if self.is_in is not None:
             return generator.sample_choice(

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/list.py RENAMED Viewed

@@ -133,6 +133,19 @@ class List(Column):
         # NOTE: Polars uses `large_list`s by default.
         return pa.large_list(self.inner.pyarrow_field("item"))
+    @property
+    def _python_type(self) -> Any:
+        inner_type = self.inner.pydantic_field()
+        return list[inner_type]  # type: ignore
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        kwargs = super()._pydantic_field_kwargs()
+        if self.min_length is not None:
+            kwargs["min_length"] = self.min_length
+        if self.max_length is not None:
+            kwargs["max_length"] = self.max_length
+        return kwargs
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         # First, sample the number of items per list element
         # NOTE: We default to 32 for the upper bound as we need some kind of reasonable

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/object.py RENAMED Viewed

@@ -67,6 +67,10 @@ class Object(Column):
     def pyarrow_dtype(self) -> pa.DataType:
         raise NotImplementedError("PyArrow column cannot have 'Object' type.")
+    @property
+    def _python_type(self) -> Any:
+        return Any
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         raise NotImplementedError(
             "Random data sampling not implemented for 'Object' type."

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/string.py RENAMED Viewed

@@ -112,6 +112,20 @@ class String(Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.large_string()
+    @property
+    def _python_type(self) -> Any:
+        return str
+    def _pydantic_field_kwargs(self) -> dict[str, Any]:
+        kwargs = super()._pydantic_field_kwargs()
+        if self.min_length is not None:
+            kwargs["min_length"] = self.min_length
+        if self.max_length is not None:
+            kwargs["max_length"] = self.max_length
+        if self.regex is not None:
+            kwargs["pattern"] = self.regex
+        return kwargs
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         if (
             self.min_length is not None or self.max_length is not None

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/columns/struct.py RENAMED Viewed

@@ -8,7 +8,7 @@ from typing import Any, cast
 import polars as pl
-from dataframely._compat import pa, sa, sa_postgresql, sa_TypeEngine
+from dataframely._compat import pa, pydantic, sa, sa_postgresql, sa_TypeEngine
 from dataframely._polars import PolarsDataType
 from dataframely.random import Generator
@@ -117,6 +117,11 @@ class Struct(Column):
     def pyarrow_dtype(self) -> pa.DataType:
         return pa.struct([col.pyarrow_field(name) for name, col in self.inner.items()])
+    @property
+    def _python_type(self) -> Any:
+        fields = {name: col.pydantic_field() for name, col in self.inner.items()}
+        return pydantic.create_model("StructModel", **fields)
     def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
         series = (
             pl.DataFrame(

{dataframely-2.8.2 → dataframely-2.9.0}/dataframely/schema.py RENAMED Viewed

@@ -16,10 +16,8 @@ import polars as pl
 import polars.exceptions as plexc
 from polars._typing import FileSource
-from dataframely._compat import deltalake
 from ._base_schema import ORIGINAL_COLUMN_PREFIX, BaseSchema
-from ._compat import PartitionSchemeOrSinkDirectory, pa, sa
+from ._compat import PartitionSchemeOrSinkDirectory, deltalake, pa, pydantic, sa
 from ._match_to_schema import match_to_schema
 from ._native import format_rule_failures
 from ._plugin import all_rules, all_rules_horizontal, all_rules_required
@@ -813,9 +811,7 @@ class Schema(BaseSchema, ABC):
             the lazy frame's schema but also means that a call to :meth:`polars.LazyFrame.collect`
             further down the line might fail because of the cast and/or missing columns.
         """
-        lf = df.lazy().select(
-            pl.col(name).cast(col.dtype) for name, col in cls.columns().items()
-        )
+        lf = match_to_schema(df.lazy(), cls, casting="strict")
         if isinstance(df, pl.DataFrame):
             return lf.collect()  # type: ignore
         return lf  # type: ignore
@@ -1341,6 +1337,32 @@ class Schema(BaseSchema, ABC):
             [col.pyarrow_field(name) for name, col in cls.columns().items()]
         )
+    @classmethod
+    def to_pydantic_model(cls, name: str | None = None) -> type[pydantic.BaseModel]:
+        """Convert this schema to a pydantic model.
+        The pydantic model includes all columns defined in the schema along with their
+        (structured) constraints. Custom checks and schema-level rules are not included
+        in the pydantic model.
+        Args:
+            name: The name of the returned pydantic model. If `None`, a default name is
+                generated based on the name of this schema.
+        Returns:
+            A :mod:`pydantic` model class.
+        """
+        if cls._schema_validation_rules():
+            warnings.warn(
+                "Schema-level rules are not translated to pydantic validators."
+            )
+        model_name = name or f"{cls.__name__.removesuffix('Schema')}Model"
+        fields = {
+            col_name: col.pydantic_field() for col_name, col in cls.columns().items()
+        }
+        return pydantic.create_model(model_name, **fields)
     # ----------------------------------- EQUALITY ----------------------------------- #
     @classmethod

{dataframely-2.8.2 → dataframely-2.9.0}/docs/guides/coding-agents.md RENAMED Viewed

@@ -15,7 +15,7 @@ find it. For example, if you are using Claude Code:
 ```bash
 mkdir -p .claude/skills/dataframely/
-curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md
+curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/skills/SKILL.md
 ```
 or if you are using [skills.sh](https://skills.sh/) to manage your skills:

dataframely 2.8.2__tar.gz → 2.9.0__tar.gz

dataframely 2.8.2.tar.gz → 2.9.0.tar.gz