patito 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- patito/__init__.py +1 -0
- patito/_docs.py +1 -0
- patito/_pydantic/__init__.py +0 -1
- patito/_pydantic/column_info.py +21 -6
- patito/_pydantic/dtypes/dtypes.py +16 -25
- patito/_pydantic/dtypes/utils.py +2 -4
- patito/_pydantic/repr.py +2 -2
- patito/_pydantic/schema.py +2 -4
- patito/exceptions.py +13 -0
- patito/polars.py +32 -60
- patito/pydantic.py +51 -105
- patito/validators.py +142 -33
- {patito-0.6.1.dist-info → patito-0.6.2.dist-info}/METADATA +10 -21
- patito-0.6.2.dist-info/RECORD +17 -0
- patito/xdg.py +0 -24
- patito-0.6.1.dist-info/RECORD +0 -18
- {patito-0.6.1.dist-info → patito-0.6.2.dist-info}/LICENSE +0 -0
- {patito-0.6.1.dist-info → patito-0.6.2.dist-info}/WHEEL +0 -0
patito/__init__.py
CHANGED
patito/_docs.py
CHANGED
patito/_pydantic/__init__.py
CHANGED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
|
patito/_pydantic/column_info.py
CHANGED
|
@@ -19,10 +19,9 @@ from patito._pydantic.dtypes import parse_composite_dtype
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class ColumnInfo(BaseModel, arbitrary_types_allowed=True):
|
|
22
|
-
"""patito-side model for storing column metadata
|
|
22
|
+
"""patito-side model for storing column metadata.
|
|
23
23
|
|
|
24
24
|
Args:
|
|
25
|
-
----
|
|
26
25
|
constraints (Union[polars.Expression, List[polars.Expression]): A single
|
|
27
26
|
constraint or list of constraints, expressed as a polars expression objects.
|
|
28
27
|
All rows must satisfy the given constraint. You can refer to the given column
|
|
@@ -40,6 +39,22 @@ class ColumnInfo(BaseModel, arbitrary_types_allowed=True):
|
|
|
40
39
|
derived_from: Optional[Union[str, pl.Expr]] = None
|
|
41
40
|
unique: Optional[bool] = None
|
|
42
41
|
|
|
42
|
+
def __repr__(self) -> str:
|
|
43
|
+
"""Print only Field attributes whose values are not default (mainly None)."""
|
|
44
|
+
not_default_field = {
|
|
45
|
+
field: getattr(self, field)
|
|
46
|
+
for field in self.model_fields
|
|
47
|
+
if getattr(self, field) is not self.model_fields[field].default
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
string = ""
|
|
51
|
+
for field, value in not_default_field.items():
|
|
52
|
+
string += f"{field}={value}, "
|
|
53
|
+
if string:
|
|
54
|
+
# remove trailing comma and space
|
|
55
|
+
string = string[:-2]
|
|
56
|
+
return f"ColumnInfo({string})"
|
|
57
|
+
|
|
43
58
|
@field_serializer("constraints", "derived_from")
|
|
44
59
|
def serialize_exprs(self, exprs: str | pl.Expr | Sequence[pl.Expr] | None) -> Any:
|
|
45
60
|
if exprs is None:
|
|
@@ -56,17 +71,17 @@ class ColumnInfo(BaseModel, arbitrary_types_allowed=True):
|
|
|
56
71
|
def _serialize_expr(self, expr: pl.Expr) -> Dict:
|
|
57
72
|
if isinstance(expr, pl.Expr):
|
|
58
73
|
return json.loads(
|
|
59
|
-
expr.meta.
|
|
74
|
+
expr.meta.serialize(None)
|
|
60
75
|
) # can we access the dictionary directly?
|
|
61
76
|
else:
|
|
62
77
|
raise ValueError(f"Invalid type for expr: {type(expr)}")
|
|
63
78
|
|
|
64
79
|
@field_serializer("dtype")
|
|
65
80
|
def serialize_dtype(self, dtype: DataTypeClass | DataType | None) -> Any:
|
|
66
|
-
"""
|
|
67
|
-
----------
|
|
68
|
-
[1] https://stackoverflow.com/questions/76572310/how-to-serialize-deserialize-polars-datatypes
|
|
81
|
+
"""Serialize a polars dtype.
|
|
69
82
|
|
|
83
|
+
References:
|
|
84
|
+
[1] https://stackoverflow.com/questions/76572310/how-to-serialize-deserialize-polars-datatypes
|
|
70
85
|
"""
|
|
71
86
|
if dtype is None:
|
|
72
87
|
return None
|
|
@@ -39,24 +39,17 @@ def valid_dtypes_for_model(
|
|
|
39
39
|
@cache
|
|
40
40
|
def default_dtypes_for_model(
|
|
41
41
|
cls: Type[ModelType],
|
|
42
|
-
) -> dict[str,
|
|
43
|
-
default_dtypes = {}
|
|
42
|
+
) -> dict[str, DataType]:
|
|
43
|
+
default_dtypes: dict[str, DataType] = {}
|
|
44
44
|
for column in cls.columns:
|
|
45
|
-
dtype =
|
|
45
|
+
dtype = (
|
|
46
|
+
cls.column_infos[column].dtype
|
|
47
|
+
or DtypeResolver(cls.model_fields[column].annotation).default_polars_dtype()
|
|
48
|
+
)
|
|
46
49
|
if dtype is None:
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
if default_dtype is None:
|
|
51
|
-
raise ValueError(
|
|
52
|
-
f"Unable to find a default dtype for column `{column}`"
|
|
53
|
-
)
|
|
54
|
-
else:
|
|
55
|
-
default_dtypes[column] = default_dtype
|
|
56
|
-
else:
|
|
57
|
-
default_dtypes[column] = (
|
|
58
|
-
dtype if isinstance(dtype, DataType) else dtype()
|
|
59
|
-
) # if dtype is not instantiated, instantiate it
|
|
50
|
+
raise ValueError(f"Unable to find a default dtype for column `{column}`")
|
|
51
|
+
|
|
52
|
+
default_dtypes[column] = dtype if isinstance(dtype, DataType) else dtype()
|
|
60
53
|
return default_dtypes
|
|
61
54
|
|
|
62
55
|
|
|
@@ -68,7 +61,6 @@ def validate_polars_dtype(
|
|
|
68
61
|
"""Check that the polars dtype is valid for the given annotation. Raises ValueError if not.
|
|
69
62
|
|
|
70
63
|
Args:
|
|
71
|
-
----
|
|
72
64
|
annotation (type[Any] | None): python type annotation
|
|
73
65
|
dtype (DataType | DataTypeClass | None): polars dtype
|
|
74
66
|
column (Optional[str], optional): column name. Defaults to None.
|
|
@@ -96,7 +88,6 @@ def validate_annotation(
|
|
|
96
88
|
"""Check that the provided annotation has polars/patito support (we can resolve it to a default dtype). Raises ValueError if not.
|
|
97
89
|
|
|
98
90
|
Args:
|
|
99
|
-
----
|
|
100
91
|
annotation (type[Any] | None): python type annotation
|
|
101
92
|
column (Optional[str], optional): column name. Defaults to None.
|
|
102
93
|
|
|
@@ -130,9 +121,9 @@ class DtypeResolver:
|
|
|
130
121
|
return PT_BASE_SUPPORTED_DTYPES
|
|
131
122
|
return self._valid_polars_dtypes_for_schema(self.schema)
|
|
132
123
|
|
|
133
|
-
def default_polars_dtype(self) ->
|
|
124
|
+
def default_polars_dtype(self) -> DataType | None:
|
|
134
125
|
if self.annotation == Any:
|
|
135
|
-
return pl.String
|
|
126
|
+
return pl.String()
|
|
136
127
|
return self._default_polars_dtype_for_schema(self.schema)
|
|
137
128
|
|
|
138
129
|
def _valid_polars_dtypes_for_schema(
|
|
@@ -197,9 +188,7 @@ class DtypeResolver:
|
|
|
197
188
|
PydanticBaseType(pyd_type), props.get("format"), props.get("enum")
|
|
198
189
|
)
|
|
199
190
|
|
|
200
|
-
def _default_polars_dtype_for_schema(
|
|
201
|
-
self, schema: Dict
|
|
202
|
-
) -> DataTypeClass | DataType | None:
|
|
191
|
+
def _default_polars_dtype_for_schema(self, schema: Dict) -> DataType | None:
|
|
203
192
|
if "anyOf" in schema:
|
|
204
193
|
if len(schema["anyOf"]) == 2: # look for optionals first
|
|
205
194
|
schema = _without_optional(schema)
|
|
@@ -216,10 +205,12 @@ class DtypeResolver:
|
|
|
216
205
|
def _pydantic_subschema_to_default_dtype(
|
|
217
206
|
self,
|
|
218
207
|
props: Dict,
|
|
219
|
-
) ->
|
|
208
|
+
) -> DataType | None:
|
|
220
209
|
if "column_info" in props: # user has specified in patito model
|
|
221
210
|
if props["column_info"]["dtype"] is not None:
|
|
222
|
-
|
|
211
|
+
dtype = dtype_from_string(props["column_info"]["dtype"])
|
|
212
|
+
dtype = dtype() if isinstance(dtype, DataTypeClass) else dtype
|
|
213
|
+
return dtype
|
|
223
214
|
if "type" not in props:
|
|
224
215
|
if "enum" in props:
|
|
225
216
|
raise TypeError("Mixed type enums not supported by patito.")
|
patito/_pydantic/dtypes/utils.py
CHANGED
|
@@ -78,11 +78,9 @@ def is_optional(type_annotation: type[Any] | Any | None) -> bool:
|
|
|
78
78
|
"""Return True if the given type annotation is an Optional annotation.
|
|
79
79
|
|
|
80
80
|
Args:
|
|
81
|
-
----
|
|
82
81
|
type_annotation: The type annotation to be checked.
|
|
83
82
|
|
|
84
83
|
Returns:
|
|
85
|
-
-------
|
|
86
84
|
True if the outermost type is Optional.
|
|
87
85
|
|
|
88
86
|
"""
|
|
@@ -92,7 +90,7 @@ def is_optional(type_annotation: type[Any] | Any | None) -> bool:
|
|
|
92
90
|
|
|
93
91
|
|
|
94
92
|
def parse_composite_dtype(dtype: DataTypeClass | DataType) -> str:
|
|
95
|
-
"""For serialization, converts polars dtype to string representation"""
|
|
93
|
+
"""For serialization, converts polars dtype to string representation."""
|
|
96
94
|
if dtype in pl.NESTED_DTYPES:
|
|
97
95
|
if dtype == pl.Struct or isinstance(dtype, pl.Struct):
|
|
98
96
|
raise NotImplementedError("Structs not yet supported by patito")
|
|
@@ -110,7 +108,7 @@ def parse_composite_dtype(dtype: DataTypeClass | DataType) -> str:
|
|
|
110
108
|
|
|
111
109
|
|
|
112
110
|
def dtype_from_string(v: str) -> Optional[Union[DataTypeClass, DataType]]:
|
|
113
|
-
"""For deserialization"""
|
|
111
|
+
"""For deserialization."""
|
|
114
112
|
# TODO test all dtypes
|
|
115
113
|
return convert.dtype_short_repr_to_dtype(v)
|
|
116
114
|
|
patito/_pydantic/repr.py
CHANGED
|
@@ -82,7 +82,7 @@ class Representation:
|
|
|
82
82
|
def __pretty__(
|
|
83
83
|
self, fmt: Callable[[Any], Any], **kwargs: Any
|
|
84
84
|
) -> Generator[Any, None, None]:
|
|
85
|
-
"""Used by devtools (https://python-devtools.helpmanual.io/) to provide a human readable representations of objects"""
|
|
85
|
+
"""Used by devtools (https://python-devtools.helpmanual.io/) to provide a human readable representations of objects."""
|
|
86
86
|
yield self.__repr_name__() + "("
|
|
87
87
|
yield 1
|
|
88
88
|
for name, value in self.__repr_args__():
|
|
@@ -101,7 +101,7 @@ class Representation:
|
|
|
101
101
|
return f'{self.__repr_name__()}({self.__repr_str__(", ")})'
|
|
102
102
|
|
|
103
103
|
def __rich_repr__(self) -> "RichReprResult":
|
|
104
|
-
"""Get fields for Rich library"""
|
|
104
|
+
"""Get fields for Rich library."""
|
|
105
105
|
for name, field_repr in self.__repr_args__():
|
|
106
106
|
if name is None:
|
|
107
107
|
yield field_repr
|
patito/_pydantic/schema.py
CHANGED
|
@@ -16,14 +16,12 @@ if TYPE_CHECKING:
|
|
|
16
16
|
def schema_for_model(cls: Type[ModelType]) -> Dict[str, Dict[str, Any]]:
|
|
17
17
|
"""Return schema properties where definition references have been resolved.
|
|
18
18
|
|
|
19
|
-
Returns
|
|
20
|
-
-------
|
|
19
|
+
Returns:
|
|
21
20
|
Field information as a dictionary where the keys are field names and the
|
|
22
21
|
values are dictionaries containing metadata information about the field
|
|
23
22
|
itself.
|
|
24
23
|
|
|
25
|
-
Raises
|
|
26
|
-
------
|
|
24
|
+
Raises:
|
|
27
25
|
TypeError: if a field is annotated with an enum where the values are of
|
|
28
26
|
different types.
|
|
29
27
|
|
patito/exceptions.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Exceptions used by patito."""
|
|
2
|
+
|
|
1
3
|
from typing import (
|
|
2
4
|
TYPE_CHECKING,
|
|
3
5
|
Any,
|
|
@@ -34,19 +36,24 @@ __all__ = "ErrorWrapper", "DataFrameValidationError"
|
|
|
34
36
|
|
|
35
37
|
|
|
36
38
|
class ErrorWrapper(Representation):
|
|
39
|
+
"""Error handler for nicely accumulating errors."""
|
|
40
|
+
|
|
37
41
|
__slots__ = "exc", "_loc"
|
|
38
42
|
|
|
39
43
|
def __init__(self, exc: Exception, loc: Union[str, "Loc"]) -> None:
|
|
44
|
+
"""Wrap an error in an ErrorWrapper."""
|
|
40
45
|
self.exc = exc
|
|
41
46
|
self._loc = loc
|
|
42
47
|
|
|
43
48
|
def loc_tuple(self) -> "Loc":
|
|
49
|
+
"""Represent error as tuple."""
|
|
44
50
|
if isinstance(self._loc, tuple):
|
|
45
51
|
return self._loc
|
|
46
52
|
else:
|
|
47
53
|
return (self._loc,)
|
|
48
54
|
|
|
49
55
|
def __repr_args__(self) -> "ReprArgs":
|
|
56
|
+
"""Pydantic repr."""
|
|
50
57
|
return [("exc", self.exc), ("loc", self.loc_tuple())]
|
|
51
58
|
|
|
52
59
|
|
|
@@ -56,19 +63,24 @@ ErrorList = Union[Sequence[Any], ErrorWrapper]
|
|
|
56
63
|
|
|
57
64
|
|
|
58
65
|
class DataFrameValidationError(Representation, ValueError):
|
|
66
|
+
"""Parent error for DataFrame validation errors."""
|
|
67
|
+
|
|
59
68
|
__slots__ = "raw_errors", "model", "_error_cache"
|
|
60
69
|
|
|
61
70
|
def __init__(self, errors: Sequence[ErrorList], model: Type["BaseModel"]) -> None:
|
|
71
|
+
"""Create a dataframe validation error."""
|
|
62
72
|
self.raw_errors = errors
|
|
63
73
|
self.model = model
|
|
64
74
|
self._error_cache: Optional[List["ErrorDict"]] = None
|
|
65
75
|
|
|
66
76
|
def errors(self) -> List["ErrorDict"]:
|
|
77
|
+
"""Get list of errors."""
|
|
67
78
|
if self._error_cache is None:
|
|
68
79
|
self._error_cache = list(flatten_errors(self.raw_errors))
|
|
69
80
|
return self._error_cache
|
|
70
81
|
|
|
71
82
|
def __str__(self) -> str:
|
|
83
|
+
"""String reprentation of error."""
|
|
72
84
|
errors = self.errors()
|
|
73
85
|
no_errors = len(errors)
|
|
74
86
|
return (
|
|
@@ -77,6 +89,7 @@ class DataFrameValidationError(Representation, ValueError):
|
|
|
77
89
|
)
|
|
78
90
|
|
|
79
91
|
def __repr_args__(self) -> "ReprArgs":
|
|
92
|
+
"""Pydantic repr."""
|
|
80
93
|
return [("model", self.model.__name__), ("errors", self.errors())]
|
|
81
94
|
|
|
82
95
|
|
patito/polars.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import (
|
|
|
9
9
|
Dict,
|
|
10
10
|
Generic,
|
|
11
11
|
Iterable,
|
|
12
|
+
Literal,
|
|
12
13
|
Optional,
|
|
13
14
|
Sequence,
|
|
14
15
|
Tuple,
|
|
@@ -21,7 +22,6 @@ from typing import (
|
|
|
21
22
|
import polars as pl
|
|
22
23
|
from polars.type_aliases import IntoExpr
|
|
23
24
|
from pydantic import AliasChoices, AliasPath, create_model
|
|
24
|
-
from typing_extensions import Literal
|
|
25
25
|
|
|
26
26
|
from patito._pydantic.column_info import ColumnInfo
|
|
27
27
|
from patito.exceptions import MultipleRowsReturned, RowDoesNotExist
|
|
@@ -53,12 +53,10 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
53
53
|
DataFrame.set_model(model) is implicitly invoked at collection.
|
|
54
54
|
|
|
55
55
|
Args:
|
|
56
|
-
----
|
|
57
56
|
model: A patito model which should be used to validate the final dataframe.
|
|
58
57
|
If None is provided, the regular LazyFrame class will be returned.
|
|
59
58
|
|
|
60
59
|
Returns:
|
|
61
|
-
-------
|
|
62
60
|
A custom LazyFrame model class where LazyFrame.model has been correctly
|
|
63
61
|
"hard-coded" to the given model.
|
|
64
62
|
|
|
@@ -101,21 +99,17 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
101
99
|
result of which will be used to populate the column values.
|
|
102
100
|
|
|
103
101
|
Args:
|
|
104
|
-
----
|
|
105
102
|
columns: Optionally, a list of column names to derive. If not provided, all
|
|
106
103
|
columns are used.
|
|
107
104
|
|
|
108
105
|
Returns:
|
|
109
|
-
-------
|
|
110
106
|
DataFrame[Model]: A new dataframe where all derivable columns are provided.
|
|
111
107
|
|
|
112
108
|
Raises:
|
|
113
|
-
------
|
|
114
109
|
TypeError: If the ``derived_from`` parameter of ``patito.Field`` is given
|
|
115
110
|
as something else than a string or polars expression.
|
|
116
111
|
|
|
117
112
|
Examples:
|
|
118
|
-
--------
|
|
119
113
|
>>> import patito as pt
|
|
120
114
|
>>> import polars as pl
|
|
121
115
|
>>> class Foo(pt.Model):
|
|
@@ -189,8 +183,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
189
183
|
|
|
190
184
|
limitation - AliasChoice validation type only supports selecting a single element of an array
|
|
191
185
|
|
|
192
|
-
Returns
|
|
193
|
-
-------
|
|
186
|
+
Returns:
|
|
194
187
|
DataFrame[Model]: A dataframe with columns normalized to model names.
|
|
195
188
|
|
|
196
189
|
"""
|
|
@@ -247,7 +240,6 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
247
240
|
"""Cast columns to `dtypes` specified by the associated Patito model.
|
|
248
241
|
|
|
249
242
|
Args:
|
|
250
|
-
----
|
|
251
243
|
strict: If set to ``False``, columns which are technically compliant with
|
|
252
244
|
the specified field type, will not be casted. For example, a column
|
|
253
245
|
annotated with ``int`` is technically compliant with ``pl.UInt8``, even
|
|
@@ -258,11 +250,9 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
258
250
|
columns are casted.
|
|
259
251
|
|
|
260
252
|
Returns:
|
|
261
|
-
-------
|
|
262
253
|
LazyFrame[Model]: A dataframe with columns casted to the correct dtypes.
|
|
263
254
|
|
|
264
255
|
Examples:
|
|
265
|
-
--------
|
|
266
256
|
Create a simple model:
|
|
267
257
|
|
|
268
258
|
>>> import patito as pt
|
|
@@ -348,11 +338,9 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
348
338
|
DataFrame.set_model(model) is implicitly invoked at instantiation.
|
|
349
339
|
|
|
350
340
|
Args:
|
|
351
|
-
----
|
|
352
341
|
model: A patito model which should be used to validate the dataframe.
|
|
353
342
|
|
|
354
343
|
Returns:
|
|
355
|
-
-------
|
|
356
344
|
A custom DataFrame model class where DataFrame._model has been correctly
|
|
357
345
|
"hard-coded" to the given model.
|
|
358
346
|
|
|
@@ -369,15 +357,14 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
369
357
|
|
|
370
358
|
See documentation of polars.DataFrame.lazy() for full description.
|
|
371
359
|
|
|
372
|
-
Returns
|
|
373
|
-
-------
|
|
360
|
+
Returns:
|
|
374
361
|
A new LazyFrame object.
|
|
375
362
|
|
|
376
363
|
"""
|
|
377
|
-
lazyframe_class: LazyFrame[
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
364
|
+
lazyframe_class: LazyFrame[ModelType] = (
|
|
365
|
+
LazyFrame._construct_lazyframe_model_class(
|
|
366
|
+
model=getattr(self, "model", None)
|
|
367
|
+
)
|
|
381
368
|
) # type: ignore
|
|
382
369
|
ldf = lazyframe_class._from_pyldf(super().lazy()._ldf)
|
|
383
370
|
return ldf
|
|
@@ -392,17 +379,14 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
392
379
|
``DataFrame(...).set_model(Model)`` is equivalent with ``Model.DataFrame(...)``.
|
|
393
380
|
|
|
394
381
|
Args:
|
|
395
|
-
----
|
|
396
382
|
model (Model): Sub-class of ``patito.Model`` declaring the schema of the
|
|
397
383
|
dataframe.
|
|
398
384
|
|
|
399
385
|
Returns:
|
|
400
|
-
-------
|
|
401
386
|
DataFrame[Model]: Returns the same dataframe, but with an attached model
|
|
402
387
|
that is required for certain model-specific dataframe methods to work.
|
|
403
388
|
|
|
404
389
|
Examples:
|
|
405
|
-
--------
|
|
406
390
|
>>> from typing_extensions import Literal
|
|
407
391
|
>>> import patito as pt
|
|
408
392
|
>>> import polars as pl
|
|
@@ -454,8 +438,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
454
438
|
|
|
455
439
|
limitation - AliasChoice validation type only supports selecting a single element of an array
|
|
456
440
|
|
|
457
|
-
Returns
|
|
458
|
-
-------
|
|
441
|
+
Returns:
|
|
459
442
|
DataFrame[Model]: A dataframe with columns normalized to model names.
|
|
460
443
|
|
|
461
444
|
"""
|
|
@@ -467,7 +450,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
467
450
|
"""Cast columns to `dtypes` specified by the associated Patito model.
|
|
468
451
|
|
|
469
452
|
Args:
|
|
470
|
-
----
|
|
471
453
|
strict: If set to ``False``, columns which are technically compliant with
|
|
472
454
|
the specified field type, will not be casted. For example, a column
|
|
473
455
|
annotated with ``int`` is technically compliant with ``pl.UInt8``, even
|
|
@@ -478,11 +460,9 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
478
460
|
columns are casted.
|
|
479
461
|
|
|
480
462
|
Returns:
|
|
481
|
-
-------
|
|
482
463
|
DataFrame[Model]: A dataframe with columns casted to the correct dtypes.
|
|
483
464
|
|
|
484
465
|
Examples:
|
|
485
|
-
--------
|
|
486
466
|
Create a simple model:
|
|
487
467
|
|
|
488
468
|
>>> import patito as pt
|
|
@@ -519,18 +499,15 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
519
499
|
:ref:`DataFrame.set_model <DataFrame.set_model>`, are dropped.
|
|
520
500
|
|
|
521
501
|
Args:
|
|
522
|
-
----
|
|
523
502
|
columns: A single column string name, or list of strings, indicating
|
|
524
503
|
which columns to drop. If not specified, all columns *not*
|
|
525
504
|
specified by the associated dataframe model will be dropped.
|
|
526
505
|
more_columns: Additional named columns to drop.
|
|
527
506
|
|
|
528
507
|
Returns:
|
|
529
|
-
-------
|
|
530
508
|
DataFrame[Model]: New dataframe without the specified columns.
|
|
531
509
|
|
|
532
510
|
Examples:
|
|
533
|
-
--------
|
|
534
511
|
>>> import patito as pt
|
|
535
512
|
>>> class Model(pt.Model):
|
|
536
513
|
... column_1: int
|
|
@@ -552,20 +529,16 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
552
529
|
else:
|
|
553
530
|
return self.drop(list(set(self.columns) - set(self.model.columns)))
|
|
554
531
|
|
|
555
|
-
def validate(
|
|
556
|
-
self: DF, columns: Optional[Sequence[str]] = None, **kwargs: Any
|
|
557
|
-
) -> DF:
|
|
532
|
+
def validate(self, columns: Optional[Sequence[str]] = None, **kwargs: Any):
|
|
558
533
|
"""Validate the schema and content of the dataframe.
|
|
559
534
|
|
|
560
535
|
You must invoke ``.set_model()`` before invoking ``.validate()`` in order
|
|
561
536
|
to specify how the dataframe should be validated.
|
|
562
537
|
|
|
563
|
-
Returns
|
|
564
|
-
-------
|
|
538
|
+
Returns:
|
|
565
539
|
DataFrame[Model]: The original dataframe, if correctly validated.
|
|
566
540
|
|
|
567
|
-
Raises
|
|
568
|
-
------
|
|
541
|
+
Raises:
|
|
569
542
|
TypeError: If ``DataFrame.set_model()`` has not been invoked prior to
|
|
570
543
|
validation. Note that ``patito.Model.DataFrame`` automatically invokes
|
|
571
544
|
``DataFrame.set_model()`` for you.
|
|
@@ -573,8 +546,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
573
546
|
patito.exceptions.DataFrameValidationError: If the dataframe does not match the
|
|
574
547
|
specified schema.
|
|
575
548
|
|
|
576
|
-
Examples
|
|
577
|
-
--------
|
|
549
|
+
Examples:
|
|
578
550
|
>>> import patito as pt
|
|
579
551
|
|
|
580
552
|
|
|
@@ -621,17 +593,14 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
621
593
|
column name. Alternatively, an arbitrary polars expression can be given, the
|
|
622
594
|
result of which will be used to populate the column values.
|
|
623
595
|
|
|
624
|
-
Returns
|
|
625
|
-
-------
|
|
596
|
+
Returns:
|
|
626
597
|
DataFrame[Model]: A new dataframe where all derivable columns are provided.
|
|
627
598
|
|
|
628
|
-
Raises
|
|
629
|
-
------
|
|
599
|
+
Raises:
|
|
630
600
|
TypeError: If the ``derived_from`` parameter of ``patito.Field`` is given
|
|
631
601
|
as something else than a string or polars expression.
|
|
632
602
|
|
|
633
|
-
Examples
|
|
634
|
-
--------
|
|
603
|
+
Examples:
|
|
635
604
|
>>> import patito as pt
|
|
636
605
|
>>> import polars as pl
|
|
637
606
|
>>> class Foo(pt.Model):
|
|
@@ -665,11 +634,10 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
665
634
|
) -> DF:
|
|
666
635
|
"""Fill null values using a filling strategy, literal, or ``Expr``.
|
|
667
636
|
|
|
668
|
-
If ``"
|
|
637
|
+
If ``"defaults"`` is provided as the strategy, the model fields with default
|
|
669
638
|
values are used to fill missing values.
|
|
670
639
|
|
|
671
640
|
Args:
|
|
672
|
-
----
|
|
673
641
|
value: Value used to fill null values.
|
|
674
642
|
strategy: Accepts the same arguments as ``polars.DataFrame.fill_null`` in
|
|
675
643
|
addition to ``"defaults"`` which will use the field's default value if
|
|
@@ -680,12 +648,10 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
680
648
|
|
|
681
649
|
|
|
682
650
|
Returns:
|
|
683
|
-
-------
|
|
684
651
|
DataFrame[Model]: A new dataframe with nulls filled in according to the
|
|
685
652
|
provided ``strategy`` parameter.
|
|
686
653
|
|
|
687
654
|
Example:
|
|
688
|
-
-------
|
|
689
655
|
>>> import patito as pt
|
|
690
656
|
>>> class Product(pt.Model):
|
|
691
657
|
... name: str
|
|
@@ -737,7 +703,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
737
703
|
you can use ``.get()`` without any arguments to return that row.
|
|
738
704
|
|
|
739
705
|
Raises:
|
|
740
|
-
------
|
|
741
706
|
RowDoesNotExist: If zero rows evaluate to true for the given predicate.
|
|
742
707
|
MultipleRowsReturned: If more than one row evaluates to true for the given
|
|
743
708
|
predicate.
|
|
@@ -746,15 +711,12 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
746
711
|
same class.
|
|
747
712
|
|
|
748
713
|
Args:
|
|
749
|
-
----
|
|
750
714
|
predicate: A polars expression defining the criteria of the filter.
|
|
751
715
|
|
|
752
716
|
Returns:
|
|
753
|
-
-------
|
|
754
717
|
Model: A pydantic-derived base model representing the given row.
|
|
755
718
|
|
|
756
719
|
Example:
|
|
757
|
-
-------
|
|
758
720
|
>>> import patito as pt
|
|
759
721
|
>>> import polars as pl
|
|
760
722
|
>>> df = pt.DataFrame({"product_id": [1, 2, 3], "price": [10, 10, 20]})
|
|
@@ -819,8 +781,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
819
781
|
def _pydantic_model(self) -> Type[Model]:
|
|
820
782
|
"""Dynamically construct patito model compliant with dataframe.
|
|
821
783
|
|
|
822
|
-
Returns
|
|
823
|
-
-------
|
|
784
|
+
Returns:
|
|
824
785
|
A pydantic model class where all the rows have been specified as
|
|
825
786
|
`typing.Any` fields.
|
|
826
787
|
|
|
@@ -853,16 +814,13 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
853
814
|
to populate the given column(s).
|
|
854
815
|
|
|
855
816
|
Args:
|
|
856
|
-
----
|
|
857
817
|
*args: All positional arguments are forwarded to ``polars.read_csv``.
|
|
858
818
|
**kwargs: All keyword arguments are forwarded to ``polars.read_csv``.
|
|
859
819
|
|
|
860
820
|
Returns:
|
|
861
|
-
-------
|
|
862
821
|
DataFrame[Model]: A dataframe representing the given CSV file data.
|
|
863
822
|
|
|
864
823
|
Examples:
|
|
865
|
-
--------
|
|
866
824
|
The ``DataFrame.read_csv`` method can be used to automatically set the
|
|
867
825
|
correct column names when reading CSV files without headers.
|
|
868
826
|
|
|
@@ -908,8 +866,22 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
908
866
|
|
|
909
867
|
"""
|
|
910
868
|
kwargs.setdefault("dtypes", cls.model.dtypes)
|
|
911
|
-
|
|
869
|
+
has_header = kwargs.get("has_header", True)
|
|
870
|
+
if not has_header and "columns" not in kwargs:
|
|
912
871
|
kwargs.setdefault("new_columns", cls.model.columns)
|
|
872
|
+
alias_gen = cls.model.model_config.get("alias_generator")
|
|
873
|
+
if alias_gen:
|
|
874
|
+
alias_func = alias_gen.validation_alias or alias_gen.alias
|
|
875
|
+
if has_header and alias_gen and alias_func:
|
|
876
|
+
fields_to_cols = {
|
|
877
|
+
field_name: alias_func(field_name)
|
|
878
|
+
for field_name in cls.model.model_fields
|
|
879
|
+
}
|
|
880
|
+
kwargs["dtypes"] = {
|
|
881
|
+
fields_to_cols.get(field, field): dtype
|
|
882
|
+
for field, dtype in kwargs["dtypes"].items()
|
|
883
|
+
}
|
|
884
|
+
# TODO: other forms of alias setting like in Field
|
|
913
885
|
df = cls.model.DataFrame._from_pydf(pl.read_csv(*args, **kwargs)._df)
|
|
914
886
|
return df.derive()
|
|
915
887
|
|