patito 0.7.0__py3-none-any.whl → 0.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- patito/_pydantic/column_info.py +100 -45
- patito/_pydantic/dtypes/dtypes.py +44 -36
- patito/_pydantic/dtypes/utils.py +30 -27
- patito/_pydantic/repr.py +7 -15
- patito/_pydantic/schema.py +10 -9
- patito/exceptions.py +11 -16
- patito/polars.py +191 -118
- patito/pydantic.py +108 -95
- patito/validators.py +111 -71
- {patito-0.7.0.dist-info → patito-0.8.2.dist-info}/METADATA +4 -3
- patito-0.8.2.dist-info/RECORD +17 -0
- {patito-0.7.0.dist-info → patito-0.8.2.dist-info}/WHEEL +1 -1
- patito-0.7.0.dist-info/RECORD +0 -17
- {patito-0.7.0.dist-info → patito-0.8.2.dist-info}/LICENSE +0 -0
patito/exceptions.py
CHANGED
@@ -1,15 +1,10 @@
 """Exceptions used by patito."""
 
+from collections.abc import Generator, Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
-    Dict,
-    Generator,
-    List,
     Optional,
-    Sequence,
-    Tuple,
-    Type,
     TypedDict,
     Union,
 )
@@ -19,7 +14,7 @@ from patito._pydantic.repr import Representation
 if TYPE_CHECKING:
     from pydantic import BaseModel
 
-    Loc =
+    Loc = tuple[Union[int, str], ...]
 
     class _ErrorDictRequired(TypedDict):
         loc: Loc
@@ -27,7 +22,7 @@ if TYPE_CHECKING:
         type: str
 
     class ErrorDict(_ErrorDictRequired, total=False):
-        ctx:
+        ctx: dict[str, Any]
 
     from patito._pydantic.repr import ReprArgs
 
@@ -67,13 +62,13 @@ class DataFrameValidationError(Representation, ValueError):
 
     __slots__ = "raw_errors", "model", "_error_cache"
 
-    def __init__(self, errors: Sequence[ErrorList], model:
+    def __init__(self, errors: Sequence[ErrorList], model: type["BaseModel"]) -> None:
         """Create a dataframe validation error."""
         self.raw_errors = errors
         self.model = model
-        self._error_cache: Optional[
+        self._error_cache: Optional[list[ErrorDict]] = None
 
-    def errors(self) ->
+    def errors(self) -> list["ErrorDict"]:
         """Get list of errors."""
         if self._error_cache is None:
             self._error_cache = list(flatten_errors(self.raw_errors))
@@ -93,7 +88,7 @@ class DataFrameValidationError(Representation, ValueError):
         return [("model", self.model.__name__), ("errors", self.errors())]
 
 
-def display_errors(errors:
+def display_errors(errors: list["ErrorDict"]) -> str:
     return "\n".join(
         f'{_display_error_loc(e)}\n {e["msg"]} ({_display_error_type_and_ctx(e)})'
         for e in errors
@@ -142,7 +137,7 @@ def error_dict(exc: Exception, loc: "Loc") -> "ErrorDict":
     else:
         msg = str(exc)
 
-    d:
+    d: ErrorDict = {"loc": loc, "msg": msg, "type": type_}
 
     if ctx:
         d["ctx"] = ctx
@@ -150,10 +145,10 @@ def error_dict(exc: Exception, loc: "Loc") -> "ErrorDict":
     return d
 
 
-_EXC_TYPE_CACHE:
+_EXC_TYPE_CACHE: dict[type[Exception], str] = {}
 
 
-def get_exc_type(cls:
+def get_exc_type(cls: type[Exception]) -> str:
     # slightly more efficient than using lru_cache since we don't need to worry about the cache filling up
     try:
         return _EXC_TYPE_CACHE[cls]
@@ -163,7 +158,7 @@ def get_exc_type(cls: Type[Exception]) -> str:
         return r
 
 
-def _get_exc_type(cls:
+def _get_exc_type(cls: type[Exception]) -> str:
     if issubclass(cls, AssertionError):
         return "assertion_error"
 
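The changes to patito/exceptions.py are a typing cleanup: Generator and Sequence now come from collections.abc, and Dict/List/Tuple/Type annotations are replaced by the builtin generics. The public surface is unchanged. As a rough sketch (not taken from the wheel) of how DataFrameValidationError and its errors() list are consumed:

    import patito as pt
    from patito.exceptions import DataFrameValidationError

    class Product(pt.Model):
        product_id: int = pt.Field(unique=True)
        price: float

    df = pt.DataFrame({"product_id": [1, 1], "price": [10.0, 20.0]}).set_model(Product)
    try:
        df.validate()  # the duplicated product_id violates the unique constraint
    except DataFrameValidationError as exc:
        # Each entry is an ErrorDict with "loc", "msg" and "type" keys, as typed above.
        for error in exc.errors():
            print(error["loc"], error["msg"], error["type"])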
patito/polars.py
CHANGED
@@ -2,20 +2,13 @@
 
 from __future__ import annotations
 
+from collections.abc import Collection, Iterable, Iterator, Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
-    Collection,
-    Dict,
     Generic,
-    Iterable,
     Literal,
-    Optional,
-    Sequence,
-    Tuple,
-    Type,
     TypeVar,
-    Union,
     cast,
 )
 
@@ -31,63 +24,110 @@ if TYPE_CHECKING:
 
     from patito.pydantic import Model
 
-
 DF = TypeVar("DF", bound="DataFrame")
 LDF = TypeVar("LDF", bound="LazyFrame")
 ModelType = TypeVar("ModelType", bound="Model")
 OtherModelType = TypeVar("OtherModelType", bound="Model")
+T = TypeVar("T")
+
+
+class ModelGenerator(Iterator[ModelType], Generic[ModelType]):
+    """An iterator that can be converted to a list."""
+
+    def __init__(self, iterator: Iterator[ModelType]) -> None:
+        """Construct a ModelGenerator from an iterator."""
+        self._iterator = iterator
+
+    def to_list(self) -> list[ModelType]:
+        """Convert iterator to list."""
+        return list(self)
+
+    def __next__(self) -> ModelType:  # noqa: D105
+        return next(self._iterator)
+
+    def __iter__(self) -> Iterator[ModelType]:  # noqa: D105
+        return self
 
 
 class LazyFrame(pl.LazyFrame, Generic[ModelType]):
     """LazyFrame class associated to DataFrame."""
 
-    model:
+    model: type[ModelType]
 
-
-
-        cls: Type[LDF], model: Optional[Type[ModelType]]
-    ) -> Type[LazyFrame[ModelType]]:
-        """Return custom LazyFrame sub-class where LazyFrame.model is set.
+    def set_model(self, model: type[OtherModelType]) -> LazyFrame[OtherModelType]:
+        """Associate a given patito ``Model`` with the dataframe.
 
-
-
+        The model schema is used by methods that depend on a model being associated with
+        the given dataframe such as :ref:`DataFrame.validate() <DataFrame.validate>`
+        and :ref:`DataFrame.get() <DataFrame.get>`.
+
+        ``DataFrame(...).set_model(Model)`` is equivalent with ``Model.DataFrame(...)``.
 
         Args:
-            model:
-
+            model (Model): Sub-class of ``patito.Model`` declaring the schema of the
+                dataframe.
 
         Returns:
-
-
+            DataFrame[Model]: Returns the same dataframe, but with an attached model
+            that is required for certain model-specific dataframe methods to work.
 
-
-
-
+        Examples:
+            >>> from typing_extensions import Literal
+            >>> import patito as pt
+            >>> import polars as pl
+            >>> class SchoolClass(pt.Model):
+            ...     year: int = pt.Field(dtype=pl.UInt16)
+            ...     letter: Literal["A", "B"] = pt.Field(dtype=pl.Categorical)
+            ...
+            >>> classes = pt.DataFrame(
+            ...     {"year": [1, 1, 2, 2], "letter": list("ABAB")}
+            ... ).set_model(SchoolClass)
+            >>> classes
+            shape: (4, 2)
+            ┌──────┬────────┐
+            │ year ┆ letter │
+            │ ---  ┆ ---    │
+            │ i64  ┆ str    │
+            ╞══════╪════════╡
+            │ 1    ┆ A      │
+            │ 1    ┆ B      │
+            │ 2    ┆ A      │
+            │ 2    ┆ B      │
+            └──────┴────────┘
+            >>> casted_classes = classes.cast()
+            >>> casted_classes
+            shape: (4, 2)
+            ┌──────┬────────┐
+            │ year ┆ letter │
+            │ ---  ┆ ---    │
+            │ u16  ┆ cat    │
+            ╞══════╪════════╡
+            │ 1    ┆ A      │
+            │ 1    ┆ B      │
+            │ 2    ┆ A      │
+            │ 2    ┆ B      │
+            └──────┴────────┘
+            >>> casted_classes.validate()
 
-
-
-            (cls,),
-            {"model": model},
-        )
-        return new_class
+        """
+        return model.LazyFrame._from_pyldf(self._ldf)  # type: ignore
 
     def collect(
         self,
         *args,
         **kwargs,
-    ) ->
+    ) -> DataFrame[ModelType]:  # noqa: DAR101, DAR201
         """Collect into a DataFrame.
 
         See documentation of polars.DataFrame.collect for full description of
         parameters.
         """
         background = kwargs.pop("background", False)
-        df = super().collect(*args, background=background, **kwargs)
+        df: pl.DataFrame = super().collect(*args, background=background, **kwargs)
+        df = DataFrame(df)
         if getattr(self, "model", False):
-
-
-            cls = DataFrame
-        return cls._from_pydf(df._df)
+            df = df.set_model(self.model)
+        return df
 
     def derive(self: LDF, columns: list[str] | None = None) -> LDF:
         """Populate columns which have ``pt.Field(derived_from=...)`` definitions.
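The hunk above removes the dynamically built LazyFrame subclass: set_model() is now a plain method that hands off to model.LazyFrame, and collect() wraps the result in a patito DataFrame and re-attaches the model. A minimal sketch of the intended round-trip (the Interval model is made up for illustration):

    import patito as pt

    class Interval(pt.Model):
        start: int
        stop: int

    lf = pt.DataFrame({"start": [0, 5], "stop": [3, 9]}).lazy().set_model(Interval)
    # collect() returns a patito DataFrame with Interval still attached, so
    # model-aware methods such as validate() keep working after the round-trip.
    df = lf.cast().collect().validate()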
@@ -148,33 +188,35 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
 
     def _derive_column(
         self,
-
+        lf: LDF,
         column_name: str,
-        column_infos:
-    ) ->
+        column_infos: dict[str, ColumnInfo],
+    ) -> tuple[LDF, Sequence[str]]:
         if (
             column_infos.get(column_name, None) is None
             or column_infos[column_name].derived_from is None
         ):
-            return
+            return lf, []
+
         derived_from = column_infos[column_name].derived_from
         dtype = self.model.dtypes[column_name]
         derived_columns = []
+
         if isinstance(derived_from, str):
-
+            lf = lf.with_columns(pl.col(derived_from).cast(dtype).alias(column_name))
         elif isinstance(derived_from, pl.Expr):
             root_cols = derived_from.meta.root_names()
             while root_cols:
                 root_col = root_cols.pop()
-
+                lf, _derived_columns = self._derive_column(lf, root_col, column_infos)
                 derived_columns.extend(_derived_columns)
-
+            lf = lf.with_columns(derived_from.cast(dtype).alias(column_name))
         else:
             raise TypeError(
                 "Can not derive dataframe column from type " f"{type(derived_from)}."
             )
         derived_columns.append(column_name)
-        return
+        return lf, derived_columns
 
     def unalias(self: LDF) -> LDF:
         """Un-aliases column names using information from pydantic validation_alias.
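_derive_column now threads the lazyframe through its arguments and return value instead of mutating shared state, returning both the updated frame and the list of derived column names. It backs the public derive() method shown in the surrounding context; a hedged sketch of the derived_from feature it serves (the Order model is illustrative):

    import patito as pt
    import polars as pl

    class Order(pt.Model):
        cents: int
        # derived_from accepts a source column name (str) or a polars expression,
        # matching the two branches handled by _derive_column above.
        dollars: float = pt.Field(derived_from=pl.col("cents") / 100)

    df = pt.DataFrame({"cents": [100, 250]}).set_model(Order).lazy().derive().collect()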
@@ -191,7 +233,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
             return self
         exprs = []
 
-        def to_expr(va: str | AliasPath | AliasChoices) ->
+        def to_expr(va: str | AliasPath | AliasChoices) -> pl.Expr | None:
             if isinstance(va, str):
                 return pl.col(va) if va in self.collect_schema() else None
             elif isinstance(va, AliasPath):
@@ -200,12 +242,12 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
                         f"TODO figure out how this AliasPath behaves ({va})"
                     )
                 return (
-                    pl.col(va.path[0]).list.get(va.path[1], null_on_oob=True)
+                    pl.col(str(va.path[0])).list.get(va.path[1], null_on_oob=True)
                     if va.path[0] in self.collect_schema()
                     else None
                 )
             elif isinstance(va, AliasChoices):
-                local_expr:
+                local_expr: pl.Expr | None = None
                 for choice in va.choices:
                     if (part := to_expr(choice)) is not None:
                         local_expr = (
@@ -235,7 +277,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
         return self.select(exprs)
 
     def cast(
-        self: LDF, strict: bool = False, columns:
+        self: LDF, strict: bool = False, columns: Sequence[str] | None = None
     ) -> LDF:
         """Cast columns to `dtypes` specified by the associated Patito model.
 
@@ -292,9 +334,12 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
         return self.with_columns(exprs)
 
     @classmethod
-    def from_existing(cls:
+    def from_existing(cls: type[LDF], lf: pl.LazyFrame) -> LDF:
         """Construct a patito.DataFrame object from an existing polars.DataFrame object."""
-
+        if getattr(cls, "model", False):
+            return cls.model.LazyFrame._from_pyldf(super().lazy()._ldf)  # type: ignore
+
+        return LazyFrame._from_pyldf(lf._ldf)  # type: ignore
 
 
 class DataFrame(pl.DataFrame, Generic[ModelType]):
@@ -326,31 +371,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     :ref:`Product.validate <DataFrame.validate>`.
     """
 
-    model:
-
-    @classmethod
-    def _construct_dataframe_model_class(
-        cls: Type[DF], model: Type[OtherModelType]
-    ) -> Type[DataFrame[OtherModelType]]:
-        """Return custom DataFrame sub-class where DataFrame.model is set.
-
-        Can be used to construct a DataFrame class where
-        DataFrame.set_model(model) is implicitly invoked at instantiation.
-
-        Args:
-            model: A patito model which should be used to validate the dataframe.
-
-        Returns:
-            A custom DataFrame model class where DataFrame._model has been correctly
-            "hard-coded" to the given model.
-
-        """
-        new_class = type(
-            f"{model.model_json_schema()['title']}DataFrame",
-            (cls,),
-            {"model": model},
-        )
-        return new_class
+    model: type[ModelType]
 
     def lazy(self: DataFrame[ModelType]) -> LazyFrame[ModelType]:
         """Convert DataFrame into LazyFrame.
@@ -361,15 +382,12 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
             A new LazyFrame object.
 
         """
-
-        LazyFrame.
-
-
-        )  # type: ignore
-        ldf = lazyframe_class._from_pyldf(super().lazy()._ldf)
-        return ldf
+        if getattr(self, "model", False):
+            return self.model.LazyFrame._from_pyldf(super().lazy()._ldf)  # type: ignore
+
+        return LazyFrame._from_pyldf(super().lazy()._ldf)  # type: ignore
 
-    def set_model(self, model
+    def set_model(self, model: type[OtherModelType]) -> DataFrame[OtherModelType]:
         """Associate a given patito ``Model`` with the dataframe.
 
         The model schema is used by methods that depend on a model being associated with
@@ -425,11 +443,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
             >>> casted_classes.validate()
 
         """
-
-        return cast(
-            DataFrame[model],
-            cls._from_pydf(self._df),
-        )
+        return model.DataFrame(self._df)
 
     def unalias(self: DF) -> DF:
         """Un-aliases column names using information from pydantic validation_alias.
@@ -445,7 +459,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
         return self.lazy().unalias().collect()
 
     def cast(
-        self: DF, strict: bool = False, columns:
+        self: DF, strict: bool = False, columns: Sequence[str] | None = None
     ) -> DF:
         """Cast columns to `dtypes` specified by the associated Patito model.
 
@@ -489,8 +503,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
 
     def drop(
         self: DF,
-        columns:
-        *more_columns: str,
+        columns: str | Collection[str] | None = None,
    ) -> DF:
         """Drop one or more columns from the dataframe.
 
@@ -502,7 +515,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
             columns: A single column string name, or list of strings, indicating
                 which columns to drop. If not specified, all columns *not*
                 specified by the associated dataframe model will be dropped.
-            more_columns: Additional named columns to drop.
 
         Returns:
             DataFrame[Model]: New dataframe without the specified columns.
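With the variadic *more_columns parameter removed, columns accepts a single name or a collection, and calling drop() with no argument keeps only the columns declared on the model, as the docstring above states. A small sketch of that default behaviour (model and column names made up):

    import patito as pt

    class Product(pt.Model):
        product_id: int

    df = pt.DataFrame({"product_id": [1, 2], "stray": ["a", "b"]}).set_model(Product)
    # With no argument, every column not declared on Product is dropped.
    assert df.drop().columns == ["product_id"]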
@@ -525,27 +537,29 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
 
         """
         if columns is not None:
-
+            # I get a single null row if I try to use super() here, so go via
+            # pl.DataFrame instead.
+            return self._from_pydf(pl.DataFrame(self._df).drop(columns)._df)
         else:
             return self.drop(list(set(self.columns) - set(self.model.columns)))
 
-    def validate(self, columns:
+    def validate(self, columns: Sequence[str] | None = None, **kwargs: Any):
         """Validate the schema and content of the dataframe.
 
         You must invoke ``.set_model()`` before invoking ``.validate()`` in order
         to specify how the dataframe should be validated.
 
         Returns:
-            DataFrame[Model]: The original dataframe, if correctly validated.
+            DataFrame[Model]: The original patito dataframe, if correctly validated.
 
         Raises:
+            patito.exceptions.DataFrameValidationError: If the dataframe does not match the
+                specified schema.
+
             TypeError: If ``DataFrame.set_model()`` has not been invoked prior to
                 validation. Note that ``patito.Model.DataFrame`` automatically invokes
                 ``DataFrame.set_model()`` for you.
 
-            patito.exceptions.DataFrameValidationError: If the dataframe does not match the
-                specified schema.
-
         Examples:
             >>> import patito as pt
 
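validate() gains an optional columns argument and a **kwargs passthrough, and the DataFrameValidationError entry moves ahead of TypeError in the Raises section. The hunk only shows the signature, so the sketch below assumes the natural reading that columns restricts which fields are checked; the default still validates the full schema:

    import patito as pt

    class Product(pt.Model):
        product_id: int = pt.Field(unique=True)
        price: float

    df = pt.DataFrame({"product_id": [1, 2], "price": [10.0, 20.0]}).set_model(Product)
    df.validate()                        # full-schema validation, as before
    df.validate(columns=["product_id"])  # assumed: only the listed columns are checked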
@@ -623,13 +637,12 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
 
     def fill_null(
         self: DF,
-        value:
-        strategy:
-
-
-
-
-        limit: Optional[int] = None,
+        value: Any | None = None,
+        strategy: Literal[
+            "forward", "backward", "min", "max", "mean", "zero", "one", "defaults"
+        ]
+        | None = None,
+        limit: int | None = None,
         matches_supertype: bool = True,
     ) -> DF:
         """Fill null values using a filling strategy, literal, or ``Expr``.
@@ -689,14 +702,13 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
                         pl.lit(default_value, self.model.dtypes[column])
                     )
                     if column in self.columns
-                    else pl.
-                )
-                    # else pl.lit(default_value, self.model.dtypes[column]).alias(column)
+                    else pl.lit(default_value, self.model.dtypes[column]).alias(column)
+                )
                 for column, default_value in self.model.defaults.items()
             ]
-        ).set_model(self.model)
+        ).set_model(self.model)  # type: ignore
 
-    def get(self, predicate:
+    def get(self, predicate: pl.Expr | None = None) -> ModelType:
         """Fetch the single row that matches the given polars predicate.
 
         If you expect a data frame to already consist of one single row,
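fill_null keeps the extra "defaults" strategy, and the previously commented-out else branch is now active, so columns declared on the model but absent from the dataframe are also filled with their declared defaults. A short sketch, assuming a model with a field default:

    import patito as pt

    class Product(pt.Model):
        product_id: int
        quantity: int = 0

    df = (
        pt.DataFrame({"product_id": [1, 2], "quantity": [5, None]})
        .set_model(Product)
        .fill_null(strategy="defaults")
    )
    # Nulls in quantity are replaced by the declared default of 0.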
@@ -778,7 +790,70 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
         else:
             return self._pydantic_model().from_row(row)  # type: ignore
 
-    def
+    def iter_models(
+        self, validate_df: bool = True, validate_model: bool = False
+    ) -> ModelGenerator[ModelType]:
+        """Iterate over all rows in the dataframe as pydantic models.
+
+        Args:
+            validate_df: If set to ``True``, the dataframe will be validated before
+                making models out of each row. If set to ``False``, beware that columns
+                need to be the exact same as the model fields.
+            validate_model: If set to ``True``, each model will be validated when
+                constructing. Disabled by default since df validation should cover this case.
+
+        Yields:
+            Model: A pydantic-derived model representing the given row. .to_list() can be
+                used to convert the iterator to a list.
+
+        Raises:
+            TypeError: If ``DataFrame.set_model()`` has not been invoked prior to
+                iteration.
+
+        Example:
+            >>> import patito as pt
+            >>> import polars as pl
+            >>> class Product(pt.Model):
+            ...     product_id: int = pt.Field(unique=True)
+            ...     price: float
+
+            >>> df = pt.DataFrame({"product_id": [1, 2], "price": [10., 20.]})
+            >>> df = df.set_model(Product)
+            >>> for product in df.iter_models():
+            ...     print(product)
+            ...
+            Product(product_id=1, price=10.0)
+            Product(product_id=2, price=20.0)
+
+        """
+        if not hasattr(self, "model"):
+            raise TypeError(
+                f"You must invoke {self.__class__.__name__}.set_model() "
+                f"before invoking {self.__class__.__name__}.iter_models()."
+            )
+
+        df = self.validate(drop_superfluous_columns=True) if validate_df else self
+
+        def _iter_models_with_validate(
+            _df: DataFrame[ModelType],
+        ) -> Iterator[ModelType]:
+            for row in _df.iter_rows(named=True):
+                yield self.model(**row)
+
+        def _iter_models_without_validate(
+            _df: DataFrame[ModelType],
+        ) -> Iterator[ModelType]:
+            for row in _df.iter_rows(named=True):
+                yield self.model.model_construct(**row)
+
+        _iter_models = (
+            _iter_models_with_validate
+            if validate_model
+            else _iter_models_without_validate
+        )
+        return ModelGenerator(_iter_models(df))
+
+    def _pydantic_model(self) -> type[Model]:
         """Dynamically construct patito model compliant with dataframe.
 
         Returns:
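The new iter_models() is what the ModelGenerator wrapper introduced earlier in this diff exists for: rows come back as instances of the attached model, and the iterator can be materialised with to_list(). Building on the doctest above, a short usage sketch:

    import patito as pt

    class Product(pt.Model):
        product_id: int = pt.Field(unique=True)
        price: float

    df = pt.DataFrame({"product_id": [1, 2], "price": [10.0, 20.0]}).set_model(Product)

    # Iterate lazily, or materialise every row as a Product instance in one go.
    products = df.iter_models().to_list()
    first = products[0]  # Product(product_id=1, price=10.0)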
@@ -790,7 +865,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
 
         pydantic_annotations = {column: (Any, ...) for column in self.columns}
         return cast(
-
+            type[Model],
             create_model(  # type: ignore
                 "UntypedRow",
                 __base__=Model,
@@ -804,7 +879,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
 
     @classmethod
     def read_csv(  # type: ignore[no-untyped-def]
-        cls:
+        cls: type[DF],
         *args,  # noqa: ANN002
         **kwargs,  # noqa: ANN003
     ) -> DF:
@@ -888,15 +963,13 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
     # --- Type annotation overrides ---
     def filter(  # noqa: D102
         self: DF,
-        predicate:
-            pl.Expr, str, pl.Series, list[bool], np.ndarray[Any, Any], bool
-        ],
+        predicate: pl.Expr | str | pl.Series | list[bool] | np.ndarray[Any, Any] | bool,
     ) -> DF:
         return cast(DF, super().filter(predicate))
 
     def select(  # noqa: D102
         self: DF,
-        *exprs:
+        *exprs: IntoExpr | Iterable[IntoExpr],
         **named_exprs: IntoExpr,
     ) -> DF:
         return cast(  # pyright: ignore[redundant-cast]
@@ -905,7 +978,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
 
     def with_columns(  # noqa: D102
         self: DF,
-        *exprs:
+        *exprs: IntoExpr | Iterable[IntoExpr],
         **named_exprs: IntoExpr,
     ) -> DF:
         return cast(DF, super().with_columns(*exprs, **named_exprs))