patito 0.8.0__tar.gz → 0.8.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {patito-0.8.0 → patito-0.8.3}/PKG-INFO +1 -2
- {patito-0.8.0 → patito-0.8.3}/pyproject.toml +3 -2
- {patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/column_info.py +5 -5
- {patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/dtypes/dtypes.py +31 -24
- {patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/dtypes/utils.py +1 -1
- {patito-0.8.0 → patito-0.8.3}/src/patito/polars.py +87 -73
- {patito-0.8.0 → patito-0.8.3}/src/patito/pydantic.py +21 -17
- {patito-0.8.0 → patito-0.8.3}/src/patito/validators.py +1 -0
- {patito-0.8.0 → patito-0.8.3}/LICENSE +0 -0
- {patito-0.8.0 → patito-0.8.3}/README.md +0 -0
- {patito-0.8.0 → patito-0.8.3}/src/patito/__init__.py +0 -0
- {patito-0.8.0 → patito-0.8.3}/src/patito/_docs.py +0 -0
- {patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/__init__.py +0 -0
- {patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/dtypes/__init__.py +0 -0
- {patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/repr.py +0 -0
- {patito-0.8.0 → patito-0.8.3}/src/patito/_pydantic/schema.py +0 -0
- {patito-0.8.0 → patito-0.8.3}/src/patito/exceptions.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: patito
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 0.8.3
|
|
4
4
|
Summary: A dataframe modelling library built on top of polars and pydantic.
|
|
5
5
|
Home-page: https://github.com/JakobGM/patito
|
|
6
6
|
License: MIT
|
|
@@ -20,7 +20,6 @@ Provides-Extra: pandas
|
|
|
20
20
|
Requires-Dist: Sphinx (<7) ; extra == "docs"
|
|
21
21
|
Requires-Dist: pandas ; extra == "pandas"
|
|
22
22
|
Requires-Dist: polars (>=1.10.0)
|
|
23
|
-
Requires-Dist: pre-commit (>=3.8.0,<4.0.0)
|
|
24
23
|
Requires-Dist: pyarrow (>=5.0.0) ; extra == "caching"
|
|
25
24
|
Requires-Dist: pydantic (>=2.7.0)
|
|
26
25
|
Requires-Dist: sphinx-autobuild ; extra == "docs"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "patito"
|
|
3
|
-
version = "0.8.0"
|
|
3
|
+
version = "0.8.3"
|
|
4
4
|
description = "A dataframe modelling library built on top of polars and pydantic."
|
|
5
5
|
authors = ["Jakob Gerhard Martinussen <jakobgm@gmail.com>", "Thomas Aarholt <thomasaarholt@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -25,7 +25,6 @@ sphinx-autobuild = {version = "*", optional = true}
|
|
|
25
25
|
sphinx-autodoc-typehints = {version = "*", optional = true}
|
|
26
26
|
sphinx-toolbox = {version = "*", optional = true}
|
|
27
27
|
sphinxcontrib-mermaid = {version = "*", optional = true}
|
|
28
|
-
pre-commit = "^3.8.0"
|
|
29
28
|
|
|
30
29
|
[tool.poetry.extras]
|
|
31
30
|
# The pyarrow.parquet module is required for writing parquet caches to disk
|
|
@@ -42,6 +41,7 @@ docs = [
|
|
|
42
41
|
|
|
43
42
|
[tool.poetry.group.dev.dependencies]
|
|
44
43
|
ruff = ">=0.2.1"
|
|
44
|
+
pre-commit = "^3.8.0"
|
|
45
45
|
coverage = {version = "*", extras = ["toml"]}
|
|
46
46
|
pyright = ">=1.1.239"
|
|
47
47
|
pytest = ">=7.1.2"
|
|
@@ -133,6 +133,7 @@ extend-exclude= ["tests/__init__.py"]
|
|
|
133
133
|
|
|
134
134
|
[tool.ruff.lint]
|
|
135
135
|
select = ["E4", "E7", "E9", "F", "I", "B", "D", "UP"]
|
|
136
|
+
ignore = ["UP007"]
|
|
136
137
|
|
|
137
138
|
[tool.ruff.lint.pydocstyle]
|
|
138
139
|
convention = "google"
|
|
@@ -97,20 +97,20 @@ class ColumnInfo(BaseModel, arbitrary_types_allowed=True):
|
|
|
97
97
|
|
|
98
98
|
"""
|
|
99
99
|
|
|
100
|
-
allow_missing: Optional[bool] = None
|
|
100
|
+
allow_missing: Optional[bool] = None
|
|
101
101
|
dtype: Annotated[
|
|
102
|
-
Optional[Union[DataTypeClass, DataType]],
|
|
102
|
+
Optional[Union[DataTypeClass, DataType]],
|
|
103
103
|
BeforeValidator(dtype_deserializer),
|
|
104
104
|
] = None
|
|
105
105
|
constraints: Annotated[
|
|
106
|
-
Optional[Union[pl.Expr, list[pl.Expr]]],
|
|
106
|
+
Optional[Union[pl.Expr, list[pl.Expr]]],
|
|
107
107
|
BeforeValidator(expr_deserializer),
|
|
108
108
|
] = None
|
|
109
109
|
derived_from: Annotated[
|
|
110
|
-
Optional[Union[str, pl.Expr]],
|
|
110
|
+
Optional[Union[str, pl.Expr]],
|
|
111
111
|
BeforeValidator(expr_or_col_name_deserializer),
|
|
112
112
|
] = None
|
|
113
|
-
unique: Optional[bool] = None
|
|
113
|
+
unique: Optional[bool] = None
|
|
114
114
|
|
|
115
115
|
def __repr__(self) -> str:
|
|
116
116
|
"""Print only Field attributes whose values are not default (mainly None)."""
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from collections.abc import Mapping
|
|
4
4
|
from functools import cache, reduce
|
|
5
|
-
from operator import
|
|
5
|
+
from operator import or_
|
|
6
6
|
from typing import TYPE_CHECKING, Any
|
|
7
7
|
|
|
8
8
|
import polars as pl
|
|
@@ -115,7 +115,8 @@ def validate_annotation(
|
|
|
115
115
|
class DtypeResolver:
|
|
116
116
|
def __init__(self, annotation: Any | None):
|
|
117
117
|
self.annotation = annotation
|
|
118
|
-
|
|
118
|
+
# mode='serialization' allows nested models with structs, see #86
|
|
119
|
+
self.schema = TypeAdapter(annotation).json_schema(mode="serialization")
|
|
119
120
|
self.defs = self.schema.get("$defs", {})
|
|
120
121
|
|
|
121
122
|
def valid_polars_dtypes(self) -> DataTypeGroup:
|
|
@@ -143,7 +144,7 @@ class DtypeResolver:
|
|
|
143
144
|
valid_type_sets.append(
|
|
144
145
|
self._pydantic_subschema_to_valid_polars_types(schema)
|
|
145
146
|
)
|
|
146
|
-
return reduce(
|
|
147
|
+
return reduce(or_, valid_type_sets) if valid_type_sets else DataTypeGroup([])
|
|
147
148
|
|
|
148
149
|
def _pydantic_subschema_to_valid_polars_types(
|
|
149
150
|
self,
|
|
@@ -159,6 +160,7 @@ class DtypeResolver:
|
|
|
159
160
|
self.defs[props["$ref"].split("/")[-1]]
|
|
160
161
|
)
|
|
161
162
|
return DataTypeGroup([])
|
|
163
|
+
|
|
162
164
|
pyd_type = props.get("type")
|
|
163
165
|
if pyd_type == "array":
|
|
164
166
|
if "items" not in props:
|
|
@@ -169,28 +171,27 @@ class DtypeResolver:
|
|
|
169
171
|
return DataTypeGroup(
|
|
170
172
|
[pl.List(dtype) for dtype in item_dtypes], match_base_type=False
|
|
171
173
|
)
|
|
174
|
+
|
|
172
175
|
elif pyd_type == "object":
|
|
173
176
|
if "properties" not in props:
|
|
174
177
|
return DataTypeGroup([])
|
|
175
178
|
object_props = props["properties"]
|
|
179
|
+
struct_fields: list[pl.Field] = []
|
|
180
|
+
for name, sub_props in object_props.items():
|
|
181
|
+
dtype = self._default_polars_dtype_for_schema(sub_props)
|
|
182
|
+
assert dtype is not None
|
|
183
|
+
struct_fields.append(pl.Field(name, dtype))
|
|
176
184
|
return DataTypeGroup(
|
|
177
|
-
[
|
|
178
|
-
pl.Struct(
|
|
179
|
-
[
|
|
180
|
-
pl.Field(
|
|
181
|
-
name, self._default_polars_dtype_for_schema(sub_props)
|
|
182
|
-
)
|
|
183
|
-
for name, sub_props in object_props.items()
|
|
184
|
-
]
|
|
185
|
-
)
|
|
186
|
-
],
|
|
185
|
+
[pl.Struct(struct_fields)],
|
|
187
186
|
match_base_type=False,
|
|
188
187
|
) # for structs, return only the default dtype set to avoid combinatoric issues
|
|
189
188
|
return _pyd_type_to_valid_dtypes(
|
|
190
189
|
PydanticBaseType(pyd_type), props.get("format"), props.get("enum")
|
|
191
190
|
)
|
|
192
191
|
|
|
193
|
-
def _default_polars_dtype_for_schema(
|
|
192
|
+
def _default_polars_dtype_for_schema(
|
|
193
|
+
self, schema: dict[str, Any]
|
|
194
|
+
) -> DataType | None:
|
|
194
195
|
if "anyOf" in schema:
|
|
195
196
|
if len(schema["anyOf"]) == 2: # look for optionals first
|
|
196
197
|
schema = _without_optional(schema)
|
|
@@ -206,13 +207,14 @@ class DtypeResolver:
|
|
|
206
207
|
|
|
207
208
|
def _pydantic_subschema_to_default_dtype(
|
|
208
209
|
self,
|
|
209
|
-
props: dict,
|
|
210
|
+
props: dict[str, Any],
|
|
210
211
|
) -> DataType | None:
|
|
211
212
|
if "column_info" in props: # user has specified in patito model
|
|
212
213
|
ci = ColumnInfo.model_validate_json(props["column_info"])
|
|
213
214
|
if ci.dtype is not None:
|
|
214
215
|
dtype = ci.dtype() if isinstance(ci.dtype, DataTypeClass) else ci.dtype
|
|
215
216
|
return dtype
|
|
217
|
+
|
|
216
218
|
if "type" not in props:
|
|
217
219
|
if "enum" in props:
|
|
218
220
|
raise TypeError("Mixed type enums not supported by patito.")
|
|
@@ -223,10 +225,12 @@ class DtypeResolver:
|
|
|
223
225
|
self.defs[props["$ref"].split("/")[-1]]
|
|
224
226
|
)
|
|
225
227
|
return None
|
|
228
|
+
|
|
226
229
|
pyd_type = props.get("type")
|
|
227
230
|
if pyd_type == "numeric":
|
|
228
231
|
pyd_type = "number"
|
|
229
|
-
|
|
232
|
+
|
|
233
|
+
elif pyd_type == "array":
|
|
230
234
|
if "items" not in props:
|
|
231
235
|
raise NotImplementedError(
|
|
232
236
|
"Unexpected error processing pydantic schema. Please file an issue."
|
|
@@ -236,18 +240,21 @@ class DtypeResolver:
|
|
|
236
240
|
if inner_default_type is None:
|
|
237
241
|
return None
|
|
238
242
|
return pl.List(inner_default_type)
|
|
239
|
-
|
|
243
|
+
|
|
244
|
+
elif pyd_type == "object": # these are structs
|
|
240
245
|
if "properties" not in props:
|
|
241
246
|
raise NotImplementedError(
|
|
242
247
|
"dictionaries not currently supported by patito"
|
|
243
248
|
)
|
|
244
|
-
object_props = props["properties"]
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
249
|
+
object_props: dict[str, dict[str, str]] = props["properties"]
|
|
250
|
+
struct_fields: list[pl.Field] = []
|
|
251
|
+
|
|
252
|
+
for name, sub_props in object_props.items():
|
|
253
|
+
dtype = self._default_polars_dtype_for_schema(sub_props)
|
|
254
|
+
assert dtype is not None
|
|
255
|
+
struct_fields.append(pl.Field(name, dtype))
|
|
256
|
+
return pl.Struct(struct_fields)
|
|
257
|
+
|
|
251
258
|
return _pyd_type_to_default_dtype(
|
|
252
259
|
PydanticBaseType(pyd_type), props.get("format"), props.get("enum")
|
|
253
260
|
)
|
|
@@ -124,7 +124,7 @@ def _pyd_type_to_valid_dtypes(
|
|
|
124
124
|
_validate_enum_values(pyd_type, enum)
|
|
125
125
|
return DataTypeGroup([pl.Enum(enum), pl.String], match_base_type=False)
|
|
126
126
|
if pyd_type.value == "integer":
|
|
127
|
-
return DataTypeGroup(INTEGER_DTYPES
|
|
127
|
+
return DataTypeGroup(INTEGER_DTYPES)
|
|
128
128
|
elif pyd_type.value == "number":
|
|
129
129
|
return (
|
|
130
130
|
FLOAT_DTYPES
|
|
@@ -54,33 +54,63 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
54
54
|
|
|
55
55
|
model: type[ModelType]
|
|
56
56
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
cls: type[LDF], model: type[ModelType] | None
|
|
60
|
-
) -> type[LazyFrame[ModelType]]:
|
|
61
|
-
"""Return custom LazyFrame sub-class where LazyFrame.model is set.
|
|
57
|
+
def set_model(self, model: type[OtherModelType]) -> LazyFrame[OtherModelType]:
|
|
58
|
+
"""Associate a given patito ``Model`` with the dataframe.
|
|
62
59
|
|
|
63
|
-
|
|
64
|
-
|
|
60
|
+
The model schema is used by methods that depend on a model being associated with
|
|
61
|
+
the given dataframe such as :ref:`DataFrame.validate() <DataFrame.validate>`
|
|
62
|
+
and :ref:`DataFrame.get() <DataFrame.get>`.
|
|
63
|
+
|
|
64
|
+
``DataFrame(...).set_model(Model)`` is equivalent with ``Model.DataFrame(...)``.
|
|
65
65
|
|
|
66
66
|
Args:
|
|
67
|
-
model:
|
|
68
|
-
|
|
67
|
+
model (Model): Sub-class of ``patito.Model`` declaring the schema of the
|
|
68
|
+
dataframe.
|
|
69
69
|
|
|
70
70
|
Returns:
|
|
71
|
-
|
|
72
|
-
|
|
71
|
+
DataFrame[Model]: Returns the same dataframe, but with an attached model
|
|
72
|
+
that is required for certain model-specific dataframe methods to work.
|
|
73
73
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
74
|
+
Examples:
|
|
75
|
+
>>> from typing_extensions import Literal
|
|
76
|
+
>>> import patito as pt
|
|
77
|
+
>>> import polars as pl
|
|
78
|
+
>>> class SchoolClass(pt.Model):
|
|
79
|
+
... year: int = pt.Field(dtype=pl.UInt16)
|
|
80
|
+
... letter: Literal["A", "B"] = pt.Field(dtype=pl.Categorical)
|
|
81
|
+
...
|
|
82
|
+
>>> classes = pt.DataFrame(
|
|
83
|
+
... {"year": [1, 1, 2, 2], "letter": list("ABAB")}
|
|
84
|
+
... ).set_model(SchoolClass)
|
|
85
|
+
>>> classes
|
|
86
|
+
shape: (4, 2)
|
|
87
|
+
┌──────┬────────┐
|
|
88
|
+
│ year ┆ letter │
|
|
89
|
+
│ --- ┆ --- │
|
|
90
|
+
│ i64 ┆ str │
|
|
91
|
+
╞══════╪════════╡
|
|
92
|
+
│ 1 ┆ A │
|
|
93
|
+
│ 1 ┆ B │
|
|
94
|
+
│ 2 ┆ A │
|
|
95
|
+
│ 2 ┆ B │
|
|
96
|
+
└──────┴────────┘
|
|
97
|
+
>>> casted_classes = classes.cast()
|
|
98
|
+
>>> casted_classes
|
|
99
|
+
shape: (4, 2)
|
|
100
|
+
┌──────┬────────┐
|
|
101
|
+
│ year ┆ letter │
|
|
102
|
+
│ --- ┆ --- │
|
|
103
|
+
│ u16 ┆ cat │
|
|
104
|
+
╞══════╪════════╡
|
|
105
|
+
│ 1 ┆ A │
|
|
106
|
+
│ 1 ┆ B │
|
|
107
|
+
│ 2 ┆ A │
|
|
108
|
+
│ 2 ┆ B │
|
|
109
|
+
└──────┴────────┘
|
|
110
|
+
>>> casted_classes.validate()
|
|
77
111
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
(cls,),
|
|
81
|
-
{"model": model},
|
|
82
|
-
)
|
|
83
|
-
return new_class
|
|
112
|
+
"""
|
|
113
|
+
return model.LazyFrame._from_pyldf(self._ldf) # type: ignore
|
|
84
114
|
|
|
85
115
|
def collect(
|
|
86
116
|
self,
|
|
@@ -93,12 +123,11 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
93
123
|
parameters.
|
|
94
124
|
"""
|
|
95
125
|
background = kwargs.pop("background", False)
|
|
96
|
-
df = super().collect(*args, background=background, **kwargs)
|
|
126
|
+
df: pl.DataFrame = super().collect(*args, background=background, **kwargs)
|
|
127
|
+
df = DataFrame(df)
|
|
97
128
|
if getattr(self, "model", False):
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
cls = DataFrame
|
|
101
|
-
return cls._from_pydf(df._df)
|
|
129
|
+
df = df.set_model(self.model)
|
|
130
|
+
return df
|
|
102
131
|
|
|
103
132
|
def derive(self: LDF, columns: list[str] | None = None) -> LDF:
|
|
104
133
|
"""Populate columns which have ``pt.Field(derived_from=...)`` definitions.
|
|
@@ -213,7 +242,7 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
213
242
|
f"TODO figure out how this AliasPath behaves ({va})"
|
|
214
243
|
)
|
|
215
244
|
return (
|
|
216
|
-
pl.col(va.path[0]).list.get(va.path[1], null_on_oob=True)
|
|
245
|
+
pl.col(str(va.path[0])).list.get(va.path[1], null_on_oob=True)
|
|
217
246
|
if va.path[0] in self.collect_schema()
|
|
218
247
|
else None
|
|
219
248
|
)
|
|
@@ -307,7 +336,10 @@ class LazyFrame(pl.LazyFrame, Generic[ModelType]):
|
|
|
307
336
|
@classmethod
|
|
308
337
|
def from_existing(cls: type[LDF], lf: pl.LazyFrame) -> LDF:
|
|
309
338
|
"""Construct a patito.DataFrame object from an existing polars.DataFrame object."""
|
|
310
|
-
|
|
339
|
+
if getattr(cls, "model", False):
|
|
340
|
+
return cls.model.LazyFrame._from_pyldf(super().lazy()._ldf) # type: ignore
|
|
341
|
+
|
|
342
|
+
return LazyFrame._from_pyldf(lf._ldf) # type: ignore
|
|
311
343
|
|
|
312
344
|
|
|
313
345
|
class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
@@ -341,30 +373,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
341
373
|
|
|
342
374
|
model: type[ModelType]
|
|
343
375
|
|
|
344
|
-
@classmethod
|
|
345
|
-
def _construct_dataframe_model_class(
|
|
346
|
-
cls: type[DF], model: type[OtherModelType]
|
|
347
|
-
) -> type[DataFrame[OtherModelType]]:
|
|
348
|
-
"""Return custom DataFrame sub-class where DataFrame.model is set.
|
|
349
|
-
|
|
350
|
-
Can be used to construct a DataFrame class where
|
|
351
|
-
DataFrame.set_model(model) is implicitly invoked at instantiation.
|
|
352
|
-
|
|
353
|
-
Args:
|
|
354
|
-
model: A patito model which should be used to validate the dataframe.
|
|
355
|
-
|
|
356
|
-
Returns:
|
|
357
|
-
A custom DataFrame model class where DataFrame._model has been correctly
|
|
358
|
-
"hard-coded" to the given model.
|
|
359
|
-
|
|
360
|
-
"""
|
|
361
|
-
new_class = type(
|
|
362
|
-
f"{model.model_json_schema()['title']}DataFrame",
|
|
363
|
-
(cls,),
|
|
364
|
-
{"model": model},
|
|
365
|
-
)
|
|
366
|
-
return new_class
|
|
367
|
-
|
|
368
376
|
def lazy(self: DataFrame[ModelType]) -> LazyFrame[ModelType]:
|
|
369
377
|
"""Convert DataFrame into LazyFrame.
|
|
370
378
|
|
|
@@ -374,15 +382,12 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
374
382
|
A new LazyFrame object.
|
|
375
383
|
|
|
376
384
|
"""
|
|
377
|
-
|
|
378
|
-
LazyFrame.
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
) # type: ignore
|
|
382
|
-
ldf = lazyframe_class._from_pyldf(super().lazy()._ldf)
|
|
383
|
-
return ldf
|
|
385
|
+
if getattr(self, "model", False):
|
|
386
|
+
return self.model.LazyFrame._from_pyldf(super().lazy()._ldf) # type: ignore
|
|
387
|
+
|
|
388
|
+
return LazyFrame._from_pyldf(super().lazy()._ldf) # type: ignore
|
|
384
389
|
|
|
385
|
-
def set_model(self, model
|
|
390
|
+
def set_model(self, model: type[OtherModelType]) -> DataFrame[OtherModelType]:
|
|
386
391
|
"""Associate a given patito ``Model`` with the dataframe.
|
|
387
392
|
|
|
388
393
|
The model schema is used by methods that depend on a model being associated with
|
|
@@ -438,11 +443,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
438
443
|
>>> casted_classes.validate()
|
|
439
444
|
|
|
440
445
|
"""
|
|
441
|
-
|
|
442
|
-
return cast(
|
|
443
|
-
DataFrame[model],
|
|
444
|
-
cls._from_pydf(self._df),
|
|
445
|
-
)
|
|
446
|
+
return model.DataFrame(self._df)
|
|
446
447
|
|
|
447
448
|
def unalias(self: DF) -> DF:
|
|
448
449
|
"""Un-aliases column names using information from pydantic validation_alias.
|
|
@@ -503,7 +504,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
503
504
|
def drop(
|
|
504
505
|
self: DF,
|
|
505
506
|
columns: str | Collection[str] | None = None,
|
|
506
|
-
*more_columns: str,
|
|
507
507
|
) -> DF:
|
|
508
508
|
"""Drop one or more columns from the dataframe.
|
|
509
509
|
|
|
@@ -515,7 +515,6 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
515
515
|
columns: A single column string name, or list of strings, indicating
|
|
516
516
|
which columns to drop. If not specified, all columns *not*
|
|
517
517
|
specified by the associated dataframe model will be dropped.
|
|
518
|
-
more_columns: Additional named columns to drop.
|
|
519
518
|
|
|
520
519
|
Returns:
|
|
521
520
|
DataFrame[Model]: New dataframe without the specified columns.
|
|
@@ -538,7 +537,9 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
538
537
|
|
|
539
538
|
"""
|
|
540
539
|
if columns is not None:
|
|
541
|
-
|
|
540
|
+
# I get a single null row if I try to use super() here, so go via
|
|
541
|
+
# pl.DataFrame instead.
|
|
542
|
+
return self._from_pydf(pl.DataFrame(self._df).drop(columns)._df)
|
|
542
543
|
else:
|
|
543
544
|
return self.drop(list(set(self.columns) - set(self.model.columns)))
|
|
544
545
|
|
|
@@ -705,7 +706,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
705
706
|
)
|
|
706
707
|
for column, default_value in self.model.defaults.items()
|
|
707
708
|
]
|
|
708
|
-
).set_model(self.model)
|
|
709
|
+
).set_model(self.model) # type: ignore
|
|
709
710
|
|
|
710
711
|
def get(self, predicate: pl.Expr | None = None) -> ModelType:
|
|
711
712
|
"""Fetch the single row that matches the given polars predicate.
|
|
@@ -815,7 +816,7 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
815
816
|
>>> class Product(pt.Model):
|
|
816
817
|
... product_id: int = pt.Field(unique=True)
|
|
817
818
|
... price: float
|
|
818
|
-
|
|
819
|
+
|
|
819
820
|
>>> df = pt.DataFrame({"product_id": [1, 2], "price": [10., 20.]})
|
|
820
821
|
>>> df = df.set_model(Product)
|
|
821
822
|
>>> for product in df.iter_models():
|
|
@@ -833,10 +834,23 @@ class DataFrame(pl.DataFrame, Generic[ModelType]):
|
|
|
833
834
|
|
|
834
835
|
df = self.validate(drop_superfluous_columns=True) if validate_df else self
|
|
835
836
|
|
|
836
|
-
def
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
837
|
+
def _iter_models_with_validate(
|
|
838
|
+
_df: DataFrame[ModelType],
|
|
839
|
+
) -> Iterator[ModelType]:
|
|
840
|
+
for row in _df.iter_rows(named=True):
|
|
841
|
+
yield self.model(**row)
|
|
842
|
+
|
|
843
|
+
def _iter_models_without_validate(
|
|
844
|
+
_df: DataFrame[ModelType],
|
|
845
|
+
) -> Iterator[ModelType]:
|
|
846
|
+
for row in _df.iter_rows(named=True):
|
|
847
|
+
yield self.model.model_construct(**row)
|
|
848
|
+
|
|
849
|
+
_iter_models = (
|
|
850
|
+
_iter_models_with_validate
|
|
851
|
+
if validate_model
|
|
852
|
+
else _iter_models_without_validate
|
|
853
|
+
)
|
|
840
854
|
return ModelGenerator(_iter_models(df))
|
|
841
855
|
|
|
842
856
|
def _pydantic_model(self) -> type[Model]:
|
|
@@ -76,27 +76,31 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
76
76
|
|
|
77
77
|
"""
|
|
78
78
|
super().__init__(name, bases, clsdict, **kwargs)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
model
|
|
79
|
+
NewDataFrame = type(
|
|
80
|
+
f"{cls.__name__}DataFrame",
|
|
81
|
+
(DataFrame,),
|
|
82
|
+
{"model": cls},
|
|
83
83
|
)
|
|
84
|
-
#
|
|
85
|
-
|
|
86
|
-
|
|
84
|
+
cls.DataFrame: type[DataFrame[cls]] = NewDataFrame # type: ignore
|
|
85
|
+
|
|
86
|
+
NewLazyFrame = type(
|
|
87
|
+
f"{cls.__name__}LazyFrame",
|
|
88
|
+
(LazyFrame,),
|
|
89
|
+
{"model": cls},
|
|
87
90
|
)
|
|
91
|
+
cls.LazyFrame: type[LazyFrame[cls]] = NewLazyFrame # type: ignore
|
|
88
92
|
|
|
89
93
|
def __hash__(self) -> int:
|
|
90
94
|
"""Return hash of the model class."""
|
|
91
95
|
return super().__hash__()
|
|
92
96
|
|
|
93
97
|
@property
|
|
94
|
-
def column_infos(cls: type[
|
|
98
|
+
def column_infos(cls: type[Model]) -> Mapping[str, ColumnInfo]:
|
|
95
99
|
"""Return column information for the model."""
|
|
96
100
|
return column_infos_for_model(cls)
|
|
97
101
|
|
|
98
102
|
@property
|
|
99
|
-
def model_schema(cls: type[
|
|
103
|
+
def model_schema(cls: type[Model]) -> Mapping[str, Mapping[str, Any]]:
|
|
100
104
|
"""Return schema properties where definition references have been resolved.
|
|
101
105
|
|
|
102
106
|
Returns:
|
|
@@ -112,7 +116,7 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
112
116
|
return schema_for_model(cls)
|
|
113
117
|
|
|
114
118
|
@property
|
|
115
|
-
def columns(cls: type[
|
|
119
|
+
def columns(cls: type[Model]) -> list[str]:
|
|
116
120
|
"""Return the name of the dataframe columns specified by the fields of the model.
|
|
117
121
|
|
|
118
122
|
Returns:
|
|
@@ -131,7 +135,7 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
131
135
|
return list(cls.model_fields.keys())
|
|
132
136
|
|
|
133
137
|
@property
|
|
134
|
-
def dtypes(cls: type[
|
|
138
|
+
def dtypes(cls: type[Model]) -> dict[str, DataTypeClass | DataType]:
|
|
135
139
|
"""Return the polars dtypes of the dataframe.
|
|
136
140
|
|
|
137
141
|
Unless Field(dtype=...) is specified, the highest signed column dtype
|
|
@@ -155,7 +159,7 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
155
159
|
|
|
156
160
|
@property
|
|
157
161
|
def valid_dtypes(
|
|
158
|
-
cls: type[
|
|
162
|
+
cls: type[Model],
|
|
159
163
|
) -> Mapping[str, frozenset[DataTypeClass | DataType]]:
|
|
160
164
|
"""Return a list of polars dtypes which Patito considers valid for each field.
|
|
161
165
|
|
|
@@ -172,7 +176,7 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
172
176
|
return valid_dtypes_for_model(cls)
|
|
173
177
|
|
|
174
178
|
@property
|
|
175
|
-
def defaults(cls: type[
|
|
179
|
+
def defaults(cls: type[Model]) -> dict[str, Any]:
|
|
176
180
|
"""Return default field values specified on the model.
|
|
177
181
|
|
|
178
182
|
Returns:
|
|
@@ -197,7 +201,7 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
197
201
|
}
|
|
198
202
|
|
|
199
203
|
@property
|
|
200
|
-
def non_nullable_columns(cls: type[
|
|
204
|
+
def non_nullable_columns(cls: type[Model]) -> set[str]:
|
|
201
205
|
"""Return names of those columns that are non-nullable in the schema.
|
|
202
206
|
|
|
203
207
|
Returns:
|
|
@@ -226,7 +230,7 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
226
230
|
)
|
|
227
231
|
|
|
228
232
|
@property
|
|
229
|
-
def nullable_columns(cls: type[
|
|
233
|
+
def nullable_columns(cls: type[Model]) -> set[str]:
|
|
230
234
|
"""Return names of those columns that are nullable in the schema.
|
|
231
235
|
|
|
232
236
|
Returns:
|
|
@@ -248,7 +252,7 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
248
252
|
return set(cls.columns) - cls.non_nullable_columns
|
|
249
253
|
|
|
250
254
|
@property
|
|
251
|
-
def unique_columns(cls: type[
|
|
255
|
+
def unique_columns(cls: type[Model]) -> set[str]:
|
|
252
256
|
"""Return columns with uniqueness constraint.
|
|
253
257
|
|
|
254
258
|
Returns:
|
|
@@ -271,7 +275,7 @@ class ModelMetaclass(PydanticModelMetaclass):
|
|
|
271
275
|
return {column for column in cls.columns if infos[column].unique}
|
|
272
276
|
|
|
273
277
|
@property
|
|
274
|
-
def derived_columns(cls: type[
|
|
278
|
+
def derived_columns(cls: type[Model]) -> set[str]:
|
|
275
279
|
"""Return set of columns which are derived from other columns."""
|
|
276
280
|
infos = cls.column_infos
|
|
277
281
|
return {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|