patito-0.6.2-py3-none-any.whl → patito-0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
patito/validators.py CHANGED
@@ -2,14 +2,10 @@
 
 from __future__ import annotations
 
+from collections.abc import Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
-    Optional,
-    Sequence,
-    Type,
-    Union,
-    _UnionGenericAlias,
     cast,
 )
 
@@ -18,6 +14,7 @@ from pydantic.aliases import AliasGenerator
 from typing_extensions import get_args
 
 from patito._pydantic.dtypes import is_optional
+from patito._pydantic.dtypes.utils import unwrap_optional
 from patito.exceptions import (
     ColumnDTypeError,
     DataFrameValidationError,
@@ -57,29 +54,6 @@ VALID_POLARS_TYPES = {
 }
 
 
-def _dewrap_optional(type_annotation: Type[Any] | Any) -> Type:
-    """Return the inner, wrapped type of an Optional.
-
-    Is a no-op for non-Optional types.
-
-    Args:
-        type_annotation: The type annotation to be dewrapped.
-
-    Returns:
-        The input type, but with the outermost Optional removed.
-
-    """
-    return (
-        next(  # pragma: no cover
-            valid_type
-            for valid_type in get_args(type_annotation)
-            if valid_type is not type(None)  # noqa: E721
-        )
-        if is_optional(type_annotation)
-        else type_annotation
-    )
-
-
 def _transform_df(dataframe: pl.DataFrame, schema: type[Model]) -> pl.DataFrame:
     """Transform any properties of the dataframe according to the model.
 
@@ -109,8 +83,8 @@ def _transform_df(dataframe: pl.DataFrame, schema: type[Model]) -> pl.DataFrame:
 
 def _find_errors(  # noqa: C901
     dataframe: pl.DataFrame,
-    schema: Type[Model],
-    columns: Optional[Sequence[str]] = None,
+    schema: type[Model],
+    columns: Sequence[str] | None = None,
     allow_missing_columns: bool = False,
     allow_superfluous_columns: bool = False,
 ) -> list[ErrorWrapper]:
@@ -144,6 +118,10 @@ def _find_errors(  # noqa: C901
     if not allow_missing_columns:
         # Check if any columns are missing
         for missing_column in set(schema_subset) - set(dataframe.columns):
+            col_info = schema.column_infos.get(missing_column)
+            if col_info is not None and col_info.allow_missing:
+                continue
+
             errors.append(
                 ErrorWrapper(
                     MissingColumnsError("Missing column"),
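
The new `allow_missing` guard lets a model exempt individual columns from the missing-column check. A minimal usage sketch, assuming the 0.8.0 `pt.Field(allow_missing=True)` option is what populates the `ColumnInfo.allow_missing` flag consulted above:

import patito as pt
import polars as pl

class Product(pt.Model):
    name: str
    # Assumed 0.8.0 field option feeding ColumnInfo.allow_missing:
    stock: int = pt.Field(allow_missing=True)

# Passes even though the "stock" column is absent from the dataframe:
Product.validate(pl.DataFrame({"name": ["apple"]}))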
@@ -185,7 +163,7 @@ def _find_errors(  # noqa: C901
 
         # Retrieve the annotation of the list itself,
         # dewrapping any potential Optional[...]
-        list_type = _dewrap_optional(annotation)
+        list_type = unwrap_optional(annotation)
 
         # Check if the list items themselves should be considered nullable
         item_type = get_args(list_type)[0]
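
`unwrap_optional`, imported from `patito._pydantic.dtypes.utils`, takes over for the deleted `_dewrap_optional`. Assuming it keeps the same contract as the removed helper (strip one outer Optional, no-op otherwise), its behavior would be:

from typing import Optional

from patito._pydantic.dtypes.utils import unwrap_optional

# Mirrors the removed docstring: the outermost Optional is stripped ...
assert unwrap_optional(Optional[list[int]]) == list[int]
# ... and non-Optional types pass through unchanged.
assert unwrap_optional(int) is int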
@@ -197,6 +175,8 @@ def _find_errors(  # noqa: C901
             .select(column)
             # Remove those rows that do not contain lists at all
             .filter(pl.col(column).is_not_null())
+            # Remove empty lists
+            .filter(pl.col(column).list.len() > 0)
             # Convert lists of N items to N individual rows
             .explode(column)
             # Calculate how many nulls are present in lists
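
The added length filter matters because Polars explodes an empty list into a single null row, which the null count below would otherwise misread as a null list item. A small demonstration of the behavior being guarded against:

import polars as pl

df = pl.DataFrame({"a": [[], [1, 2]]})
# The empty list becomes one null row after explode, even though it
# contains no null items:
print(df.explode("a"))  # column "a" holds: null, 1, 2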
@@ -220,24 +200,31 @@ def _find_errors(  # noqa: C901
     valid_dtypes = schema.valid_dtypes
     dataframe_datatypes = dict(zip(dataframe.columns, dataframe.dtypes))
     for column_name, column_properties in schema._schema_properties().items():
+        # We rename to _tmp here to avoid overwriting the dataframe during filters below
+        # TODO! Really we should be passing *Series* around rather than the entire dataframe
+        dataframe_tmp = dataframe
         column_info = schema.column_infos[column_name]
-        if column_name not in dataframe.columns or column_name not in column_subset:
+        if column_name not in dataframe_tmp.columns or column_name not in column_subset:
             continue
 
         polars_type = dataframe_datatypes[column_name]
-        if polars_type not in valid_dtypes[column_name]:
-            errors.append(
-                ErrorWrapper(
-                    ColumnDTypeError(
-                        f"Polars dtype {polars_type} does not match model field type."
-                    ),
-                    loc=column_name,
+        if polars_type not in [
+            pl.Struct,
+            pl.List(pl.Struct),
+        ]:  # defer struct validation for recursive call to _find_errors later
+            if polars_type not in valid_dtypes[column_name]:
+                errors.append(
+                    ErrorWrapper(
+                        ColumnDTypeError(
+                            f"Polars dtype {polars_type} does not match model field type."
+                        ),
+                        loc=column_name,
+                    )
                 )
-            )
 
         # Test for when only specific values are accepted
         e = _find_enum_errors(
-            df=dataframe,
+            df=dataframe_tmp,
             column_name=column_name,
             props=column_properties,
             schema=schema,
@@ -247,7 +234,7 @@ def _find_errors(  # noqa: C901
 
         if column_info.unique:
             # Coalescing to 0 in the case of dataframe of height 0
-            num_duplicated = dataframe[column_name].is_duplicated().sum() or 0
+            num_duplicated = dataframe_tmp[column_name].is_duplicated().sum() or 0
             if num_duplicated > 0:
                 errors.append(
                     ErrorWrapper(
@@ -259,19 +246,31 @@
         # Intercept struct columns, and process errors separately
         if schema.dtypes[column_name] == pl.Struct:
             nested_schema = schema.model_fields[column_name].annotation
-
+            assert nested_schema is not None
             # Additional unpack required if structs column is optional
-            if type(nested_schema) == _UnionGenericAlias:
-                nested_schema = nested_schema.__args__[0]
-
-            # We need to filter out any null rows as the submodel won't know
-            # that all of a row's columns may be null
-            dataframe = dataframe.filter(pl.col(column_name).is_not_null())
-            if dataframe.is_empty():
+            if is_optional(nested_schema):
+                nested_schema = unwrap_optional(nested_schema)
+
+            # An optional struct means that we allow the struct entry to be
+            # null. It is the inner model that is responsible for determining
+            # whether its fields are optional or not. Since the struct is optional,
+            # we need to filter out any null rows as the inner model may disallow
+            # nulls on a particular field
+
+            # NB As of Polars 1.1, struct_col.is_null() cannot return True
+            # The following code has been added to accomodate this
+
+            struct_fields = dataframe_tmp[column_name].struct.fields
+            col_struct = pl.col(column_name).struct
+            only_non_null_expr = ~pl.all_horizontal(
+                [col_struct.field(name).is_null() for name in struct_fields]
+            )
+            dataframe_tmp = dataframe_tmp.filter(only_non_null_expr)
+            if dataframe_tmp.is_empty():
                 continue
 
             struct_errors = _find_errors(
-                dataframe=dataframe.select(column_name).unnest(column_name),
+                dataframe=dataframe_tmp.select(column_name).unnest(column_name),
                 schema=nested_schema,
             )
 
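A standalone sketch of the workaround adopted above: since row-level `is_null()` stopped reporting null structs (per the code comment), a struct row is treated as null exactly when all of its fields are null, recovered with `pl.all_horizontal`:

import polars as pl

df = pl.DataFrame({"s": [{"x": 1, "y": 2}, None]})
fields = df["s"].struct.fields
all_null = pl.all_horizontal(
    pl.col("s").struct.field(name).is_null() for name in fields
)
# Keeps only the rows where the struct value is actually present:
print(df.filter(~all_null))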
@@ -286,22 +285,36 @@
 
         # Intercept list of structs columns, and process errors separately
         elif schema.dtypes[column_name] == pl.List(pl.Struct):
-            nested_schema = schema.model_fields[column_name].annotation.__args__[0]
+            list_annotation = schema.model_fields[column_name].annotation
+            assert list_annotation is not None
 
-            # Additional unpack required if structs column is optional
-            if type(nested_schema) == _UnionGenericAlias:
-                nested_schema = nested_schema.__args__[0]
+            # Handle Optional[list[pl.Struct]]
+            if is_optional(list_annotation):
+                list_annotation = unwrap_optional(list_annotation)
 
-            # We need to filter out any null rows as the submodel won't know
-            # that all of a row's columns may be null
-            dataframe = dataframe.filter(pl.col(column_name).is_not_null())
-            if dataframe.is_empty():
+            dataframe_tmp = dataframe_tmp.filter(pl.col(column_name).is_not_null())
+            if dataframe_tmp.is_empty():
                 continue
 
-            list_struct_errors = _find_errors(
-                dataframe=dataframe.select(column_name)
+            # Unpack list schema
+            nested_schema = list_annotation.__args__[0]
+
+            dataframe_tmp = (
+                dataframe_tmp.select(column_name)
                 .explode(column_name)
-                .unnest(column_name),
+                .unnest(column_name)
+            )
+
+            # Handle list[Optional[pl.Struct]]
+            if is_optional(nested_schema):
+                nested_schema = unwrap_optional(nested_schema)
+
+                dataframe_tmp = dataframe_tmp.filter(pl.all().is_not_null())
+                if dataframe_tmp.is_empty():
+                    continue
+
+            list_struct_errors = _find_errors(
+                dataframe=dataframe_tmp,
                 schema=nested_schema,
             )
 
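The rewritten branch distinguishes `Optional[list[Struct]]` (null entries dropped before exploding) from `list[Optional[Struct]]` (null items dropped after exploding). A hedged sketch of a model exercising both shapes; `Point` and `Path` are illustrative names, and whether this exact nesting validates depends on patito's dtype resolution:

from typing import Optional

import patito as pt
import polars as pl

class Point(pt.Model):
    x: int

class Path(pt.Model):
    # Outer Optional: the whole list may be null.
    # Inner Optional: individual list items may be null.
    points: Optional[list[Optional[Point]]] = None

# A null list, a null item, and a concrete struct should all pass:
Path.validate(pl.DataFrame({"points": [None, [None, {"x": 1}]]}))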
@@ -344,7 +357,7 @@ def _find_errors(  # noqa: C901
         if checks:
             n_invalid_rows = 0
             for check in checks:
-                lazy_df = dataframe.lazy()
+                lazy_df = dataframe_tmp.lazy()
                 lazy_df = lazy_df.filter(
                     ~check
                 )  # get failing rows (nulls will evaluate to null on boolean check, we only want failures (false)))
@@ -370,11 +383,11 @@ def _find_errors(  # noqa: C901
             )
             if "_" in constraints.meta.root_names():
                 # An underscore is an alias for the current field
-                illegal_rows = dataframe.with_columns(
+                illegal_rows = dataframe_tmp.with_columns(
                     pl.col(column_name).alias("_")
                 ).filter(constraints)
             else:
-                illegal_rows = dataframe.filter(constraints)
+                illegal_rows = dataframe_tmp.filter(constraints)
             if illegal_rows.height > 0:
                 errors.append(
                     ErrorWrapper(
@@ -391,11 +404,23 @@ def _find_errors(  # noqa: C901
 
 
 def _find_enum_errors(
-    df: pl.DataFrame, column_name: str, props: dict[str, Any], schema: Type[Model]
+    df: pl.DataFrame, column_name: str, props: dict[str, Any], schema: type[Model]
 ) -> ErrorWrapper | None:
     if "enum" not in props:
         if "items" in props and "enum" in props["items"]:
             return _find_enum_errors(df, column_name, props["items"], schema)
+        for item in props.get("anyOf", []):
+            if "enum" in item:
+                return _find_enum_errors(df, column_name, item, schema)
+            if (
+                "$ref" in item
+            ):  # If the item is a reference to another definition pass it as the properties
+                return _find_enum_errors(
+                    df,
+                    column_name,
+                    schema.model_json_schema()["$defs"][item["$ref"]],
+                    schema,
+                )
         return None
     permissible_values = set(props["enum"])
     if column_name in schema.nullable_columns:
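
The new `anyOf` walk accounts for how pydantic v2 serializes optional enums: the permissible values live behind a `$ref` into `$defs` rather than directly on the property. For example:

from enum import Enum
from typing import Optional

from pydantic import BaseModel

class Color(Enum):
    RED = "red"
    BLUE = "blue"

class Example(BaseModel):
    color: Optional[Color] = None

# No "enum" key on the property itself; it is reached via anyOf -> $ref:
print(Example.model_json_schema()["properties"]["color"])
# {'anyOf': [{'$ref': '#/$defs/Color'}, {'type': 'null'}], 'default': None}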
@@ -414,12 +439,13 @@ def _find_enum_errors(
 
 
 def validate(
-    dataframe: Union["pd.DataFrame", pl.DataFrame],
-    schema: Type[Model],
-    columns: Optional[Sequence[str]] = None,
+    dataframe: pd.DataFrame | pl.DataFrame,
+    schema: type[Model],
+    columns: Sequence[str] | None = None,
     allow_missing_columns: bool = False,
     allow_superfluous_columns: bool = False,
-) -> None:
+    drop_superfluous_columns: bool = False,
+) -> pl.DataFrame:
     """Validate the given dataframe.
 
     Args:
@@ -429,17 +455,29 @@ def validate(
             of the dataframe will be validated.
         allow_missing_columns: If True, missing columns will not be considered an error.
         allow_superfluous_columns: If True, additional columns will not be considered an error.
+        drop_superfluous_columns: If True, drop any columns not specified in the schema before validation.
 
     Raises:
         DataFrameValidationError: If the given dataframe does not match the given schema.
 
     """
+    if drop_superfluous_columns and columns:
+        raise ValueError(
+            "Cannot specify both 'columns' and 'drop_superfluous_columns'."
+        )
+
     if _PANDAS_AVAILABLE and isinstance(dataframe, pd.DataFrame):
         polars_dataframe = pl.from_pandas(dataframe)
     else:
-        polars_dataframe = cast(pl.DataFrame, dataframe)
+        polars_dataframe = cast(pl.DataFrame, dataframe).clone()
 
     polars_dataframe = _transform_df(polars_dataframe, schema)
+
+    if drop_superfluous_columns:
+        # NOTE: dropping rather than selecting to get the correct error messages
+        to_drop = set(dataframe.columns) - set(schema.columns)
+        polars_dataframe = polars_dataframe.drop(to_drop)
+
     errors = _find_errors(
         dataframe=polars_dataframe,
         schema=schema,
@@ -449,3 +487,5 @@
     )
     if errors:
         raise DataFrameValidationError(errors=errors, model=schema)
+
+    return polars_dataframe
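
Taken together, `validate` now returns the cloned, transformed, and validated dataframe rather than `None`, and can drop unknown columns up front. A usage sketch, assuming `Model.validate` forwards the new keyword:

import patito as pt
import polars as pl

class Product(pt.Model):
    name: str
    price: float

df = pl.DataFrame({"name": ["apple"], "price": [1.5], "internal_id": [42]})

# Without the flag this would fail validation with a superfluous-column
# error; with it, the extra column is dropped before validation and the
# validated frame is returned:
validated = Product.validate(df, drop_superfluous_columns=True)
print(validated.columns)  # ['name', 'price']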
{patito-0.6.2.dist-info → patito-0.8.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: patito
-Version: 0.6.2
+Version: 0.8.0
 Summary: A dataframe modelling library built on top of polars and pydantic.
 Home-page: https://github.com/JakobGM/patito
 License: MIT
@@ -13,14 +13,16 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: caching
 Provides-Extra: docs
 Provides-Extra: pandas
 Requires-Dist: Sphinx (<7) ; extra == "docs"
 Requires-Dist: pandas ; extra == "pandas"
-Requires-Dist: polars (>=0.20.1)
+Requires-Dist: polars (>=1.10.0)
+Requires-Dist: pre-commit (>=3.8.0,<4.0.0)
 Requires-Dist: pyarrow (>=5.0.0) ; extra == "caching"
-Requires-Dist: pydantic (>=2.4.1)
+Requires-Dist: pydantic (>=2.7.0)
 Requires-Dist: sphinx-autobuild ; extra == "docs"
 Requires-Dist: sphinx-autodoc-typehints ; extra == "docs"
 Requires-Dist: sphinx-rtd-theme ; extra == "docs"
@@ -74,7 +76,7 @@ pip install patito
 
 ## Documentation
 
-The full documentation of Patio can be found [here](https://patito.readthedocs.io).
+The full documentation of Patito can be found [here](https://patito.readthedocs.io).
 
 ## 👮 Data validation
 
patito-0.8.0.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+patito/__init__.py,sha256=4qD13kfoa85_kyTCChm3xQcKKzIy3G8AZQp8T_bjcmo,844
+patito/_docs.py,sha256=9mfttyylWpqaOZv8xfDMEwCHHaY7GQwfyI7CDg7tWe8,162
+patito/_pydantic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+patito/_pydantic/column_info.py,sha256=RZdEdv41Z34t1CVewKlOSjnvgBF2bfriT_FxkDiaRBI,5442
+patito/_pydantic/dtypes/__init__.py,sha256=2vTvL4N4yMN0cbv2CSoa1OFwCswx6FhuQdsYhMaz_dU,578
+patito/_pydantic/dtypes/dtypes.py,sha256=GDZxEBsNzc3jOsEF_5qkClFeKwbGUML_3gwkqLMQYeM,9612
+patito/_pydantic/dtypes/utils.py,sha256=mjGOsrJ7R-WJ5KLpureaFBnYY8lGbqpuYDh_8LVJjnI,6674
+patito/_pydantic/repr.py,sha256=P7ojqTeNM4htzZgw2qMO6XzFqkiIXLwP-WUIEqNt7I0,4182
+patito/_pydantic/schema.py,sha256=BI2qAhNM29NxS366K9eRi8thgE2P3t8GFt1HzwlWxos,3603
+patito/exceptions.py,sha256=XK6UF_UojeOR45TJnZqS19SHZAUIvu-nswqf1tnFJ08,6034
+patito/polars.py,sha256=nd73Hzpji6rP5wpbYI_FtGGPoQVQ2pvij8GXlx_MrPc,37738
+patito/pydantic.py,sha256=zo9321xdRLzFZdL35xr8pQaQp07VOhLGPJfCwrvPYyA,49473
+patito/validators.py,sha256=WZqtXVH3gHSdKJXqIMM9j90CPfghihqLpqVJJeoVwBw,18494
+patito-0.8.0.dist-info/LICENSE,sha256=3bc4YyuF0e5nd59E3CsR8QM1Ua7pqKfC9DD1LVBVMs4,1139
+patito-0.8.0.dist-info/METADATA,sha256=Dz-oHA5zk4d0YVCsmbSJpzP2jR22lLX4mkSv1Wz_CJQ,14042
+patito-0.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+patito-0.8.0.dist-info/RECORD,,
{patito-0.6.2.dist-info → patito-0.8.0.dist-info}/WHEEL RENAMED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.6.1
+Generator: poetry-core 1.9.0
 Root-Is-Purelib: true
 Tag: py3-none-any
patito-0.6.2.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
-patito/__init__.py,sha256=4qD13kfoa85_kyTCChm3xQcKKzIy3G8AZQp8T_bjcmo,844
-patito/_docs.py,sha256=9mfttyylWpqaOZv8xfDMEwCHHaY7GQwfyI7CDg7tWe8,162
-patito/_pydantic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-patito/_pydantic/column_info.py,sha256=zy3z0gCdQZhNA3_eQ9mEf3Di-gOR8Gt4vmS3v2iULkc,3536
-patito/_pydantic/dtypes/__init__.py,sha256=2vTvL4N4yMN0cbv2CSoa1OFwCswx6FhuQdsYhMaz_dU,578
-patito/_pydantic/dtypes/dtypes.py,sha256=nHtgyI0LsvA_hEIELUxtS4JnDwutqem7iT6nxMDLJxc,9510
-patito/_pydantic/dtypes/utils.py,sha256=6g2mVVSYCs0LSqiPlc4D2Wm3X2gm8sKJnXZYcthfabY,7017
-patito/_pydantic/repr.py,sha256=l9WLjwJ85nJwZCxLIwHih7UuMVVgz17W5th_UD7XZAM,4341
-patito/_pydantic/schema.py,sha256=1XLByZ1jJVP7PUNTkoSPDo0D_hy8QncNLjXKV2N0XDE,3622
-patito/exceptions.py,sha256=VfkkpLblu2Go4QnfWwew7g1NJ_gmynv28p-eGH84tLs,6060
-patito/polars.py,sha256=iAnMFfVyJfSdHantESrIdaX6tZDWj71jyWBery325ac,35333
-patito/pydantic.py,sha256=1gyPfo8-68sdy26yC8c7CQhk_9Mmr0KRsyuS4g54Ddw,48685
-patito/validators.py,sha256=d7lu3MBqaaLLvBVMd5BgarLYpGYHMeJEuTSGAoYqDf0,16231
-patito-0.6.2.dist-info/LICENSE,sha256=3bc4YyuF0e5nd59E3CsR8QM1Ua7pqKfC9DD1LVBVMs4,1139
-patito-0.6.2.dist-info/METADATA,sha256=vUijDkEO0zT5uxED3sN3fTvgsFEIojFYFcENJ1u9_cA,13947
-patito-0.6.2.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-patito-0.6.2.dist-info/RECORD,,