patito 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
patito/validators.py CHANGED
@@ -1,9 +1,20 @@
1
1
  """Module for validating datastructures with respect to model specifications."""
2
+
2
3
  from __future__ import annotations
3
4
 
4
- from typing import TYPE_CHECKING, Optional, Sequence, Type, Union, cast, Any
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Any,
8
+ Optional,
9
+ Sequence,
10
+ Type,
11
+ Union,
12
+ _UnionGenericAlias,
13
+ cast,
14
+ )
5
15
 
6
16
  import polars as pl
17
+ from pydantic.aliases import AliasGenerator
7
18
  from typing_extensions import get_args
8
19
 
9
20
  from patito._pydantic.dtypes import is_optional
@@ -18,7 +29,7 @@ from patito.exceptions import (
18
29
  )
19
30
 
20
31
  try:
21
- import pandas as pd
32
+ import pandas as pd # type: ignore
22
33
 
23
34
  _PANDAS_AVAILABLE = True
24
35
  except ImportError:
@@ -52,11 +63,9 @@ def _dewrap_optional(type_annotation: Type[Any] | Any) -> Type:
52
63
  Is a no-op for non-Optional types.
53
64
 
54
65
  Args:
55
- ----
56
66
  type_annotation: The type annotation to be dewrapped.
57
67
 
58
68
  Returns:
59
- -------
60
69
  The input type, but with the outermost Optional removed.
61
70
 
62
71
  """
@@ -71,6 +80,33 @@ def _dewrap_optional(type_annotation: Type[Any] | Any) -> Type:
71
80
  )
72
81
 
73
82
 
83
def _transform_df(dataframe: pl.DataFrame, schema: type[Model]) -> pl.DataFrame:
    """Transform any properties of the dataframe according to the model.

    Currently only supports using AliasGenerator to transform column names to match a model.

    The dataframe passed in is never modified. When the model configures an alias
    generator, the renaming is applied to a (cheap, data-sharing) clone, so the
    caller's dataframe keeps its original column names.

    Args:
        dataframe: Polars DataFrame to be validated.
        schema: Patito model which specifies how the dataframe should be structured.

    Returns:
        The given dataframe itself if the model has no alias generator, otherwise
        a new dataframe whose column names have been passed through the alias
        function.

    Raises:
        AssertionError: If the model's AliasGenerator provides neither a
            ``validation_alias`` nor an ``alias`` function.

    """
    # Check if an alias generator is present in model_config
    alias_gen = schema.model_config.get("alias_generator")
    if not alias_gen:
        return dataframe

    if isinstance(alias_gen, AliasGenerator):
        alias_func = alias_gen.validation_alias or alias_gen.alias
        # Raised explicitly (rather than via ``assert``) so the check survives
        # ``python -O``; the exception type and message are kept unchanged.
        if alias_func is None:
            raise AssertionError(
                "An AliasGenerator must contain a transforming function"
            )
    else:  # alias_gen is a function
        alias_func = alias_gen

    new_cols: list[str] = [
        alias_func(field_name) for field_name in dataframe.columns
    ]  # type: ignore

    # Assigning to ``dataframe.columns`` would rename the caller's frame in
    # place, so the new names are set on a clone instead.
    renamed = dataframe.clone()
    renamed.columns = new_cols
    return renamed
108
+
109
+
74
110
  def _find_errors( # noqa: C901
75
111
  dataframe: pl.DataFrame,
76
112
  schema: Type[Model],
@@ -81,7 +117,6 @@ def _find_errors( # noqa: C901
81
117
  """Validate the given dataframe.
82
118
 
83
119
  Args:
84
- ----
85
120
  dataframe: Polars DataFrame to be validated.
86
121
  schema: Patito model which specifies how the dataframe should be structured.
87
122
  columns: If specified, only validate the given columns. Missing columns will
@@ -92,7 +127,6 @@ def _find_errors( # noqa: C901
92
127
  allow_superfluous_columns: If True, additional columns will not be considered an error.
93
128
 
94
129
  Returns:
95
- -------
96
130
  A list of patito.exception.ErrorWrapper instances. The specific validation
97
131
  error can be retrieved from the "exc" attribute on each error wrapper instance.
98
132
 
@@ -202,21 +236,14 @@ def _find_errors( # noqa: C901
202
236
  )
203
237
 
204
238
  # Test for when only specific values are accepted
205
- if "enum" in column_properties:
206
- permissible_values = set(column_properties["enum"])
207
- if column_name in schema.nullable_columns:
208
- permissible_values.add(None)
209
- actual_values = set(dataframe[column_name].unique())
210
- impermissible_values = actual_values - permissible_values
211
- if impermissible_values:
212
- errors.append(
213
- ErrorWrapper(
214
- RowValueError(
215
- f"Rows with invalid values: {impermissible_values}."
216
- ),
217
- loc=column_name,
218
- )
219
- )
239
+ e = _find_enum_errors(
240
+ df=dataframe,
241
+ column_name=column_name,
242
+ props=column_properties,
243
+ schema=schema,
244
+ )
245
+ if e is not None:
246
+ errors.append(e)
220
247
 
221
248
  if column_info.unique:
222
249
  # Coalescing to 0 in the case of dataframe of height 0
@@ -229,18 +256,76 @@ def _find_errors( # noqa: C901
229
256
  )
230
257
  )
231
258
 
259
+ # Intercept struct columns, and process errors separately
260
+ if schema.dtypes[column_name] == pl.Struct:
261
+ nested_schema = schema.model_fields[column_name].annotation
262
+
263
+ # Additional unpack required if structs column is optional
264
+ if type(nested_schema) == _UnionGenericAlias:
265
+ nested_schema = nested_schema.__args__[0]
266
+
267
+ # We need to filter out any null rows as the submodel won't know
268
+ # that all of a row's columns may be null
269
+ dataframe = dataframe.filter(pl.col(column_name).is_not_null())
270
+ if dataframe.is_empty():
271
+ continue
272
+
273
+ struct_errors = _find_errors(
274
+ dataframe=dataframe.select(column_name).unnest(column_name),
275
+ schema=nested_schema,
276
+ )
277
+
278
+ # Format nested errors
279
+ for error in struct_errors:
280
+ error._loc = f"{column_name}.{error._loc}"
281
+
282
+ errors.extend(struct_errors)
283
+
284
+ # No need to do any more checks
285
+ continue
286
+
287
+ # Intercept list of structs columns, and process errors separately
288
+ elif schema.dtypes[column_name] == pl.List(pl.Struct):
289
+ nested_schema = schema.model_fields[column_name].annotation.__args__[0]
290
+
291
+ # Additional unpack required if structs column is optional
292
+ if type(nested_schema) == _UnionGenericAlias:
293
+ nested_schema = nested_schema.__args__[0]
294
+
295
+ # We need to filter out any null rows as the submodel won't know
296
+ # that all of a row's columns may be null
297
+ dataframe = dataframe.filter(pl.col(column_name).is_not_null())
298
+ if dataframe.is_empty():
299
+ continue
300
+
301
+ list_struct_errors = _find_errors(
302
+ dataframe=dataframe.select(column_name)
303
+ .explode(column_name)
304
+ .unnest(column_name),
305
+ schema=nested_schema,
306
+ )
307
+
308
+ # Format nested errors
309
+ for error in list_struct_errors:
310
+ error._loc = f"{column_name}.{error._loc}"
311
+
312
+ errors.extend(list_struct_errors)
313
+
314
+ # No need to do any more checks
315
+ continue
316
+
232
317
  # Check for bounded value fields
233
318
  col = pl.col(column_name)
234
319
  filters = {
235
- "maximum": lambda v: col <= v,
236
- "exclusiveMaximum": lambda v: col < v,
237
- "minimum": lambda v: col >= v,
238
- "exclusiveMinimum": lambda v: col > v,
239
- "multipleOf": lambda v: (col == 0) | ((col % v) == 0),
240
- "const": lambda v: col == v,
241
- "pattern": lambda v: col.str.contains(v),
242
- "minLength": lambda v: col.str.len_chars() >= v,
243
- "maxLength": lambda v: col.str.len_chars() <= v,
320
+ "maximum": lambda v, col=col: col <= v,
321
+ "exclusiveMaximum": lambda v, col=col: col < v,
322
+ "minimum": lambda v, col=col: col >= v,
323
+ "exclusiveMinimum": lambda v, col=col: col > v,
324
+ "multipleOf": lambda v, col=col: (col == 0) | ((col % v) == 0),
325
+ "const": lambda v, col=col: col == v,
326
+ "pattern": lambda v, col=col: col.str.contains(v),
327
+ "minLength": lambda v, col=col: col.str.len_chars() >= v,
328
+ "maxLength": lambda v, col=col: col.str.len_chars() <= v,
244
329
  }
245
330
  if "anyOf" in column_properties:
246
331
  checks = [
@@ -280,7 +365,7 @@ def _find_errors( # noqa: C901
280
365
  custom_constraints = column_info.constraints
281
366
  if isinstance(custom_constraints, pl.Expr):
282
367
  custom_constraints = [custom_constraints]
283
- constraints = pl.all_horizontal(
368
+ constraints = pl.any_horizontal(
284
369
  [constraint.not_() for constraint in custom_constraints]
285
370
  )
286
371
  if "_" in constraints.meta.root_names():
@@ -305,6 +390,29 @@ def _find_errors( # noqa: C901
305
390
  return errors
306
391
 
307
392
 
393
def _find_enum_errors(
    df: pl.DataFrame, column_name: str, props: dict[str, Any], schema: Type[Model]
) -> ErrorWrapper | None:
    """Check that a column only holds values permitted by its enum specification.

    The enum is read from ``props["enum"]``. If that key is absent but
    ``props["items"]`` carries an enum (a list-typed column), the check is
    repeated with the item properties instead.

    Args:
        df: Dataframe containing the column to check.
        column_name: Name of the column to check.
        props: Schema properties of the column (or of its list items).
        schema: Patito model the dataframe is validated against; used to look up
            whether the column is nullable.

    Returns:
        An ErrorWrapper holding a RowValueError if impermissible values are
        found, otherwise None (also when no enum applies to the column).

    """
    if "enum" not in props:
        if "items" in props and "enum" in props["items"]:
            return _find_enum_errors(df, column_name, props["items"], schema)
        return None

    permissible_values = set(props["enum"])
    if column_name in schema.nullable_columns:
        permissible_values.add(None)

    column = df[column_name]
    if isinstance(column.dtype, pl.List):
        # explode() turns an empty list into a single null, which would then be
        # reported as the invalid value None even though an empty list contains
        # no invalid items. Skip empty lists; null rows (whose length is null)
        # are kept so they are treated exactly as before.
        keep = (column.list.len() != 0).fill_null(True)
        actual_values = set(column.filter(keep).explode().unique())
    else:
        actual_values = set(column.unique())

    impermissible_values = actual_values - permissible_values
    if impermissible_values:
        return ErrorWrapper(
            RowValueError(f"Rows with invalid values: {impermissible_values}."),
            loc=column_name,
        )
    return None
414
+
415
+
308
416
  def validate(
309
417
  dataframe: Union["pd.DataFrame", pl.DataFrame],
310
418
  schema: Type[Model],
@@ -315,14 +423,14 @@ def validate(
315
423
  """Validate the given dataframe.
316
424
 
317
425
  Args:
318
- ----
319
426
  dataframe: Polars DataFrame to be validated.
320
427
  schema: Patito model which specifies how the dataframe should be structured.
428
+ columns: Optional list of columns to validate. If not provided, all columns
429
+ of the dataframe will be validated.
321
430
  allow_missing_columns: If True, missing columns will not be considered an error.
322
431
  allow_superfluous_columns: If True, additional columns will not be considered an error.
323
432
 
324
433
  Raises:
325
- ------
326
434
  DataFrameValidationError: If the given dataframe does not match the given schema.
327
435
 
328
436
  """
@@ -331,6 +439,7 @@ def validate(
331
439
  else:
332
440
  polars_dataframe = cast(pl.DataFrame, dataframe)
333
441
 
442
+ polars_dataframe = _transform_df(polars_dataframe, schema)
334
443
  errors = _find_errors(
335
444
  dataframe=polars_dataframe,
336
445
  schema=schema,
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: patito
3
- Version: 0.6.1
3
+ Version: 0.7.0
4
4
  Summary: A dataframe modelling library built on top of polars and pydantic.
5
- Home-page: https://github.com/kolonialno/patito
5
+ Home-page: https://github.com/JakobGM/patito
6
6
  License: MIT
7
7
  Keywords: validation,dataframe
8
8
  Author: Jakob Gerhard Martinussen
9
9
  Author-email: jakobgm@gmail.com
10
- Requires-Python: >=3.9,<4.0
10
+ Requires-Python: >=3.9
11
11
  Classifier: License :: OSI Approved :: MIT License
12
12
  Classifier: Programming Language :: Python :: 3
13
13
  Classifier: Programming Language :: Python :: 3.9
@@ -17,10 +17,10 @@ Provides-Extra: caching
17
17
  Provides-Extra: docs
18
18
  Provides-Extra: pandas
19
19
  Requires-Dist: Sphinx (<7) ; extra == "docs"
20
- Requires-Dist: pandas ; (python_version >= "3.9" and python_version < "4.0") and (extra == "pandas")
21
- Requires-Dist: polars (>=0.20.1)
22
- Requires-Dist: pyarrow (>=5.0.0) ; (python_version >= "3.9" and python_version < "4.0") and (extra == "caching")
23
- Requires-Dist: pydantic (>=2.0.0)
20
+ Requires-Dist: pandas ; extra == "pandas"
21
+ Requires-Dist: polars (>=1.0.0)
22
+ Requires-Dist: pyarrow (>=5.0.0) ; extra == "caching"
23
+ Requires-Dist: pydantic (>=2.7.0)
24
24
  Requires-Dist: sphinx-autobuild ; extra == "docs"
25
25
  Requires-Dist: sphinx-autodoc-typehints ; extra == "docs"
26
26
  Requires-Dist: sphinx-rtd-theme ; extra == "docs"
@@ -28,7 +28,7 @@ Requires-Dist: sphinx-toolbox ; extra == "docs"
28
28
  Requires-Dist: sphinxcontrib-mermaid ; extra == "docs"
29
29
  Requires-Dist: typing-extensions
30
30
  Project-URL: Documentation, https://patito.readthedocs.io
31
- Project-URL: Repository, https://github.com/kolonialno/patito
31
+ Project-URL: Repository, https://github.com/JakobGM/patito
32
32
  Description-Content-Type: text/markdown
33
33
 
34
34
  # <center><img height="30px" src="https://em-content.zobj.net/thumbs/120/samsung/78/duck_1f986.png"> Patito</center>
@@ -63,7 +63,6 @@ These schema can be used for:
63
63
  🧪 Easy generation of valid mock data frames for tests.\
64
64
  🐍 Retrieve and represent singular rows in an object-oriented manner.\
65
65
  🧠 Provide a single source of truth for the core data models in your code base. \
66
- 🦆 Integration with DuckDB for running flexible SQL queries.
67
66
 
68
67
  Patito has first-class support for [polars](https://github.com/pola-rs/polars), a _"blazingly fast DataFrames library written in Rust"_.
69
68
 
@@ -73,16 +72,6 @@ Patito has first-class support for [polars]("https://github.com/pola-rs/polars")
73
72
  pip install patito
74
73
  ```
75
74
 
76
- #### DuckDB Integration
77
-
78
- Patito can also integrate with [DuckDB](https://duckdb.org/).
79
- In order to enable this integration you must explicitly specify it during installation:
80
-
81
- ```sh
82
- pip install 'patito[duckdb]'
83
- ```
84
-
85
-
86
75
  ## Documentation
87
76
 
88
77
  The full documentation of Patito can be found [here](https://patito.readthedocs.io).
@@ -93,7 +82,7 @@ Patito allows you to specify the type of each column in your dataframe by creati
93
82
 
94
83
  ```py
95
84
  # models.py
96
- from typing import Literal, Optional
85
+ from typing import Literal
97
86
 
98
87
  import patito as pt
99
88
 
@@ -118,7 +107,7 @@ df = pl.DataFrame(
118
107
  )
119
108
  try:
120
109
  Product.validate(df)
121
- except pt.ValidationError as exc:
110
+ except pt.exceptions.DataFrameValidationError as exc:
122
111
  print(exc)
123
112
  # 3 validation errors for Product
124
113
  # is_for_sale
@@ -164,7 +153,7 @@ def num_products_for_sale(products: pl.DataFrame) -> int:
164
153
  return products.filter(pl.col("is_for_sale")).height
165
154
  ```
166
155
 
167
- The following test would fail with a `patito.ValidationError`:
156
+ The following test would fail with a `patito.exceptions.DataFrameValidationError`:
168
157
 
169
158
  ```py
170
159
  def test_num_products_for_sale():
@@ -0,0 +1,17 @@
1
+ patito/__init__.py,sha256=4qD13kfoa85_kyTCChm3xQcKKzIy3G8AZQp8T_bjcmo,844
2
+ patito/_docs.py,sha256=9mfttyylWpqaOZv8xfDMEwCHHaY7GQwfyI7CDg7tWe8,162
3
+ patito/_pydantic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ patito/_pydantic/column_info.py,sha256=MMsbMAif0h71-qZYGh5Lcq4bcU_87hmYWwam4zDPFDg,3545
5
+ patito/_pydantic/dtypes/__init__.py,sha256=2vTvL4N4yMN0cbv2CSoa1OFwCswx6FhuQdsYhMaz_dU,578
6
+ patito/_pydantic/dtypes/dtypes.py,sha256=alappjjAYpQ_YZTMwRx9TeDqKsCo4cmiM2HVxrCph2g,9610
7
+ patito/_pydantic/dtypes/utils.py,sha256=idWZORrs3FCTd6FtI8h9A4wZRVTgfUMzcCb7JqLVyiQ,7001
8
+ patito/_pydantic/repr.py,sha256=l9WLjwJ85nJwZCxLIwHih7UuMVVgz17W5th_UD7XZAM,4341
9
+ patito/_pydantic/schema.py,sha256=1XLByZ1jJVP7PUNTkoSPDo0D_hy8QncNLjXKV2N0XDE,3622
10
+ patito/exceptions.py,sha256=VfkkpLblu2Go4QnfWwew7g1NJ_gmynv28p-eGH84tLs,6060
11
+ patito/polars.py,sha256=pv5W_1b-E8523VbYdFfsVxR1lCqE2n2vwlZQ3KdkReA,35436
12
+ patito/pydantic.py,sha256=REaMK0vUwpPO3t-ktP_5PNsiUYcoAAFhMfgFIjaDA5A,48672
13
+ patito/validators.py,sha256=d7lu3MBqaaLLvBVMd5BgarLYpGYHMeJEuTSGAoYqDf0,16231
14
+ patito-0.7.0.dist-info/LICENSE,sha256=3bc4YyuF0e5nd59E3CsR8QM1Ua7pqKfC9DD1LVBVMs4,1139
15
+ patito-0.7.0.dist-info/METADATA,sha256=pLAXcJKh7eFdulpqQWrf5q2r4FqvOwNJlRP3AoZAYlw,13946
16
+ patito-0.7.0.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
17
+ patito-0.7.0.dist-info/RECORD,,
patito/xdg.py DELETED
@@ -1,24 +0,0 @@
1
- """Module implementing the XDG directory standard."""
2
- import os
3
- from pathlib import Path
4
- from typing import Optional
5
-
6
-
7
def cache_home(application: Optional[str] = None) -> Path:
    """Return path to directory containing user-specific non-essential data files.

    The base directory is read from the ``XDG_CACHE_HOME`` environment variable.
    If the variable is unset or empty, ``~/.cache`` is used, with ``~`` expanded
    to the user's home directory. The returned directory is created if it does
    not already exist.

    Args:
    ----
        application: An optional name of an application for which to return an
            application-specific cache directory for.

    Returns:
    -------
        A path object pointing to a directory to store cache files.

    """
    # An empty XDG_CACHE_HOME counts as unset, hence ``or`` rather than a
    # ``get`` default.
    base = os.environ.get("XDG_CACHE_HOME") or "~/.cache"
    # resolve() does not expand "~"; without expanduser() the default would
    # become a literal "~" directory below the current working directory.
    path = Path(base).expanduser().resolve()
    if application:
        path = path / application
    path.mkdir(exist_ok=True, parents=True)
    return path
@@ -1,18 +0,0 @@
1
- patito/__init__.py,sha256=pW3q3tt3gR7JbEdRZ9OZtSoLcyUmrWTtXX6ulvJrwdA,843
2
- patito/_docs.py,sha256=bobkmo8-RRdz80_KY53y_i1Gcp1WWTH5-D5ZHGidpok,161
3
- patito/_pydantic/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
4
- patito/_pydantic/column_info.py,sha256=ifGTRkyst2GkErpK4hAnXoFy4XkXjFEymjklYMDwta8,2956
5
- patito/_pydantic/dtypes/__init__.py,sha256=2vTvL4N4yMN0cbv2CSoa1OFwCswx6FhuQdsYhMaz_dU,578
6
- patito/_pydantic/dtypes/dtypes.py,sha256=54s4lhH76QPrk4kxV-twa0gxd0e4A9MFysvKU76W2uo,9743
7
- patito/_pydantic/dtypes/utils.py,sha256=GDIeqYBGyznYkj4m6sP4csmHpVqWXCf_bU7H1x2fCn4,7036
8
- patito/_pydantic/repr.py,sha256=1UlDlQD5l0Q7n8cGFcd3K8zu6Cb-eWQYeGHwsJWLxhA,4339
9
- patito/_pydantic/schema.py,sha256=i_P-sBGQf_u5AbmWN1RCu5awe-LLH5JAORY_QCydOrI,3643
10
- patito/exceptions.py,sha256=wEBFdo7OVhbSOc3zwd23OyNHck5tPid7FLKSr2aRTKo,5637
11
- patito/polars.py,sha256=Jq9wemYtO58r-0eSQNgKdeva-BsUhrENcliCv9FtHMA,35352
12
- patito/pydantic.py,sha256=E8ktj99-Q-saGq4Tei2BAn2zm5gC83456yzUxSLyUwQ,48696
13
- patito/validators.py,sha256=6mzSNTMS8FiHjPoaAo9fTyl2yTiWtot5mRREvnd9_UU,12144
14
- patito/xdg.py,sha256=XS-dBWjeRTV_vSveplK6CIMe0lVkSzr1F1nNM5Hb6L0,703
15
- patito-0.6.1.dist-info/LICENSE,sha256=3bc4YyuF0e5nd59E3CsR8QM1Ua7pqKfC9DD1LVBVMs4,1139
16
- patito-0.6.1.dist-info/METADATA,sha256=UwEG_kRvluYX7lHRZouzP4lbGRA9TMMEuHBjwZv4bSI,14326
17
- patito-0.6.1.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
18
- patito-0.6.1.dist-info/RECORD,,
File without changes