pointblank 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +192 -5
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +202 -149
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +40 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2695 -49
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +780 -231
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +10 -6
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/METADATA +2 -2
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/RECORD +29 -27
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/WHEEL +0 -0
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
|
@@ -11,7 +11,7 @@ failure thresholds (using the `Thresholds` class or through shorthands for this
|
|
|
11
11
|
`Validate` class has numerous methods for defining validation steps and for obtaining
|
|
12
12
|
post-interrogation metrics and data.
|
|
13
13
|
|
|
14
|
-
Validate(data: '
|
|
14
|
+
Validate(data: 'IntoDataFrame', reference: 'IntoFrame | None' = None, tbl_name: 'str | None' = None, label: 'str | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, final_actions: 'FinalActions | None' = None, brief: 'str | bool | None' = None, lang: 'str | None' = None, locale: 'str | None' = None) -> None
|
|
15
15
|
|
|
16
16
|
Workflow for defining a set of validations on a table and interrogating for results.
|
|
17
17
|
|
|
@@ -916,7 +916,7 @@ FinalActions(*args)
|
|
|
916
916
|
used to retrieve the summary of the validation results.
|
|
917
917
|
|
|
918
918
|
|
|
919
|
-
Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: '
|
|
919
|
+
Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'Any | None' = None, **kwargs)
|
|
920
920
|
Definition of a schema object.
|
|
921
921
|
|
|
922
922
|
The schema object defines the structure of a table. Once it is defined, the object can be used
|
|
@@ -1167,7 +1167,7 @@ Definition of a schema object.
|
|
|
1167
1167
|
`Schema` object is used in a validation workflow.
|
|
1168
1168
|
|
|
1169
1169
|
|
|
1170
|
-
DraftValidation(data: '
|
|
1170
|
+
DraftValidation(data: 'Any', model: 'str', api_key: 'str | None' = None, verify_ssl: 'bool' = True) -> None
|
|
1171
1171
|
|
|
1172
1172
|
Draft a validation plan for a given table using an LLM.
|
|
1173
1173
|
|
|
@@ -1382,7 +1382,7 @@ Validation steps can be thought of as sequential validations on the target
|
|
|
1382
1382
|
data. We call `Validate`'s validation methods to build up a validation plan: a collection of steps
|
|
1383
1383
|
that, in the aggregate, provides good validation coverage.
|
|
1384
1384
|
|
|
1385
|
-
col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1385
|
+
col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1386
1386
|
|
|
1387
1387
|
Are column data greater than a fixed value or data in another column?
|
|
1388
1388
|
|
|
@@ -1607,7 +1607,7 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1607
1607
|
- Row 3: `c` is `2` and `b` is `2`.
|
|
1608
1608
|
|
|
1609
1609
|
|
|
1610
|
-
col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1610
|
+
col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1611
1611
|
|
|
1612
1612
|
Are column data less than a fixed value or data in another column?
|
|
1613
1613
|
|
|
@@ -1832,7 +1832,7 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1832
1832
|
- Row 2: `b` is `1` and `c` is `1`.
|
|
1833
1833
|
|
|
1834
1834
|
|
|
1835
|
-
col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1835
|
+
col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1836
1836
|
|
|
1837
1837
|
Are column data greater than or equal to a fixed value or data in another column?
|
|
1838
1838
|
|
|
@@ -2057,7 +2057,7 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2057
2057
|
- Row 4: `b` is `3` and `c` is `4`.
|
|
2058
2058
|
|
|
2059
2059
|
|
|
2060
|
-
col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2060
|
+
col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2061
2061
|
|
|
2062
2062
|
Are column data less than or equal to a fixed value or data in another column?
|
|
2063
2063
|
|
|
@@ -2282,7 +2282,7 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2282
2282
|
- Row 4: `c` is `3` and `b` is `2`.
|
|
2283
2283
|
|
|
2284
2284
|
|
|
2285
|
-
col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2285
|
+
col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2286
2286
|
|
|
2287
2287
|
Are column data equal to a fixed value or data in another column?
|
|
2288
2288
|
|
|
@@ -2505,7 +2505,7 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2505
2505
|
- Row 5: `a` is `5` and `b` is `4`.
|
|
2506
2506
|
|
|
2507
2507
|
|
|
2508
|
-
col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2508
|
+
col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2509
2509
|
|
|
2510
2510
|
Are column data not equal to a fixed value or data in another column?
|
|
2511
2511
|
|
|
@@ -2726,7 +2726,7 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2726
2726
|
0 and 4, where `a` is `5` and `b` is `5` in both cases (i.e., they are equal to each other).
|
|
2727
2727
|
|
|
2728
2728
|
|
|
2729
|
-
col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2729
|
+
col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2730
2730
|
|
|
2731
2731
|
Do column data lie between two specified values or data in other columns?
|
|
2732
2732
|
|
|
@@ -2971,7 +2971,7 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2971
2971
|
- Row 4: `b` is `8` but the bounds are `3` (`a`) and `7` (`c`).
|
|
2972
2972
|
|
|
2973
2973
|
|
|
2974
|
-
col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2974
|
+
col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2975
2975
|
|
|
2976
2976
|
Do column data lie outside of two specified values or data in other columns?
|
|
2977
2977
|
|
|
@@ -3216,7 +3216,7 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
3216
3216
|
- Row 5: `b` is `6` and the bounds are `5` (`a`) and `7` (`c`).
|
|
3217
3217
|
|
|
3218
3218
|
|
|
3219
|
-
col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3219
|
+
col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3220
3220
|
|
|
3221
3221
|
Validate whether column values are in a set of values.
|
|
3222
3222
|
|
|
@@ -3463,7 +3463,7 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
|
|
|
3463
3463
|
specified set.
|
|
3464
3464
|
|
|
3465
3465
|
|
|
3466
|
-
col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3466
|
+
col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3467
3467
|
|
|
3468
3468
|
Validate whether column values are not in a set of values.
|
|
3469
3469
|
|
|
@@ -3687,7 +3687,7 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
3687
3687
|
statuses in the `InvalidStatus` enum.
|
|
3688
3688
|
|
|
3689
3689
|
|
|
3690
|
-
col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3690
|
+
col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3691
3691
|
|
|
3692
3692
|
Are column data increasing by row?
|
|
3693
3693
|
|
|
@@ -3815,7 +3815,7 @@ col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
3815
3815
|
```
|
|
3816
3816
|
|
|
3817
3817
|
|
|
3818
|
-
col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3818
|
+
col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3819
3819
|
|
|
3820
3820
|
Are column data decreasing by row?
|
|
3821
3821
|
|
|
@@ -3943,7 +3943,7 @@ col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
3943
3943
|
```
|
|
3944
3944
|
|
|
3945
3945
|
|
|
3946
|
-
col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3946
|
+
col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3947
3947
|
|
|
3948
3948
|
Validate whether values in a column are Null.
|
|
3949
3949
|
|
|
@@ -4129,7 +4129,7 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
|
|
|
4129
4129
|
two non-Null values in column `b`.
|
|
4130
4130
|
|
|
4131
4131
|
|
|
4132
|
-
col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4132
|
+
col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4133
4133
|
|
|
4134
4134
|
Validate whether values in a column are not Null.
|
|
4135
4135
|
|
|
@@ -4315,7 +4315,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
|
|
|
4315
4315
|
two Null values in column `b`.
|
|
4316
4316
|
|
|
4317
4317
|
|
|
4318
|
-
col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4318
|
+
col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4319
4319
|
|
|
4320
4320
|
Validate whether column values match a regular expression pattern.
|
|
4321
4321
|
|
|
@@ -4511,7 +4511,7 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
|
|
|
4511
4511
|
string values of rows 1 and 2 in column `b`.
|
|
4512
4512
|
|
|
4513
4513
|
|
|
4514
|
-
col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4514
|
+
col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4515
4515
|
|
|
4516
4516
|
Validate whether column values fit within a specification.
|
|
4517
4517
|
|
|
@@ -4729,7 +4729,7 @@ col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
4729
4729
|
The validation table shows that one test unit failed (the invalid email address in row 3).
|
|
4730
4730
|
|
|
4731
4731
|
|
|
4732
|
-
col_vals_expr(self, expr: '
|
|
4732
|
+
col_vals_expr(self, expr: 'Any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4733
4733
|
|
|
4734
4734
|
Validate column values using a custom expression.
|
|
4735
4735
|
|
|
@@ -4900,7 +4900,2653 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
|
|
|
4900
4900
|
by using `col_vals_expr()`. All test units passed, with no failing test units.
|
|
4901
4901
|
|
|
4902
4902
|
|
|
4903
|
-
|
|
4903
|
+
col_sum_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4904
|
+
Does the column sum satisfy a greater than comparison?
|
|
4905
|
+
|
|
4906
|
+
The `col_sum_gt()` validation method checks whether the sum of values in a column
|
|
4907
|
+
is greater than a specified `value=`. This is an aggregation-based validation where the entire
|
|
4908
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
4909
|
+
comparison used in this function is `sum(column) > value`.
|
|
4910
|
+
|
|
4911
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
4912
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
4913
|
+
the comparison) or fails completely.
|
|
4914
|
+
|
|
4915
|
+
Parameters
|
|
4916
|
+
----------
|
|
4917
|
+
columns
|
|
4918
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
4919
|
+
there will be a separate validation step generated for each column. The columns must
|
|
4920
|
+
contain numeric data for the sum to be computed.
|
|
4921
|
+
value
|
|
4922
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
4923
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
4924
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
4925
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
4926
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
4927
|
+
`ref(column_name)` when reference data is set).
|
|
4928
|
+
tol
|
|
4929
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
4930
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
4931
|
+
a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
4932
|
+
`col_sum_gt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
4933
|
+
target value and still pass validation.
|
|
4934
|
+
thresholds
|
|
4935
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
4936
|
+
failing test units reach a set level. Since this is an aggregation-based validation with only
|
|
4937
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
4938
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
4939
|
+
acceptable.
|
|
4940
|
+
brief
|
|
4941
|
+
An optional brief description of the validation step that will be displayed in the
|
|
4942
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
4943
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
4944
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
4945
|
+
won't be a brief.
|
|
4946
|
+
actions
|
|
4947
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
4948
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
4949
|
+
define the actions.
|
|
4950
|
+
active
|
|
4951
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
4952
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
4953
|
+
for the steps unchanged).
|
|
4954
|
+
|
|
4955
|
+
Returns
|
|
4956
|
+
-------
|
|
4957
|
+
Validate
|
|
4958
|
+
The `Validate` object with the added validation step.
|
|
4959
|
+
|
|
4960
|
+
Using Reference Data
|
|
4961
|
+
--------------------
|
|
4962
|
+
The `col_sum_gt()` method supports comparing column aggregations against reference data. This
|
|
4963
|
+
is useful for validating that statistical properties remain consistent across different
|
|
4964
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
4965
|
+
|
|
4966
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
4967
|
+
|
|
4968
|
+
```python
|
|
4969
|
+
validation = (
|
|
4970
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
4971
|
+
.col_sum_gt(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
4972
|
+
.interrogate()
|
|
4973
|
+
)
|
|
4974
|
+
```
|
|
4975
|
+
|
|
4976
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
4977
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
4978
|
+
the `ref()` helper:
|
|
4979
|
+
|
|
4980
|
+
```python
|
|
4981
|
+
.col_sum_gt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
4982
|
+
```
|
|
4983
|
+
|
|
4984
|
+
Understanding Tolerance
|
|
4985
|
+
-----------------------
|
|
4986
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
4987
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
4988
|
+
|
|
4989
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
4990
|
+
`col_sum_gt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
4991
|
+
target value and still pass validation.
|
|
4992
|
+
|
|
4993
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
4994
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
4995
|
+
shifts the comparison boundary.
|
|
4996
|
+
|
|
4997
|
+
Thresholds
|
|
4998
|
+
----------
|
|
4999
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5000
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5001
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5002
|
+
|
|
5003
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5004
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5005
|
+
typically set as absolute counts:
|
|
5006
|
+
|
|
5007
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5008
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5009
|
+
|
|
5010
|
+
Thresholds can be defined using one of these input schemes:
|
|
5011
|
+
|
|
5012
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5013
|
+
thresholds)
|
|
5014
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5015
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5016
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5017
|
+
'critical'
|
|
5018
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5019
|
+
for the 'warning' level only
|
|
5020
|
+
|
|
5021
|
+
Examples
|
|
5022
|
+
--------
|
|
5023
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5024
|
+
shown below:
|
|
5025
|
+
|
|
5026
|
+
```python
|
|
5027
|
+
import pointblank as pb
|
|
5028
|
+
import polars as pl
|
|
5029
|
+
|
|
5030
|
+
tbl = pl.DataFrame(
|
|
5031
|
+
{
|
|
5032
|
+
"a": [1, 2, 3, 4, 5],
|
|
5033
|
+
"b": [2, 2, 2, 2, 2],
|
|
5034
|
+
}
|
|
5035
|
+
)
|
|
5036
|
+
|
|
5037
|
+
pb.preview(tbl)
|
|
5038
|
+
```
|
|
5039
|
+
|
|
5040
|
+
Let's validate that the sum of column `a` is greater than `15`:
|
|
5041
|
+
|
|
5042
|
+
```python
|
|
5043
|
+
validation = (
|
|
5044
|
+
pb.Validate(data=tbl)
|
|
5045
|
+
.col_sum_gt(columns="a", value=15)
|
|
5046
|
+
.interrogate()
|
|
5047
|
+
)
|
|
5048
|
+
|
|
5049
|
+
validation
|
|
5050
|
+
```
|
|
5051
|
+
|
|
5052
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5053
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5054
|
+
|
|
5055
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5056
|
+
|
|
5057
|
+
```python
|
|
5058
|
+
validation = (
|
|
5059
|
+
pb.Validate(data=tbl)
|
|
5060
|
+
.col_sum_gt(columns=["a", "b"], value=15)
|
|
5061
|
+
.interrogate()
|
|
5062
|
+
)
|
|
5063
|
+
|
|
5064
|
+
validation
|
|
5065
|
+
```
|
|
5066
|
+
|
|
5067
|
+
Using tolerance for flexible comparisons:
|
|
5068
|
+
|
|
5069
|
+
```python
|
|
5070
|
+
validation = (
|
|
5071
|
+
pb.Validate(data=tbl)
|
|
5072
|
+
.col_sum_gt(columns="a", value=15, tol=1.0)
|
|
5073
|
+
.interrogate()
|
|
5074
|
+
)
|
|
5075
|
+
|
|
5076
|
+
validation
|
|
5077
|
+
```
|
|
5078
|
+
|
|
5079
|
+
col_sum_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5080
|
+
Does the column sum satisfy a less than comparison?
|
|
5081
|
+
|
|
5082
|
+
The `col_sum_lt()` validation method checks whether the sum of values in a column
|
|
5083
|
+
is less than a specified `value=`. This is an aggregation-based validation where the entire
|
|
5084
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
5085
|
+
comparison used in this function is `sum(column) < value`.
|
|
5086
|
+
|
|
5087
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5088
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5089
|
+
the comparison) or fails completely.
|
|
5090
|
+
|
|
5091
|
+
Parameters
|
|
5092
|
+
----------
|
|
5093
|
+
columns
|
|
5094
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5095
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5096
|
+
contain numeric data for the sum to be computed.
|
|
5097
|
+
value
|
|
5098
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
5099
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5100
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5101
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5102
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5103
|
+
`ref(column_name)` when reference data is set).
|
|
5104
|
+
tol
|
|
5105
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5106
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5107
|
+
a sum that exceeds the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5108
|
+
`col_sum_lt()`, a tolerance of `tol=0.5` would mean the sum can exceed the
|
|
5109
|
+
target value by up to `0.5` and still pass validation.
|
|
5110
|
+
thresholds
|
|
5111
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5112
|
+
failing test units exceed the set levels. Since this is an aggregation-based validation with only
|
|
5113
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5114
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5115
|
+
acceptable.
|
|
5116
|
+
brief
|
|
5117
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5118
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5119
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5120
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5121
|
+
won't be a brief.
|
|
5122
|
+
actions
|
|
5123
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5124
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5125
|
+
define the actions.
|
|
5126
|
+
active
|
|
5127
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5128
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5129
|
+
for the steps unchanged).
|
|
5130
|
+
|
|
5131
|
+
Returns
|
|
5132
|
+
-------
|
|
5133
|
+
Validate
|
|
5134
|
+
The `Validate` object with the added validation step.
|
|
5135
|
+
|
|
5136
|
+
Using Reference Data
|
|
5137
|
+
--------------------
|
|
5138
|
+
The `col_sum_lt()` method supports comparing column aggregations against reference data. This
|
|
5139
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5140
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5141
|
+
|
|
5142
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5143
|
+
|
|
5144
|
+
```python
|
|
5145
|
+
validation = (
|
|
5146
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5147
|
+
.col_sum_lt(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
5148
|
+
.interrogate()
|
|
5149
|
+
)
|
|
5150
|
+
```
|
|
5151
|
+
|
|
5152
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5153
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5154
|
+
the `ref()` helper:
|
|
5155
|
+
|
|
5156
|
+
```python
|
|
5157
|
+
.col_sum_lt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5158
|
+
```
|
|
5159
|
+
|
|
5160
|
+
Understanding Tolerance
|
|
5161
|
+
-----------------------
|
|
5162
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5163
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5164
|
+
|
|
5165
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5166
|
+
`col_sum_lt()`, a tolerance of `tol=0.5` would mean the sum can exceed the
|
|
5167
|
+
target value by up to `0.5` and still pass validation.
|
|
5168
|
+
|
|
5169
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5170
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5171
|
+
shifts the comparison boundary.
|
|
5172
|
+
|
|
5173
|
+
Thresholds
|
|
5174
|
+
----------
|
|
5175
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5176
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5177
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5178
|
+
|
|
5179
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5180
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5181
|
+
typically set as absolute counts:
|
|
5182
|
+
|
|
5183
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5184
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5185
|
+
|
|
5186
|
+
Thresholds can be defined using one of these input schemes:
|
|
5187
|
+
|
|
5188
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5189
|
+
thresholds)
|
|
5190
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5191
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5192
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5193
|
+
'critical'
|
|
5194
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5195
|
+
for the 'warning' level only
|
|
5196
|
+
|
|
5197
|
+
Examples
|
|
5198
|
+
--------
|
|
5199
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5200
|
+
shown below:
|
|
5201
|
+
|
|
5202
|
+
```python
|
|
5203
|
+
import pointblank as pb
|
|
5204
|
+
import polars as pl
|
|
5205
|
+
|
|
5206
|
+
tbl = pl.DataFrame(
|
|
5207
|
+
{
|
|
5208
|
+
"a": [1, 2, 3, 4, 5],
|
|
5209
|
+
"b": [2, 2, 2, 2, 2],
|
|
5210
|
+
}
|
|
5211
|
+
)
|
|
5212
|
+
|
|
5213
|
+
pb.preview(tbl)
|
|
5214
|
+
```
|
|
5215
|
+
|
|
5216
|
+
Let's validate that the sum of column `a` is less than `15`:
|
|
5217
|
+
|
|
5218
|
+
```python
|
|
5219
|
+
validation = (
|
|
5220
|
+
pb.Validate(data=tbl)
|
|
5221
|
+
.col_sum_lt(columns="a", value=15)
|
|
5222
|
+
.interrogate()
|
|
5223
|
+
)
|
|
5224
|
+
|
|
5225
|
+
validation
|
|
5226
|
+
```
|
|
5227
|
+
|
|
5228
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5229
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5230
|
+
|
|
5231
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5232
|
+
|
|
5233
|
+
```python
|
|
5234
|
+
validation = (
|
|
5235
|
+
pb.Validate(data=tbl)
|
|
5236
|
+
.col_sum_lt(columns=["a", "b"], value=15)
|
|
5237
|
+
.interrogate()
|
|
5238
|
+
)
|
|
5239
|
+
|
|
5240
|
+
validation
|
|
5241
|
+
```
|
|
5242
|
+
|
|
5243
|
+
Using tolerance for flexible comparisons:
|
|
5244
|
+
|
|
5245
|
+
```python
|
|
5246
|
+
validation = (
|
|
5247
|
+
pb.Validate(data=tbl)
|
|
5248
|
+
.col_sum_lt(columns="a", value=15, tol=1.0)
|
|
5249
|
+
.interrogate()
|
|
5250
|
+
)
|
|
5251
|
+
|
|
5252
|
+
validation
|
|
5253
|
+
```
|
|
5254
|
+
|
|
5255
|
+
col_sum_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5256
|
+
Does the column sum satisfy a greater than or equal to comparison?
|
|
5257
|
+
|
|
5258
|
+
The `col_sum_ge()` validation method checks whether the sum of values in a column
|
|
5259
|
+
is at least a specified `value=`. This is an aggregation-based validation where the entire
|
|
5260
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
5261
|
+
comparison used in this function is `sum(column) >= value`.
|
|
5262
|
+
|
|
5263
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5264
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5265
|
+
the comparison) or fails completely.
|
|
5266
|
+
|
|
5267
|
+
Parameters
|
|
5268
|
+
----------
|
|
5269
|
+
columns
|
|
5270
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5271
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5272
|
+
contain numeric data for the sum to be computed.
|
|
5273
|
+
value
|
|
5274
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
5275
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5276
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5277
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5278
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5279
|
+
`ref(column_name)` when reference data is set).
|
|
5280
|
+
tol
|
|
5281
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5282
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5283
|
+
a sum that falls short of the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5284
|
+
`col_sum_ge()`, a tolerance of `tol=0.5` would mean the sum can fall short of the
|
|
5285
|
+
target value by up to `0.5` and still pass validation.
|
|
5286
|
+
thresholds
|
|
5287
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5288
|
+
failing test units exceed the set levels. Since this is an aggregation-based validation with only
|
|
5289
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5290
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5291
|
+
acceptable.
|
|
5292
|
+
brief
|
|
5293
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5294
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5295
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5296
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5297
|
+
won't be a brief.
|
|
5298
|
+
actions
|
|
5299
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5300
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5301
|
+
define the actions.
|
|
5302
|
+
active
|
|
5303
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5304
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5305
|
+
for the steps unchanged).
|
|
5306
|
+
|
|
5307
|
+
Returns
|
|
5308
|
+
-------
|
|
5309
|
+
Validate
|
|
5310
|
+
The `Validate` object with the added validation step.
|
|
5311
|
+
|
|
5312
|
+
Using Reference Data
|
|
5313
|
+
--------------------
|
|
5314
|
+
The `col_sum_ge()` method supports comparing column aggregations against reference data. This
|
|
5315
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5316
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5317
|
+
|
|
5318
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5319
|
+
|
|
5320
|
+
```python
|
|
5321
|
+
validation = (
|
|
5322
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5323
|
+
.col_sum_ge(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
5324
|
+
.interrogate()
|
|
5325
|
+
)
|
|
5326
|
+
```
|
|
5327
|
+
|
|
5328
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5329
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5330
|
+
the `ref()` helper:
|
|
5331
|
+
|
|
5332
|
+
```python
|
|
5333
|
+
.col_sum_ge(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5334
|
+
```
|
|
5335
|
+
|
|
5336
|
+
Understanding Tolerance
|
|
5337
|
+
-----------------------
|
|
5338
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5339
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5340
|
+
|
|
5341
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5342
|
+
`col_sum_ge()`, a tolerance of `tol=0.5` would mean the sum can fall short of the
|
|
5342
|
+
target value by up to `0.5` and still pass validation.
|
|
5344
|
+
|
|
5345
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5346
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5347
|
+
shifts the comparison boundary.
|
|
5348
|
+
|
|
5349
|
+
Thresholds
|
|
5350
|
+
----------
|
|
5351
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5352
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5353
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5354
|
+
|
|
5355
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5356
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5357
|
+
typically set as absolute counts:
|
|
5358
|
+
|
|
5359
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5360
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5361
|
+
|
|
5362
|
+
Thresholds can be defined using one of these input schemes:
|
|
5363
|
+
|
|
5364
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5365
|
+
thresholds)
|
|
5366
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5367
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5368
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5369
|
+
'critical'
|
|
5370
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5371
|
+
for the 'warning' level only
|
|
5372
|
+
|
|
5373
|
+
Examples
|
|
5374
|
+
--------
|
|
5375
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5376
|
+
shown below:
|
|
5377
|
+
|
|
5378
|
+
```python
|
|
5379
|
+
import pointblank as pb
|
|
5380
|
+
import polars as pl
|
|
5381
|
+
|
|
5382
|
+
tbl = pl.DataFrame(
|
|
5383
|
+
{
|
|
5384
|
+
"a": [1, 2, 3, 4, 5],
|
|
5385
|
+
"b": [2, 2, 2, 2, 2],
|
|
5386
|
+
}
|
|
5387
|
+
)
|
|
5388
|
+
|
|
5389
|
+
pb.preview(tbl)
|
|
5390
|
+
```
|
|
5391
|
+
|
|
5392
|
+
Let's validate that the sum of column `a` is at least `15`:
|
|
5393
|
+
|
|
5394
|
+
```python
|
|
5395
|
+
validation = (
|
|
5396
|
+
pb.Validate(data=tbl)
|
|
5397
|
+
.col_sum_ge(columns="a", value=15)
|
|
5398
|
+
.interrogate()
|
|
5399
|
+
)
|
|
5400
|
+
|
|
5401
|
+
validation
|
|
5402
|
+
```
|
|
5403
|
+
|
|
5404
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5405
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5406
|
+
|
|
5407
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5408
|
+
|
|
5409
|
+
```python
|
|
5410
|
+
validation = (
|
|
5411
|
+
pb.Validate(data=tbl)
|
|
5412
|
+
.col_sum_ge(columns=["a", "b"], value=15)
|
|
5413
|
+
.interrogate()
|
|
5414
|
+
)
|
|
5415
|
+
|
|
5416
|
+
validation
|
|
5417
|
+
```
|
|
5418
|
+
|
|
5419
|
+
Using tolerance for flexible comparisons:
|
|
5420
|
+
|
|
5421
|
+
```python
|
|
5422
|
+
validation = (
|
|
5423
|
+
pb.Validate(data=tbl)
|
|
5424
|
+
.col_sum_ge(columns="a", value=15, tol=1.0)
|
|
5425
|
+
.interrogate()
|
|
5426
|
+
)
|
|
5427
|
+
|
|
5428
|
+
validation
|
|
5429
|
+
```
|
|
5430
|
+
|
|
5431
|
+
col_sum_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5432
|
+
Does the column sum satisfy a less than or equal to comparison?
|
|
5433
|
+
|
|
5434
|
+
The `col_sum_le()` validation method checks whether the sum of values in a column
|
|
5435
|
+
is at most a specified `value=`. This is an aggregation-based validation where the entire
|
|
5436
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
5437
|
+
comparison used in this function is `sum(column) <= value`.
|
|
5438
|
+
|
|
5439
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5440
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5441
|
+
the comparison) or fails completely.
|
|
5442
|
+
|
|
5443
|
+
Parameters
|
|
5444
|
+
----------
|
|
5445
|
+
columns
|
|
5446
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5447
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5448
|
+
contain numeric data for the sum to be computed.
|
|
5449
|
+
value
|
|
5450
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
5451
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5452
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5453
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5454
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5455
|
+
`ref(column_name)` when reference data is set).
|
|
5456
|
+
tol
|
|
5457
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5458
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5459
|
+
a sum that exceeds the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5460
|
+
`col_sum_le()`, a tolerance of `tol=0.5` would mean the sum can exceed the
|
|
5461
|
+
target value by up to `0.5` and still pass validation.
|
|
5462
|
+
thresholds
|
|
5463
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5464
|
+
failing test units exceed the set levels. Since this is an aggregation-based validation with only
|
|
5465
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5466
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5467
|
+
acceptable.
|
|
5468
|
+
brief
|
|
5469
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5470
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5471
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5472
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5473
|
+
won't be a brief.
|
|
5474
|
+
actions
|
|
5475
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5476
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5477
|
+
define the actions.
|
|
5478
|
+
active
|
|
5479
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5480
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5481
|
+
for the steps unchanged).
|
|
5482
|
+
|
|
5483
|
+
Returns
|
|
5484
|
+
-------
|
|
5485
|
+
Validate
|
|
5486
|
+
The `Validate` object with the added validation step.
|
|
5487
|
+
|
|
5488
|
+
Using Reference Data
|
|
5489
|
+
--------------------
|
|
5490
|
+
The `col_sum_le()` method supports comparing column aggregations against reference data. This
|
|
5491
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5492
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5493
|
+
|
|
5494
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5495
|
+
|
|
5496
|
+
```python
|
|
5497
|
+
validation = (
|
|
5498
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5499
|
+
.col_sum_le(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
5500
|
+
.interrogate()
|
|
5501
|
+
)
|
|
5502
|
+
```
|
|
5503
|
+
|
|
5504
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5505
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5506
|
+
the `ref()` helper:
|
|
5507
|
+
|
|
5508
|
+
```python
|
|
5509
|
+
.col_sum_le(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5510
|
+
```
|
|
5511
|
+
|
|
5512
|
+
Understanding Tolerance
|
|
5513
|
+
-----------------------
|
|
5514
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5515
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5516
|
+
|
|
5517
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5518
|
+
`col_sum_le()`, a tolerance of `tol=0.5` would mean the sum can exceed the
|
|
5519
|
+
target value by up to `0.5` and still pass validation.
|
|
5520
|
+
|
|
5521
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5522
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5523
|
+
shifts the comparison boundary.
|
|
5524
|
+
|
|
5525
|
+
Thresholds
|
|
5526
|
+
----------
|
|
5527
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5528
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5529
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5530
|
+
|
|
5531
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5532
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5533
|
+
typically set as absolute counts:
|
|
5534
|
+
|
|
5535
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5536
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5537
|
+
|
|
5538
|
+
Thresholds can be defined using one of these input schemes:
|
|
5539
|
+
|
|
5540
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5541
|
+
thresholds)
|
|
5542
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5543
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5544
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5545
|
+
'critical'
|
|
5546
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5547
|
+
for the 'warning' level only
|
|
5548
|
+
|
|
5549
|
+
Examples
|
|
5550
|
+
--------
|
|
5551
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5552
|
+
shown below:
|
|
5553
|
+
|
|
5554
|
+
```python
|
|
5555
|
+
import pointblank as pb
|
|
5556
|
+
import polars as pl
|
|
5557
|
+
|
|
5558
|
+
tbl = pl.DataFrame(
|
|
5559
|
+
{
|
|
5560
|
+
"a": [1, 2, 3, 4, 5],
|
|
5561
|
+
"b": [2, 2, 2, 2, 2],
|
|
5562
|
+
}
|
|
5563
|
+
)
|
|
5564
|
+
|
|
5565
|
+
pb.preview(tbl)
|
|
5566
|
+
```
|
|
5567
|
+
|
|
5568
|
+
Let's validate that the sum of column `a` is at most `15`:
|
|
5569
|
+
|
|
5570
|
+
```python
|
|
5571
|
+
validation = (
|
|
5572
|
+
pb.Validate(data=tbl)
|
|
5573
|
+
.col_sum_le(columns="a", value=15)
|
|
5574
|
+
.interrogate()
|
|
5575
|
+
)
|
|
5576
|
+
|
|
5577
|
+
validation
|
|
5578
|
+
```
|
|
5579
|
+
|
|
5580
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5581
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5582
|
+
|
|
5583
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5584
|
+
|
|
5585
|
+
```python
|
|
5586
|
+
validation = (
|
|
5587
|
+
pb.Validate(data=tbl)
|
|
5588
|
+
.col_sum_le(columns=["a", "b"], value=15)
|
|
5589
|
+
.interrogate()
|
|
5590
|
+
)
|
|
5591
|
+
|
|
5592
|
+
validation
|
|
5593
|
+
```
|
|
5594
|
+
|
|
5595
|
+
Using tolerance for flexible comparisons:
|
|
5596
|
+
|
|
5597
|
+
```python
|
|
5598
|
+
validation = (
|
|
5599
|
+
pb.Validate(data=tbl)
|
|
5600
|
+
.col_sum_le(columns="a", value=15, tol=1.0)
|
|
5601
|
+
.interrogate()
|
|
5602
|
+
)
|
|
5603
|
+
|
|
5604
|
+
validation
|
|
5605
|
+
```
|
|
5606
|
+
|
|
5607
|
+
col_sum_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5608
|
+
Does the column sum satisfy an equal to comparison?
|
|
5609
|
+
|
|
5610
|
+
The `col_sum_eq()` validation method checks whether the sum of values in a column
|
|
5611
|
+
equals a specified `value=`. This is an aggregation-based validation where the entire
|
|
5612
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
5613
|
+
comparison used in this function is `sum(column) == value`.
|
|
5614
|
+
|
|
5615
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5616
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5617
|
+
the comparison) or fails completely.
|
|
5618
|
+
|
|
5619
|
+
Parameters
|
|
5620
|
+
----------
|
|
5621
|
+
columns
|
|
5622
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5623
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5624
|
+
contain numeric data for the sum to be computed.
|
|
5625
|
+
value
|
|
5626
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
5627
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5628
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5629
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5630
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5631
|
+
`ref(column_name)` when reference data is set).
|
|
5632
|
+
tol
|
|
5633
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5634
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5635
|
+
a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_sum_eq()` since exact equality
|
|
5636
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
5637
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
5638
|
+
floating-point arithmetic.
|
|
5639
|
+
thresholds
|
|
5640
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5641
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
5642
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5643
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5644
|
+
acceptable.
|
|
5645
|
+
brief
|
|
5646
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5647
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5648
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5649
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5650
|
+
won't be a brief.
|
|
5651
|
+
actions
|
|
5652
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5653
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5654
|
+
define the actions.
|
|
5655
|
+
active
|
|
5656
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5657
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5658
|
+
for the steps unchanged).
|
|
5659
|
+
|
|
5660
|
+
Returns
|
|
5661
|
+
-------
|
|
5662
|
+
Validate
|
|
5663
|
+
The `Validate` object with the added validation step.
|
|
5664
|
+
|
|
5665
|
+
Using Reference Data
|
|
5666
|
+
--------------------
|
|
5667
|
+
The `col_sum_eq()` method supports comparing column aggregations against reference data. This
|
|
5668
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5669
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5670
|
+
|
|
5671
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5672
|
+
|
|
5673
|
+
```python
|
|
5674
|
+
validation = (
|
|
5675
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5676
|
+
.col_sum_eq(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
5677
|
+
.interrogate()
|
|
5678
|
+
)
|
|
5679
|
+
```
|
|
5680
|
+
|
|
5681
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5682
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5683
|
+
the `ref()` helper:
|
|
5684
|
+
|
|
5685
|
+
```python
|
|
5686
|
+
.col_sum_eq(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5687
|
+
```
|
|
5688
|
+
|
|
5689
|
+
Understanding Tolerance
|
|
5690
|
+
-----------------------
|
|
5691
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5692
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5693
|
+
|
|
5694
|
+
The `tol=` parameter is particularly useful with `col_sum_eq()` since exact equality
|
|
5695
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
5696
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
5697
|
+
floating-point arithmetic.
|
|
5698
|
+
|
|
5699
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5700
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5701
|
+
shifts the comparison boundary.
|
|
5702
|
+
|
|
5703
|
+
Thresholds
|
|
5704
|
+
----------
|
|
5705
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5706
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5707
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5708
|
+
|
|
5709
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5710
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5711
|
+
typically set as absolute counts:
|
|
5712
|
+
|
|
5713
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5714
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5715
|
+
|
|
5716
|
+
Thresholds can be defined using one of these input schemes:
|
|
5717
|
+
|
|
5718
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5719
|
+
thresholds)
|
|
5720
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5721
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5722
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5723
|
+
'critical'
|
|
5724
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5725
|
+
for the 'warning' level only
|
|
5726
|
+
|
|
5727
|
+
Examples
|
|
5728
|
+
--------
|
|
5729
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5730
|
+
shown below:
|
|
5731
|
+
|
|
5732
|
+
```python
|
|
5733
|
+
import pointblank as pb
|
|
5734
|
+
import polars as pl
|
|
5735
|
+
|
|
5736
|
+
tbl = pl.DataFrame(
|
|
5737
|
+
{
|
|
5738
|
+
"a": [1, 2, 3, 4, 5],
|
|
5739
|
+
"b": [2, 2, 2, 2, 2],
|
|
5740
|
+
}
|
|
5741
|
+
)
|
|
5742
|
+
|
|
5743
|
+
pb.preview(tbl)
|
|
5744
|
+
```
|
|
5745
|
+
|
|
5746
|
+
Let's validate that the sum of column `a` equals `15`:
|
|
5747
|
+
|
|
5748
|
+
```python
|
|
5749
|
+
validation = (
|
|
5750
|
+
pb.Validate(data=tbl)
|
|
5751
|
+
.col_sum_eq(columns="a", value=15)
|
|
5752
|
+
.interrogate()
|
|
5753
|
+
)
|
|
5754
|
+
|
|
5755
|
+
validation
|
|
5756
|
+
```
|
|
5757
|
+
|
|
5758
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5759
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5760
|
+
|
|
5761
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5762
|
+
|
|
5763
|
+
```python
|
|
5764
|
+
validation = (
|
|
5765
|
+
pb.Validate(data=tbl)
|
|
5766
|
+
.col_sum_eq(columns=["a", "b"], value=15)
|
|
5767
|
+
.interrogate()
|
|
5768
|
+
)
|
|
5769
|
+
|
|
5770
|
+
validation
|
|
5771
|
+
```
|
|
5772
|
+
|
|
5773
|
+
Using tolerance for flexible comparisons:
|
|
5774
|
+
|
|
5775
|
+
```python
|
|
5776
|
+
validation = (
|
|
5777
|
+
pb.Validate(data=tbl)
|
|
5778
|
+
.col_sum_eq(columns="a", value=15, tol=1.0)
|
|
5779
|
+
.interrogate()
|
|
5780
|
+
)
|
|
5781
|
+
|
|
5782
|
+
validation
|
|
5783
|
+
```
|
|
5784
|
+
|
|
5785
|
+
col_avg_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5786
|
+
Does the column average satisfy a greater than comparison?
|
|
5787
|
+
|
|
5788
|
+
The `col_avg_gt()` validation method checks whether the average of values in a column
|
|
5789
|
+
is greater than a specified `value=`. This is an aggregation-based validation where the entire
|
|
5790
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
5791
|
+
comparison used in this function is `average(column) > value`.
|
|
5792
|
+
|
|
5793
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5794
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5795
|
+
the comparison) or fails completely.
|
|
5796
|
+
|
|
5797
|
+
Parameters
|
|
5798
|
+
----------
|
|
5799
|
+
columns
|
|
5800
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5801
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5802
|
+
contain numeric data for the average to be computed.
|
|
5803
|
+
value
|
|
5804
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
5805
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5806
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5807
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5808
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5809
|
+
`ref(column_name)` when reference data is set).
|
|
5810
|
+
tol
|
|
5811
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5812
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5813
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5814
|
+
`col_avg_gt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
5815
|
+
target value and still pass validation.
|
|
5816
|
+
thresholds
|
|
5817
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5818
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
5819
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5820
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5821
|
+
acceptable.
|
|
5822
|
+
brief
|
|
5823
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5824
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5825
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5826
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5827
|
+
won't be a brief.
|
|
5828
|
+
actions
|
|
5829
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5830
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5831
|
+
define the actions.
|
|
5832
|
+
active
|
|
5833
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5834
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5835
|
+
for the steps unchanged).
|
|
5836
|
+
|
|
5837
|
+
Returns
|
|
5838
|
+
-------
|
|
5839
|
+
Validate
|
|
5840
|
+
The `Validate` object with the added validation step.
|
|
5841
|
+
|
|
5842
|
+
Using Reference Data
|
|
5843
|
+
--------------------
|
|
5844
|
+
The `col_avg_gt()` method supports comparing column aggregations against reference data. This
|
|
5845
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5846
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5847
|
+
|
|
5848
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5849
|
+
|
|
5850
|
+
```python
|
|
5851
|
+
validation = (
|
|
5852
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5853
|
+
    .col_avg_gt(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
5854
|
+
.interrogate()
|
|
5855
|
+
)
|
|
5856
|
+
```
|
|
5857
|
+
|
|
5858
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5859
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5860
|
+
the `ref()` helper:
|
|
5861
|
+
|
|
5862
|
+
```python
|
|
5863
|
+
.col_avg_gt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5864
|
+
```
|
|
5865
|
+
|
|
5866
|
+
Understanding Tolerance
|
|
5867
|
+
-----------------------
|
|
5868
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5869
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5870
|
+
|
|
5871
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5872
|
+
`col_avg_gt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
5873
|
+
target value and still pass validation.
|
|
5874
|
+
|
|
5875
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5876
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5877
|
+
shifts the comparison boundary.
|
|
5878
|
+
|
|
5879
|
+
Thresholds
|
|
5880
|
+
----------
|
|
5881
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5882
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5883
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5884
|
+
|
|
5885
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5886
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5887
|
+
typically set as absolute counts:
|
|
5888
|
+
|
|
5889
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5890
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5891
|
+
|
|
5892
|
+
Thresholds can be defined using one of these input schemes:
|
|
5893
|
+
|
|
5894
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5895
|
+
thresholds)
|
|
5896
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5897
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5898
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5899
|
+
'critical'
|
|
5900
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5901
|
+
for the 'warning' level only
|
|
5902
|
+
|
|
5903
|
+
Examples
|
|
5904
|
+
--------
|
|
5905
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5906
|
+
shown below:
|
|
5907
|
+
|
|
5908
|
+
```python
|
|
5909
|
+
import pointblank as pb
|
|
5910
|
+
import polars as pl
|
|
5911
|
+
|
|
5912
|
+
tbl = pl.DataFrame(
|
|
5913
|
+
{
|
|
5914
|
+
"a": [1, 2, 3, 4, 5],
|
|
5915
|
+
"b": [2, 2, 2, 2, 2],
|
|
5916
|
+
}
|
|
5917
|
+
)
|
|
5918
|
+
|
|
5919
|
+
pb.preview(tbl)
|
|
5920
|
+
```
|
|
5921
|
+
|
|
5922
|
+
Let's validate that the average of column `a` is greater than `3`:
|
|
5923
|
+
|
|
5924
|
+
```python
|
|
5925
|
+
validation = (
|
|
5926
|
+
pb.Validate(data=tbl)
|
|
5927
|
+
.col_avg_gt(columns="a", value=3)
|
|
5928
|
+
.interrogate()
|
|
5929
|
+
)
|
|
5930
|
+
|
|
5931
|
+
validation
|
|
5932
|
+
```
|
|
5933
|
+
|
|
5934
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
5935
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5936
|
+
|
|
5937
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5938
|
+
|
|
5939
|
+
```python
|
|
5940
|
+
validation = (
|
|
5941
|
+
pb.Validate(data=tbl)
|
|
5942
|
+
.col_avg_gt(columns=["a", "b"], value=3)
|
|
5943
|
+
.interrogate()
|
|
5944
|
+
)
|
|
5945
|
+
|
|
5946
|
+
validation
|
|
5947
|
+
```
|
|
5948
|
+
|
|
5949
|
+
Using tolerance for flexible comparisons:
|
|
5950
|
+
|
|
5951
|
+
```python
|
|
5952
|
+
validation = (
|
|
5953
|
+
pb.Validate(data=tbl)
|
|
5954
|
+
.col_avg_gt(columns="a", value=3, tol=1.0)
|
|
5955
|
+
.interrogate()
|
|
5956
|
+
)
|
|
5957
|
+
|
|
5958
|
+
validation
|
|
5959
|
+
```
|
|
5960
|
+
|
|
5961
|
+
col_avg_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5962
|
+
Does the column average satisfy a less than comparison?
|
|
5963
|
+
|
|
5964
|
+
The `col_avg_lt()` validation method checks whether the average of values in a column
|
|
5965
|
+
is less than a specified `value=`. This is an aggregation-based validation where the entire
|
|
5966
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
5967
|
+
comparison used in this function is `average(column) < value`.
|
|
5968
|
+
|
|
5969
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5970
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5971
|
+
the comparison) or fails completely.
|
|
5972
|
+
|
|
5973
|
+
Parameters
|
|
5974
|
+
----------
|
|
5975
|
+
columns
|
|
5976
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5977
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5978
|
+
contain numeric data for the average to be computed.
|
|
5979
|
+
value
|
|
5980
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
5981
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5982
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5983
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5984
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5985
|
+
`ref(column_name)` when reference data is set).
|
|
5986
|
+
tol
|
|
5987
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5988
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5989
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5990
|
+
`col_avg_lt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
5991
|
+
target value and still pass validation.
|
|
5992
|
+
thresholds
|
|
5993
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5994
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
5995
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5996
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5997
|
+
acceptable.
|
|
5998
|
+
brief
|
|
5999
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6000
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6001
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6002
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6003
|
+
won't be a brief.
|
|
6004
|
+
actions
|
|
6005
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6006
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6007
|
+
define the actions.
|
|
6008
|
+
active
|
|
6009
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6010
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6011
|
+
for the steps unchanged).
|
|
6012
|
+
|
|
6013
|
+
Returns
|
|
6014
|
+
-------
|
|
6015
|
+
Validate
|
|
6016
|
+
The `Validate` object with the added validation step.
|
|
6017
|
+
|
|
6018
|
+
Using Reference Data
|
|
6019
|
+
--------------------
|
|
6020
|
+
The `col_avg_lt()` method supports comparing column aggregations against reference data. This
|
|
6021
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6022
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6023
|
+
|
|
6024
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6025
|
+
|
|
6026
|
+
```python
|
|
6027
|
+
validation = (
|
|
6028
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6029
|
+
    .col_avg_lt(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
6030
|
+
.interrogate()
|
|
6031
|
+
)
|
|
6032
|
+
```
|
|
6033
|
+
|
|
6034
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6035
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6036
|
+
the `ref()` helper:
|
|
6037
|
+
|
|
6038
|
+
```python
|
|
6039
|
+
.col_avg_lt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6040
|
+
```
|
|
6041
|
+
|
|
6042
|
+
Understanding Tolerance
|
|
6043
|
+
-----------------------
|
|
6044
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6045
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6046
|
+
|
|
6047
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6048
|
+
`col_avg_lt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6049
|
+
target value and still pass validation.
|
|
6050
|
+
|
|
6051
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6052
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6053
|
+
shifts the comparison boundary.
|
|
6054
|
+
|
|
6055
|
+
Thresholds
|
|
6056
|
+
----------
|
|
6057
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6058
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6059
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6060
|
+
|
|
6061
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6062
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6063
|
+
typically set as absolute counts:
|
|
6064
|
+
|
|
6065
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6066
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6067
|
+
|
|
6068
|
+
Thresholds can be defined using one of these input schemes:
|
|
6069
|
+
|
|
6070
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6071
|
+
thresholds)
|
|
6072
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6073
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6074
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
6075
|
+
'critical'
|
|
6076
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6077
|
+
for the 'warning' level only
|
|
6078
|
+
|
|
6079
|
+
Examples
|
|
6080
|
+
--------
|
|
6081
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6082
|
+
shown below:
|
|
6083
|
+
|
|
6084
|
+
```python
|
|
6085
|
+
import pointblank as pb
|
|
6086
|
+
import polars as pl
|
|
6087
|
+
|
|
6088
|
+
tbl = pl.DataFrame(
|
|
6089
|
+
{
|
|
6090
|
+
"a": [1, 2, 3, 4, 5],
|
|
6091
|
+
"b": [2, 2, 2, 2, 2],
|
|
6092
|
+
}
|
|
6093
|
+
)
|
|
6094
|
+
|
|
6095
|
+
pb.preview(tbl)
|
|
6096
|
+
```
|
|
6097
|
+
|
|
6098
|
+
Let's validate that the average of column `a` is less than `3`:
|
|
6099
|
+
|
|
6100
|
+
```python
|
|
6101
|
+
validation = (
|
|
6102
|
+
pb.Validate(data=tbl)
|
|
6103
|
+
.col_avg_lt(columns="a", value=3)
|
|
6104
|
+
.interrogate()
|
|
6105
|
+
)
|
|
6106
|
+
|
|
6107
|
+
validation
|
|
6108
|
+
```
|
|
6109
|
+
|
|
6110
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
6111
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6112
|
+
|
|
6113
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6114
|
+
|
|
6115
|
+
```python
|
|
6116
|
+
validation = (
|
|
6117
|
+
pb.Validate(data=tbl)
|
|
6118
|
+
.col_avg_lt(columns=["a", "b"], value=3)
|
|
6119
|
+
.interrogate()
|
|
6120
|
+
)
|
|
6121
|
+
|
|
6122
|
+
validation
|
|
6123
|
+
```
|
|
6124
|
+
|
|
6125
|
+
Using tolerance for flexible comparisons:
|
|
6126
|
+
|
|
6127
|
+
```python
|
|
6128
|
+
validation = (
|
|
6129
|
+
pb.Validate(data=tbl)
|
|
6130
|
+
.col_avg_lt(columns="a", value=3, tol=1.0)
|
|
6131
|
+
.interrogate()
|
|
6132
|
+
)
|
|
6133
|
+
|
|
6134
|
+
validation
|
|
6135
|
+
```
|
|
6136
|
+
|
|
6137
|
+
col_avg_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6138
|
+
Does the column average satisfy a greater than or equal to comparison?
|
|
6139
|
+
|
|
6140
|
+
The `col_avg_ge()` validation method checks whether the average of values in a column
|
|
6141
|
+
is at least a specified `value=`. This is an aggregation-based validation where the entire
|
|
6142
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
6143
|
+
comparison used in this function is `average(column) >= value`.
|
|
6144
|
+
|
|
6145
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6146
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6147
|
+
the comparison) or fails completely.
|
|
6148
|
+
|
|
6149
|
+
Parameters
|
|
6150
|
+
----------
|
|
6151
|
+
columns
|
|
6152
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6153
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6154
|
+
contain numeric data for the average to be computed.
|
|
6155
|
+
value
|
|
6156
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
6157
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6158
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6159
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6160
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6161
|
+
`ref(column_name)` when reference data is set).
|
|
6162
|
+
tol
|
|
6163
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6164
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6165
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6166
|
+
`col_avg_ge()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6167
|
+
target value and still pass validation.
|
|
6168
|
+
thresholds
|
|
6169
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6170
|
+
failing test units are detected. Since this is an aggregation-based validation with only
|
|
6171
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6172
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6173
|
+
acceptable.
|
|
6174
|
+
brief
|
|
6175
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6176
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6177
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6178
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6179
|
+
won't be a brief.
|
|
6180
|
+
actions
|
|
6181
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6182
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6183
|
+
define the actions.
|
|
6184
|
+
active
|
|
6185
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6186
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6187
|
+
for the steps unchanged).
|
|
6188
|
+
|
|
6189
|
+
Returns
|
|
6190
|
+
-------
|
|
6191
|
+
Validate
|
|
6192
|
+
The `Validate` object with the added validation step.
|
|
6193
|
+
|
|
6194
|
+
Using Reference Data
|
|
6195
|
+
--------------------
|
|
6196
|
+
The `col_avg_ge()` method supports comparing column aggregations against reference data. This
|
|
6197
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6198
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6199
|
+
|
|
6200
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6201
|
+
|
|
6202
|
+
```python
|
|
6203
|
+
validation = (
|
|
6204
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6205
|
+
.col_avg_ge(columns="revenue") # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
6206
|
+
.interrogate()
|
|
6207
|
+
)
|
|
6208
|
+
```
|
|
6209
|
+
|
|
6210
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6211
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6212
|
+
the `ref()` helper:
|
|
6213
|
+
|
|
6214
|
+
```python
|
|
6215
|
+
.col_avg_ge(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6216
|
+
```
|
|
6217
|
+
|
|
6218
|
+
Understanding Tolerance
|
|
6219
|
+
-----------------------
|
|
6220
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6221
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6222
|
+
|
|
6223
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6224
|
+
`col_avg_ge()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6225
|
+
target value and still pass validation.
|
|
6226
|
+
|
|
6227
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6228
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6229
|
+
shifts the comparison boundary.
|
|
6230
|
+
|
|
6231
|
+
Thresholds
|
|
6232
|
+
----------
|
|
6233
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6234
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6235
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6236
|
+
|
|
6237
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6238
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6239
|
+
typically set as absolute counts:
|
|
6240
|
+
|
|
6241
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6242
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6243
|
+
|
|
6244
|
+
Thresholds can be defined using one of these input schemes:
|
|
6245
|
+
|
|
6246
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6247
|
+
thresholds)
|
|
6248
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6249
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6250
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6251
|
+
'critical'
|
|
6252
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6253
|
+
for the 'warning' level only
|
|
6254
|
+
|
|
6255
|
+
Examples
|
|
6256
|
+
--------
|
|
6257
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6258
|
+
shown below:
|
|
6259
|
+
|
|
6260
|
+
```python
|
|
6261
|
+
import pointblank as pb
|
|
6262
|
+
import polars as pl
|
|
6263
|
+
|
|
6264
|
+
tbl = pl.DataFrame(
|
|
6265
|
+
{
|
|
6266
|
+
"a": [1, 2, 3, 4, 5],
|
|
6267
|
+
"b": [2, 2, 2, 2, 2],
|
|
6268
|
+
}
|
|
6269
|
+
)
|
|
6270
|
+
|
|
6271
|
+
pb.preview(tbl)
|
|
6272
|
+
```
|
|
6273
|
+
|
|
6274
|
+
Let's validate that the average of column `a` is at least `3`:
|
|
6275
|
+
|
|
6276
|
+
```python
|
|
6277
|
+
validation = (
|
|
6278
|
+
pb.Validate(data=tbl)
|
|
6279
|
+
.col_avg_ge(columns="a", value=3)
|
|
6280
|
+
.interrogate()
|
|
6281
|
+
)
|
|
6282
|
+
|
|
6283
|
+
validation
|
|
6284
|
+
```
|
|
6285
|
+
|
|
6286
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
6287
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6288
|
+
|
|
6289
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6290
|
+
|
|
6291
|
+
```python
|
|
6292
|
+
validation = (
|
|
6293
|
+
pb.Validate(data=tbl)
|
|
6294
|
+
.col_avg_ge(columns=["a", "b"], value=3)
|
|
6295
|
+
.interrogate()
|
|
6296
|
+
)
|
|
6297
|
+
|
|
6298
|
+
validation
|
|
6299
|
+
```
|
|
6300
|
+
|
|
6301
|
+
Using tolerance for flexible comparisons:
|
|
6302
|
+
|
|
6303
|
+
```python
|
|
6304
|
+
validation = (
|
|
6305
|
+
pb.Validate(data=tbl)
|
|
6306
|
+
.col_avg_ge(columns="a", value=3, tol=1.0)
|
|
6307
|
+
.interrogate()
|
|
6308
|
+
)
|
|
6309
|
+
|
|
6310
|
+
validation
|
|
6311
|
+
```
|
|
6312
|
+
|
|
6313
|
+
col_avg_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6314
|
+
Does the column average satisfy a less than or equal to comparison?
|
|
6315
|
+
|
|
6316
|
+
The `col_avg_le()` validation method checks whether the average of values in a column
|
|
6317
|
+
is at most a specified `value=`. This is an aggregation-based validation where the entire
|
|
6318
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
6319
|
+
comparison used in this function is `average(column) <= value`.
|
|
6320
|
+
|
|
6321
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6322
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6323
|
+
the comparison) or fails completely.
|
|
6324
|
+
|
|
6325
|
+
Parameters
|
|
6326
|
+
----------
|
|
6327
|
+
columns
|
|
6328
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6329
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6330
|
+
contain numeric data for the average to be computed.
|
|
6331
|
+
value
|
|
6332
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
6333
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6334
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6335
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6336
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6337
|
+
`ref(column_name)` when reference data is set).
|
|
6338
|
+
tol
|
|
6339
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6340
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6341
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6342
|
+
`col_avg_le()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6343
|
+
target value and still pass validation.
|
|
6344
|
+
thresholds
|
|
6345
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6346
|
+
failing test units are detected. Since this is an aggregation-based validation with only
|
|
6347
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6348
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6349
|
+
acceptable.
|
|
6350
|
+
brief
|
|
6351
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6352
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6353
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6354
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6355
|
+
won't be a brief.
|
|
6356
|
+
actions
|
|
6357
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6358
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6359
|
+
define the actions.
|
|
6360
|
+
active
|
|
6361
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6362
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6363
|
+
for the steps unchanged).
|
|
6364
|
+
|
|
6365
|
+
Returns
|
|
6366
|
+
-------
|
|
6367
|
+
Validate
|
|
6368
|
+
The `Validate` object with the added validation step.
|
|
6369
|
+
|
|
6370
|
+
Using Reference Data
|
|
6371
|
+
--------------------
|
|
6372
|
+
The `col_avg_le()` method supports comparing column aggregations against reference data. This
|
|
6373
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6374
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6375
|
+
|
|
6376
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6377
|
+
|
|
6378
|
+
```python
|
|
6379
|
+
validation = (
|
|
6380
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6381
|
+
.col_avg_le(columns="revenue") # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
6382
|
+
.interrogate()
|
|
6383
|
+
)
|
|
6384
|
+
```
|
|
6385
|
+
|
|
6386
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6387
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6388
|
+
the `ref()` helper:
|
|
6389
|
+
|
|
6390
|
+
```python
|
|
6391
|
+
.col_avg_le(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6392
|
+
```
|
|
6393
|
+
|
|
6394
|
+
Understanding Tolerance
|
|
6395
|
+
-----------------------
|
|
6396
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6397
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6398
|
+
|
|
6399
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6400
|
+
`col_avg_le()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6401
|
+
target value and still pass validation.
|
|
6402
|
+
|
|
6403
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6404
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6405
|
+
shifts the comparison boundary.
|
|
6406
|
+
|
|
6407
|
+
Thresholds
|
|
6408
|
+
----------
|
|
6409
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6410
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6411
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6412
|
+
|
|
6413
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6414
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6415
|
+
typically set as absolute counts:
|
|
6416
|
+
|
|
6417
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6418
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6419
|
+
|
|
6420
|
+
Thresholds can be defined using one of these input schemes:
|
|
6421
|
+
|
|
6422
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6423
|
+
thresholds)
|
|
6424
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6425
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6426
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6427
|
+
'critical'
|
|
6428
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6429
|
+
for the 'warning' level only
|
|
6430
|
+
|
|
6431
|
+
Examples
|
|
6432
|
+
--------
|
|
6433
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6434
|
+
shown below:
|
|
6435
|
+
|
|
6436
|
+
```python
|
|
6437
|
+
import pointblank as pb
|
|
6438
|
+
import polars as pl
|
|
6439
|
+
|
|
6440
|
+
tbl = pl.DataFrame(
|
|
6441
|
+
{
|
|
6442
|
+
"a": [1, 2, 3, 4, 5],
|
|
6443
|
+
"b": [2, 2, 2, 2, 2],
|
|
6444
|
+
}
|
|
6445
|
+
)
|
|
6446
|
+
|
|
6447
|
+
pb.preview(tbl)
|
|
6448
|
+
```
|
|
6449
|
+
|
|
6450
|
+
Let's validate that the average of column `a` is at most `3`:
|
|
6451
|
+
|
|
6452
|
+
```python
|
|
6453
|
+
validation = (
|
|
6454
|
+
pb.Validate(data=tbl)
|
|
6455
|
+
.col_avg_le(columns="a", value=3)
|
|
6456
|
+
.interrogate()
|
|
6457
|
+
)
|
|
6458
|
+
|
|
6459
|
+
validation
|
|
6460
|
+
```
|
|
6461
|
+
|
|
6462
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
6463
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6464
|
+
|
|
6465
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6466
|
+
|
|
6467
|
+
```python
|
|
6468
|
+
validation = (
|
|
6469
|
+
pb.Validate(data=tbl)
|
|
6470
|
+
.col_avg_le(columns=["a", "b"], value=3)
|
|
6471
|
+
.interrogate()
|
|
6472
|
+
)
|
|
6473
|
+
|
|
6474
|
+
validation
|
|
6475
|
+
```
|
|
6476
|
+
|
|
6477
|
+
Using tolerance for flexible comparisons:
|
|
6478
|
+
|
|
6479
|
+
```python
|
|
6480
|
+
validation = (
|
|
6481
|
+
pb.Validate(data=tbl)
|
|
6482
|
+
.col_avg_le(columns="a", value=3, tol=1.0)
|
|
6483
|
+
.interrogate()
|
|
6484
|
+
)
|
|
6485
|
+
|
|
6486
|
+
validation
|
|
6487
|
+
```
|
|
6488
|
+
|
|
6489
|
+
col_avg_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6490
|
+
Does the column average satisfy an equal to comparison?
|
|
6491
|
+
|
|
6492
|
+
The `col_avg_eq()` validation method checks whether the average of values in a column
|
|
6493
|
+
equals a specified `value=`. This is an aggregation-based validation where the entire
|
|
6494
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
6495
|
+
comparison used in this function is `average(column) == value`.
|
|
6496
|
+
|
|
6497
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6498
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6499
|
+
the comparison) or fails completely.
|
|
6500
|
+
|
|
6501
|
+
Parameters
|
|
6502
|
+
----------
|
|
6503
|
+
columns
|
|
6504
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6505
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6506
|
+
contain numeric data for the average to be computed.
|
|
6507
|
+
value
|
|
6508
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
6509
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6510
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6511
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6512
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6513
|
+
`ref(column_name)` when reference data is set).
|
|
6514
|
+
tol
|
|
6515
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6516
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6517
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_avg_eq()` since exact equality
|
|
6518
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
6519
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
6520
|
+
floating-point arithmetic.
|
|
6521
|
+
thresholds
|
|
6522
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6523
|
+
failing test units are detected. Since this is an aggregation-based validation with only
|
|
6524
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6525
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6526
|
+
acceptable.
|
|
6527
|
+
brief
|
|
6528
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6529
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6530
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6531
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6532
|
+
won't be a brief.
|
|
6533
|
+
actions
|
|
6534
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6535
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6536
|
+
define the actions.
|
|
6537
|
+
active
|
|
6538
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6539
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6540
|
+
for the steps unchanged).
|
|
6541
|
+
|
|
6542
|
+
Returns
|
|
6543
|
+
-------
|
|
6544
|
+
Validate
|
|
6545
|
+
The `Validate` object with the added validation step.
|
|
6546
|
+
|
|
6547
|
+
Using Reference Data
|
|
6548
|
+
--------------------
|
|
6549
|
+
The `col_avg_eq()` method supports comparing column aggregations against reference data. This
|
|
6550
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6551
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6552
|
+
|
|
6553
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6554
|
+
|
|
6555
|
+
```python
|
|
6556
|
+
validation = (
|
|
6557
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6558
|
+
.col_avg_eq(columns="revenue") # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
6559
|
+
.interrogate()
|
|
6560
|
+
)
|
|
6561
|
+
```
|
|
6562
|
+
|
|
6563
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6564
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6565
|
+
the `ref()` helper:
|
|
6566
|
+
|
|
6567
|
+
```python
|
|
6568
|
+
.col_avg_eq(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6569
|
+
```
|
|
6570
|
+
|
|
6571
|
+
Understanding Tolerance
|
|
6572
|
+
-----------------------
|
|
6573
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6574
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6575
|
+
|
|
6576
|
+
The `tol=` parameter is particularly useful with `col_avg_eq()` since exact equality
|
|
6577
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
6578
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
6579
|
+
floating-point arithmetic.
|
|
6580
|
+
|
|
6581
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6582
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6583
|
+
shifts the comparison boundary.
|
|
6584
|
+
|
|
6585
|
+
Thresholds
|
|
6586
|
+
----------
|
|
6587
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6588
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6589
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6590
|
+
|
|
6591
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6592
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6593
|
+
typically set as absolute counts:
|
|
6594
|
+
|
|
6595
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6596
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6597
|
+
|
|
6598
|
+
Thresholds can be defined using one of these input schemes:
|
|
6599
|
+
|
|
6600
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6601
|
+
thresholds)
|
|
6602
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6603
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6604
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6605
|
+
'critical'
|
|
6606
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6607
|
+
for the 'warning' level only
|
|
6608
|
+
|
|
6609
|
+
Examples
|
|
6610
|
+
--------
|
|
6611
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6612
|
+
shown below:
|
|
6613
|
+
|
|
6614
|
+
```python
|
|
6615
|
+
import pointblank as pb
|
|
6616
|
+
import polars as pl
|
|
6617
|
+
|
|
6618
|
+
tbl = pl.DataFrame(
|
|
6619
|
+
{
|
|
6620
|
+
"a": [1, 2, 3, 4, 5],
|
|
6621
|
+
"b": [2, 2, 2, 2, 2],
|
|
6622
|
+
}
|
|
6623
|
+
)
|
|
6624
|
+
|
|
6625
|
+
pb.preview(tbl)
|
|
6626
|
+
```
|
|
6627
|
+
|
|
6628
|
+
Let's validate that the average of column `a` equals `3`:
|
|
6629
|
+
|
|
6630
|
+
```python
|
|
6631
|
+
validation = (
|
|
6632
|
+
pb.Validate(data=tbl)
|
|
6633
|
+
.col_avg_eq(columns="a", value=3)
|
|
6634
|
+
.interrogate()
|
|
6635
|
+
)
|
|
6636
|
+
|
|
6637
|
+
validation
|
|
6638
|
+
```
|
|
6639
|
+
|
|
6640
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
6641
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6642
|
+
|
|
6643
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6644
|
+
|
|
6645
|
+
```python
|
|
6646
|
+
validation = (
|
|
6647
|
+
pb.Validate(data=tbl)
|
|
6648
|
+
.col_avg_eq(columns=["a", "b"], value=3)
|
|
6649
|
+
.interrogate()
|
|
6650
|
+
)
|
|
6651
|
+
|
|
6652
|
+
validation
|
|
6653
|
+
```
|
|
6654
|
+
|
|
6655
|
+
Using tolerance for flexible comparisons:
|
|
6656
|
+
|
|
6657
|
+
```python
|
|
6658
|
+
validation = (
|
|
6659
|
+
pb.Validate(data=tbl)
|
|
6660
|
+
.col_avg_eq(columns="a", value=3, tol=1.0)
|
|
6661
|
+
.interrogate()
|
|
6662
|
+
)
|
|
6663
|
+
|
|
6664
|
+
validation
|
|
6665
|
+
```
|
|
6666
|
+
|
|
6667
|
+
col_sd_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6668
|
+
Does the column standard deviation satisfy a greater than comparison?
|
|
6669
|
+
|
|
6670
|
+
The `col_sd_gt()` validation method checks whether the standard deviation of values in a column
|
|
6671
|
+
is greater than a specified `value=`. This is an aggregation-based validation where the entire
|
|
6672
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
6673
|
+
comparison used in this function is `standard deviation(column) > value`.
|
|
6674
|
+
|
|
6675
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6676
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6677
|
+
the comparison) or fails completely.
|
|
6678
|
+
|
|
6679
|
+
Parameters
|
|
6680
|
+
----------
|
|
6681
|
+
columns
|
|
6682
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6683
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6684
|
+
contain numeric data for the standard deviation to be computed.
|
|
6685
|
+
value
|
|
6686
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
6687
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6688
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6689
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6690
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6691
|
+
`ref(column_name)` when reference data is set).
|
|
6692
|
+
tol
|
|
6693
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6694
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6695
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6696
|
+
`col_sd_gt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
6697
|
+
target value and still pass validation.
|
|
6698
|
+
thresholds
|
|
6699
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6700
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
6701
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6702
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6703
|
+
acceptable.
|
|
6704
|
+
brief
|
|
6705
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6706
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6707
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6708
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6709
|
+
won't be a brief.
|
|
6710
|
+
actions
|
|
6711
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6712
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6713
|
+
define the actions.
|
|
6714
|
+
active
|
|
6715
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6716
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6717
|
+
for the steps unchanged).
|
|
6718
|
+
|
|
6719
|
+
Returns
|
|
6720
|
+
-------
|
|
6721
|
+
Validate
|
|
6722
|
+
The `Validate` object with the added validation step.
|
|
6723
|
+
|
|
6724
|
+
Using Reference Data
|
|
6725
|
+
--------------------
|
|
6726
|
+
The `col_sd_gt()` method supports comparing column aggregations against reference data. This
|
|
6727
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6728
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6729
|
+
|
|
6730
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6731
|
+
|
|
6732
|
+
```python
|
|
6733
|
+
validation = (
|
|
6734
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6735
|
+
.col_sd_gt(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
6736
|
+
.interrogate()
|
|
6737
|
+
)
|
|
6738
|
+
```
|
|
6739
|
+
|
|
6740
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6741
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6742
|
+
the `ref()` helper:
|
|
6743
|
+
|
|
6744
|
+
```python
|
|
6745
|
+
.col_sd_gt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6746
|
+
```
|
|
6747
|
+
|
|
6748
|
+
Understanding Tolerance
|
|
6749
|
+
-----------------------
|
|
6750
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6751
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6752
|
+
|
|
6753
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6754
|
+
`col_sd_gt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
6755
|
+
target value and still pass validation.
|
|
6756
|
+
|
|
6757
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6758
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6759
|
+
shifts the comparison boundary.
|
|
6760
|
+
|
|
6761
|
+
Thresholds
|
|
6762
|
+
----------
|
|
6763
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6764
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6765
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6766
|
+
|
|
6767
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6768
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6769
|
+
typically set as absolute counts:
|
|
6770
|
+
|
|
6771
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6772
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6773
|
+
|
|
6774
|
+
Thresholds can be defined using one of these input schemes:
|
|
6775
|
+
|
|
6776
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6777
|
+
thresholds)
|
|
6778
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6779
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6780
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
6781
|
+
'critical'
|
|
6782
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6783
|
+
for the 'warning' level only
|
|
6784
|
+
|
|
6785
|
+
Examples
|
|
6786
|
+
--------
|
|
6787
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6788
|
+
shown below:
|
|
6789
|
+
|
|
6790
|
+
```python
|
|
6791
|
+
import pointblank as pb
|
|
6792
|
+
import polars as pl
|
|
6793
|
+
|
|
6794
|
+
tbl = pl.DataFrame(
|
|
6795
|
+
{
|
|
6796
|
+
"a": [1, 2, 3, 4, 5],
|
|
6797
|
+
"b": [2, 2, 2, 2, 2],
|
|
6798
|
+
}
|
|
6799
|
+
)
|
|
6800
|
+
|
|
6801
|
+
pb.preview(tbl)
|
|
6802
|
+
```
|
|
6803
|
+
|
|
6804
|
+
Let's validate that the standard deviation of column `a` is greater than `2`:
|
|
6805
|
+
|
|
6806
|
+
```python
|
|
6807
|
+
validation = (
|
|
6808
|
+
pb.Validate(data=tbl)
|
|
6809
|
+
.col_sd_gt(columns="a", value=2)
|
|
6810
|
+
.interrogate()
|
|
6811
|
+
)
|
|
6812
|
+
|
|
6813
|
+
validation
|
|
6814
|
+
```
|
|
6815
|
+
|
|
6816
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
6817
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6818
|
+
|
|
6819
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6820
|
+
|
|
6821
|
+
```python
|
|
6822
|
+
validation = (
|
|
6823
|
+
pb.Validate(data=tbl)
|
|
6824
|
+
.col_sd_gt(columns=["a", "b"], value=2)
|
|
6825
|
+
.interrogate()
|
|
6826
|
+
)
|
|
6827
|
+
|
|
6828
|
+
validation
|
|
6829
|
+
```
|
|
6830
|
+
|
|
6831
|
+
Using tolerance for flexible comparisons:
|
|
6832
|
+
|
|
6833
|
+
```python
|
|
6834
|
+
validation = (
|
|
6835
|
+
pb.Validate(data=tbl)
|
|
6836
|
+
.col_sd_gt(columns="a", value=2, tol=1.0)
|
|
6837
|
+
.interrogate()
|
|
6838
|
+
)
|
|
6839
|
+
|
|
6840
|
+
validation
|
|
6841
|
+
```
|
|
6842
|
+
|
|
6843
|
+
col_sd_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6844
|
+
Does the column standard deviation satisfy a less than comparison?
|
|
6845
|
+
|
|
6846
|
+
The `col_sd_lt()` validation method checks whether the standard deviation of values in a column
|
|
6847
|
+
is less than a specified `value=`. This is an aggregation-based validation where the entire
|
|
6848
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
6849
|
+
comparison used in this function is `standard deviation(column) < value`.
|
|
6850
|
+
|
|
6851
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6852
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6853
|
+
the comparison) or fails completely.
|
|
6854
|
+
|
|
6855
|
+
Parameters
|
|
6856
|
+
----------
|
|
6857
|
+
columns
|
|
6858
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6859
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6860
|
+
contain numeric data for the standard deviation to be computed.
|
|
6861
|
+
value
|
|
6862
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
6863
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6864
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6865
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6866
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6867
|
+
`ref(column_name)` when reference data is set).
|
|
6868
|
+
tol
|
|
6869
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6870
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6871
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6872
|
+
`col_sd_lt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
6873
|
+
target value and still pass validation.
|
|
6874
|
+
thresholds
|
|
6875
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6876
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
6877
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6878
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6879
|
+
acceptable.
|
|
6880
|
+
brief
|
|
6881
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6882
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6883
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6884
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6885
|
+
won't be a brief.
|
|
6886
|
+
actions
|
|
6887
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6888
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6889
|
+
define the actions.
|
|
6890
|
+
active
|
|
6891
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6892
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6893
|
+
for the steps unchanged).
|
|
6894
|
+
|
|
6895
|
+
Returns
|
|
6896
|
+
-------
|
|
6897
|
+
Validate
|
|
6898
|
+
The `Validate` object with the added validation step.
|
|
6899
|
+
|
|
6900
|
+
Using Reference Data
|
|
6901
|
+
--------------------
|
|
6902
|
+
The `col_sd_lt()` method supports comparing column aggregations against reference data. This
|
|
6903
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6904
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6905
|
+
|
|
6906
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6907
|
+
|
|
6908
|
+
```python
|
|
6909
|
+
validation = (
|
|
6910
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6911
|
+
.col_sd_lt(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
6912
|
+
.interrogate()
|
|
6913
|
+
)
|
|
6914
|
+
```
|
|
6915
|
+
|
|
6916
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6917
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6918
|
+
the `ref()` helper:
|
|
6919
|
+
|
|
6920
|
+
```python
|
|
6921
|
+
.col_sd_lt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6922
|
+
```
|
|
6923
|
+
|
|
6924
|
+
Understanding Tolerance
|
|
6925
|
+
-----------------------
|
|
6926
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6927
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6928
|
+
|
|
6929
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6930
|
+
`col_sd_lt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
6931
|
+
target value and still pass validation.
|
|
6932
|
+
|
|
6933
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6934
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6935
|
+
shifts the comparison boundary.
|
|
6936
|
+
|
|
6937
|
+
Thresholds
|
|
6938
|
+
----------
|
|
6939
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6940
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6941
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6942
|
+
|
|
6943
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6944
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6945
|
+
typically set as absolute counts:
|
|
6946
|
+
|
|
6947
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6948
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6949
|
+
|
|
6950
|
+
Thresholds can be defined using one of these input schemes:
|
|
6951
|
+
|
|
6952
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6953
|
+
thresholds)
|
|
6954
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6955
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6956
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
6957
|
+
'critical'
|
|
6958
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6959
|
+
for the 'warning' level only
|
|
6960
|
+
|
|
6961
|
+
Examples
|
|
6962
|
+
--------
|
|
6963
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6964
|
+
shown below:
|
|
6965
|
+
|
|
6966
|
+
```python
|
|
6967
|
+
import pointblank as pb
|
|
6968
|
+
import polars as pl
|
|
6969
|
+
|
|
6970
|
+
tbl = pl.DataFrame(
|
|
6971
|
+
{
|
|
6972
|
+
"a": [1, 2, 3, 4, 5],
|
|
6973
|
+
"b": [2, 2, 2, 2, 2],
|
|
6974
|
+
}
|
|
6975
|
+
)
|
|
6976
|
+
|
|
6977
|
+
pb.preview(tbl)
|
|
6978
|
+
```
|
|
6979
|
+
|
|
6980
|
+
Let's validate that the standard deviation of column `a` is less than `2`:
|
|
6981
|
+
|
|
6982
|
+
```python
|
|
6983
|
+
validation = (
|
|
6984
|
+
pb.Validate(data=tbl)
|
|
6985
|
+
.col_sd_lt(columns="a", value=2)
|
|
6986
|
+
.interrogate()
|
|
6987
|
+
)
|
|
6988
|
+
|
|
6989
|
+
validation
|
|
6990
|
+
```
|
|
6991
|
+
|
|
6992
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
6993
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6994
|
+
|
|
6995
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6996
|
+
|
|
6997
|
+
```python
|
|
6998
|
+
validation = (
|
|
6999
|
+
pb.Validate(data=tbl)
|
|
7000
|
+
.col_sd_lt(columns=["a", "b"], value=2)
|
|
7001
|
+
.interrogate()
|
|
7002
|
+
)
|
|
7003
|
+
|
|
7004
|
+
validation
|
|
7005
|
+
```
|
|
7006
|
+
|
|
7007
|
+
Using tolerance for flexible comparisons:
|
|
7008
|
+
|
|
7009
|
+
```python
|
|
7010
|
+
validation = (
|
|
7011
|
+
pb.Validate(data=tbl)
|
|
7012
|
+
.col_sd_lt(columns="a", value=2, tol=1.0)
|
|
7013
|
+
.interrogate()
|
|
7014
|
+
)
|
|
7015
|
+
|
|
7016
|
+
validation
|
|
7017
|
+
```
|
|
7018
|
+
|
|
7019
|
+
col_sd_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7020
|
+
Does the column standard deviation satisfy a greater than or equal to comparison?
|
|
7021
|
+
|
|
7022
|
+
The `col_sd_ge()` validation method checks whether the standard deviation of values in a column
|
|
7023
|
+
is at least a specified `value=`. This is an aggregation-based validation where the entire
|
|
7024
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
7025
|
+
comparison used in this function is `standard deviation(column) >= value`.
|
|
7026
|
+
|
|
7027
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
7028
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
7029
|
+
the comparison) or fails completely.
|
|
7030
|
+
|
|
7031
|
+
Parameters
|
|
7032
|
+
----------
|
|
7033
|
+
columns
|
|
7034
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
7035
|
+
there will be a separate validation step generated for each column. The columns must
|
|
7036
|
+
contain numeric data for the standard deviation to be computed.
|
|
7037
|
+
value
|
|
7038
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
7039
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
7040
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
7041
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
7042
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
7043
|
+
`ref(column_name)` when reference data is set).
|
|
7044
|
+
tol
|
|
7045
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
7046
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
7047
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
7048
|
+
`col_sd_ge()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
7049
|
+
target value and still pass validation.
|
|
7050
|
+
thresholds
|
|
7051
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
7052
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
7053
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
7054
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
7055
|
+
acceptable.
|
|
7056
|
+
brief
|
|
7057
|
+
An optional brief description of the validation step that will be displayed in the
|
|
7058
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
7059
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
7060
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
7061
|
+
won't be a brief.
|
|
7062
|
+
actions
|
|
7063
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
7064
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
7065
|
+
define the actions.
|
|
7066
|
+
active
|
|
7067
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
7068
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
7069
|
+
for the steps unchanged).
|
|
7070
|
+
|
|
7071
|
+
Returns
|
|
7072
|
+
-------
|
|
7073
|
+
Validate
|
|
7074
|
+
The `Validate` object with the added validation step.
|
|
7075
|
+
|
|
7076
|
+
Using Reference Data
|
|
7077
|
+
--------------------
|
|
7078
|
+
The `col_sd_ge()` method supports comparing column aggregations against reference data. This
|
|
7079
|
+
is useful for validating that statistical properties remain consistent across different
|
|
7080
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
7081
|
+
|
|
7082
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
7083
|
+
|
|
7084
|
+
```python
|
|
7085
|
+
validation = (
|
|
7086
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
7087
|
+
.col_sd_ge(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
7088
|
+
.interrogate()
|
|
7089
|
+
)
|
|
7090
|
+
```
|
|
7091
|
+
|
|
7092
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
7093
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
7094
|
+
the `ref()` helper:
|
|
7095
|
+
|
|
7096
|
+
```python
|
|
7097
|
+
.col_sd_ge(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
7098
|
+
```
|
|
7099
|
+
|
|
7100
|
+
Understanding Tolerance
|
|
7101
|
+
-----------------------
|
|
7102
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
7103
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
7104
|
+
|
|
7105
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
7106
|
+
`col_sd_ge()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
7107
|
+
target value and still pass validation.
|
|
7108
|
+
|
|
7109
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
7110
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
7111
|
+
shifts the comparison boundary.
|
|
7112
|
+
|
|
7113
|
+
Thresholds
|
|
7114
|
+
----------
|
|
7115
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
7116
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
7117
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
7118
|
+
|
|
7119
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
7120
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
7121
|
+
typically set as absolute counts:
|
|
7122
|
+
|
|
7123
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
7124
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
7125
|
+
|
|
7126
|
+
Thresholds can be defined using one of these input schemes:
|
|
7127
|
+
|
|
7128
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
7129
|
+
thresholds)
|
|
7130
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
7131
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
7132
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
7133
|
+
'critical'
|
|
7134
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
7135
|
+
for the 'warning' level only
|
|
7136
|
+
|
|
7137
|
+
Examples
|
|
7138
|
+
--------
|
|
7139
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
7140
|
+
shown below:
|
|
7141
|
+
|
|
7142
|
+
```python
|
|
7143
|
+
import pointblank as pb
|
|
7144
|
+
import polars as pl
|
|
7145
|
+
|
|
7146
|
+
tbl = pl.DataFrame(
|
|
7147
|
+
{
|
|
7148
|
+
"a": [1, 2, 3, 4, 5],
|
|
7149
|
+
"b": [2, 2, 2, 2, 2],
|
|
7150
|
+
}
|
|
7151
|
+
)
|
|
7152
|
+
|
|
7153
|
+
pb.preview(tbl)
|
|
7154
|
+
```
|
|
7155
|
+
|
|
7156
|
+
Let's validate that the standard deviation of column `a` is at least `2`:
|
|
7157
|
+
|
|
7158
|
+
```python
|
|
7159
|
+
validation = (
|
|
7160
|
+
pb.Validate(data=tbl)
|
|
7161
|
+
.col_sd_ge(columns="a", value=2)
|
|
7162
|
+
.interrogate()
|
|
7163
|
+
)
|
|
7164
|
+
|
|
7165
|
+
validation
|
|
7166
|
+
```
|
|
7167
|
+
|
|
7168
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
7169
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
7170
|
+
|
|
7171
|
+
When validating multiple columns, each column gets its own validation step:
|
|
7172
|
+
|
|
7173
|
+
```python
|
|
7174
|
+
validation = (
|
|
7175
|
+
pb.Validate(data=tbl)
|
|
7176
|
+
.col_sd_ge(columns=["a", "b"], value=2)
|
|
7177
|
+
.interrogate()
|
|
7178
|
+
)
|
|
7179
|
+
|
|
7180
|
+
validation
|
|
7181
|
+
```
|
|
7182
|
+
|
|
7183
|
+
Using tolerance for flexible comparisons:
|
|
7184
|
+
|
|
7185
|
+
```python
|
|
7186
|
+
validation = (
|
|
7187
|
+
pb.Validate(data=tbl)
|
|
7188
|
+
.col_sd_ge(columns="a", value=2, tol=1.0)
|
|
7189
|
+
.interrogate()
|
|
7190
|
+
)
|
|
7191
|
+
|
|
7192
|
+
validation
|
|
7193
|
+
```
|
|
7194
|
+
|
|
7195
|
+
col_sd_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7196
|
+
Does the column standard deviation satisfy a less than or equal to comparison?
|
|
7197
|
+
|
|
7198
|
+
The `col_sd_le()` validation method checks whether the standard deviation of values in a column
|
|
7199
|
+
is at most a specified `value=`. This is an aggregation-based validation where the entire
|
|
7200
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
7201
|
+
comparison used in this function is `standard deviation(column) <= value`.
|
|
7202
|
+
|
|
7203
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
7204
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
7205
|
+
the comparison) or fails completely.
|
|
7206
|
+
|
|
7207
|
+
Parameters
|
|
7208
|
+
----------
|
|
7209
|
+
columns
|
|
7210
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
7211
|
+
there will be a separate validation step generated for each column. The columns must
|
|
7212
|
+
contain numeric data for the standard deviation to be computed.
|
|
7213
|
+
value
|
|
7214
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
7215
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
7216
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
7217
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
7218
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
7219
|
+
`ref(column_name)` when reference data is set).
|
|
7220
|
+
tol
|
|
7221
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
7222
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
7223
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
7224
|
+
`col_sd_le()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
7225
|
+
target value and still pass validation.
|
|
7226
|
+
thresholds
|
|
7227
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
7228
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
7229
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
7230
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
7231
|
+
acceptable.
|
|
7232
|
+
brief
|
|
7233
|
+
An optional brief description of the validation step that will be displayed in the
|
|
7234
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
7235
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
7236
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
7237
|
+
won't be a brief.
|
|
7238
|
+
actions
|
|
7239
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
7240
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
7241
|
+
define the actions.
|
|
7242
|
+
active
|
|
7243
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
7244
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
7245
|
+
for the steps unchanged).
|
|
7246
|
+
|
|
7247
|
+
Returns
|
|
7248
|
+
-------
|
|
7249
|
+
Validate
|
|
7250
|
+
The `Validate` object with the added validation step.
|
|
7251
|
+
|
|
7252
|
+
Using Reference Data
|
|
7253
|
+
--------------------
|
|
7254
|
+
The `col_sd_le()` method supports comparing column aggregations against reference data. This
|
|
7255
|
+
is useful for validating that statistical properties remain consistent across different
|
|
7256
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
7257
|
+
|
|
7258
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
7259
|
+
|
|
7260
|
+
```python
|
|
7261
|
+
validation = (
|
|
7262
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
7263
|
+
.col_sd_le(columns="revenue")  # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
7264
|
+
.interrogate()
|
|
7265
|
+
)
|
|
7266
|
+
```
|
|
7267
|
+
|
|
7268
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
7269
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
7270
|
+
the `ref()` helper:
|
|
7271
|
+
|
|
7272
|
+
```python
|
|
7273
|
+
.col_sd_le(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
7274
|
+
```
|
|
7275
|
+
|
|
7276
|
+
Understanding Tolerance
|
|
7277
|
+
-----------------------
|
|
7278
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
7279
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
7280
|
+
|
|
7281
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
7282
|
+
`col_sd_le()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
7283
|
+
target value and still pass validation.
|
|
7284
|
+
|
|
7285
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
7286
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
7287
|
+
shifts the comparison boundary.
|
|
7288
|
+
|
|
7289
|
+
Thresholds
|
|
7290
|
+
----------
|
|
7291
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
7292
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
7293
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
7294
|
+
|
|
7295
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
7296
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
7297
|
+
typically set as absolute counts:
|
|
7298
|
+
|
|
7299
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
7300
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
7301
|
+
|
|
7302
|
+
Thresholds can be defined using one of these input schemes:
|
|
7303
|
+
|
|
7304
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
7305
|
+
thresholds)
|
|
7306
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
7307
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
7308
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
7309
|
+
'critical'
|
|
7310
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
7311
|
+
for the 'warning' level only
|
|
7312
|
+
|
|
7313
|
+
Examples
|
|
7314
|
+
--------
|
|
7315
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
7316
|
+
shown below:
|
|
7317
|
+
|
|
7318
|
+
```python
|
|
7319
|
+
import pointblank as pb
|
|
7320
|
+
import polars as pl
|
|
7321
|
+
|
|
7322
|
+
tbl = pl.DataFrame(
|
|
7323
|
+
{
|
|
7324
|
+
"a": [1, 2, 3, 4, 5],
|
|
7325
|
+
"b": [2, 2, 2, 2, 2],
|
|
7326
|
+
}
|
|
7327
|
+
)
|
|
7328
|
+
|
|
7329
|
+
pb.preview(tbl)
|
|
7330
|
+
```
|
|
7331
|
+
|
|
7332
|
+
Let's validate that the standard deviation of column `a` is at most `2`:
|
|
7333
|
+
|
|
7334
|
+
```python
|
|
7335
|
+
validation = (
|
|
7336
|
+
pb.Validate(data=tbl)
|
|
7337
|
+
.col_sd_le(columns="a", value=2)
|
|
7338
|
+
.interrogate()
|
|
7339
|
+
)
|
|
7340
|
+
|
|
7341
|
+
validation
|
|
7342
|
+
```
|
|
7343
|
+
|
|
7344
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
7345
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
7346
|
+
|
|
7347
|
+
When validating multiple columns, each column gets its own validation step:
|
|
7348
|
+
|
|
7349
|
+
```python
|
|
7350
|
+
validation = (
|
|
7351
|
+
pb.Validate(data=tbl)
|
|
7352
|
+
.col_sd_le(columns=["a", "b"], value=2)
|
|
7353
|
+
.interrogate()
|
|
7354
|
+
)
|
|
7355
|
+
|
|
7356
|
+
validation
|
|
7357
|
+
```
|
|
7358
|
+
|
|
7359
|
+
Using tolerance for flexible comparisons:
|
|
7360
|
+
|
|
7361
|
+
```python
|
|
7362
|
+
validation = (
|
|
7363
|
+
pb.Validate(data=tbl)
|
|
7364
|
+
.col_sd_le(columns="a", value=2, tol=1.0)
|
|
7365
|
+
.interrogate()
|
|
7366
|
+
)
|
|
7367
|
+
|
|
7368
|
+
validation
|
|
7369
|
+
```
|
|
7370
|
+
|
|
7371
|
+
col_sd_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7372
|
+
Does the column standard deviation satisfy an equal to comparison?
|
|
7373
|
+
|
|
7374
|
+
The `col_sd_eq()` validation method checks whether the standard deviation of values in a column
|
|
7375
|
+
equals a specified `value=`. This is an aggregation-based validation where the entire
|
|
7376
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
7377
|
+
comparison used in this function is `standard deviation(column) == value`.
|
|
7378
|
+
|
|
7379
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
7380
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
7381
|
+
the comparison) or fails completely.
|
|
7382
|
+
|
|
7383
|
+
Parameters
|
|
7384
|
+
----------
|
|
7385
|
+
columns
|
|
7386
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
7387
|
+
there will be a separate validation step generated for each column. The columns must
|
|
7388
|
+
contain numeric data for the standard deviation to be computed.
|
|
7389
|
+
value
|
|
7390
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
7391
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
7392
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
7393
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
7394
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
7395
|
+
`ref(column_name)` when reference data is set).
|
|
7396
|
+
tol
|
|
7397
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
7398
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
7399
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_sd_eq()` since exact equality
|
|
7400
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
7401
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
7402
|
+
floating-point arithmetic.
|
|
7403
|
+
thresholds
|
|
7404
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
7405
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
7406
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
7407
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
7408
|
+
acceptable.
|
|
7409
|
+
brief
|
|
7410
|
+
An optional brief description of the validation step that will be displayed in the
|
|
7411
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
7412
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
7413
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
7414
|
+
won't be a brief.
|
|
7415
|
+
actions
|
|
7416
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
7417
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
7418
|
+
define the actions.
|
|
7419
|
+
active
|
|
7420
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
7421
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
7422
|
+
for the steps unchanged).
|
|
7423
|
+
|
|
7424
|
+
Returns
|
|
7425
|
+
-------
|
|
7426
|
+
Validate
|
|
7427
|
+
The `Validate` object with the added validation step.
|
|
7428
|
+
|
|
7429
|
+
Using Reference Data
|
|
7430
|
+
--------------------
|
|
7431
|
+
The `col_sd_eq()` method supports comparing column aggregations against reference data. This
|
|
7432
|
+
is useful for validating that statistical properties remain consistent across different
|
|
7433
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
7434
|
+
|
|
7435
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
7436
|
+
|
|
7437
|
+
```python
|
|
7438
|
+
validation = (
|
|
7439
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
7440
|
+
.col_sd_eq(columns="revenue")  # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
7441
|
+
.interrogate()
|
|
7442
|
+
)
|
|
7443
|
+
```
|
|
7444
|
+
|
|
7445
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
7446
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
7447
|
+
the `ref()` helper:
|
|
7448
|
+
|
|
7449
|
+
```python
|
|
7450
|
+
.col_sd_eq(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
7451
|
+
```
|
|
7452
|
+
|
|
7453
|
+
Understanding Tolerance
|
|
7454
|
+
-----------------------
|
|
7455
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
7456
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
7457
|
+
|
|
7458
|
+
The `tol=` parameter is particularly useful with `col_sd_eq()` since exact equality
|
|
7459
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
7460
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
7461
|
+
floating-point arithmetic.
|
|
7462
|
+
|
|
7463
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
7464
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
7465
|
+
shifts the comparison boundary.
|
|
7466
|
+
|
|
7467
|
+
Thresholds
|
|
7468
|
+
----------
|
|
7469
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
7470
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
7471
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
7472
|
+
|
|
7473
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
7474
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
7475
|
+
typically set as absolute counts:
|
|
7476
|
+
|
|
7477
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
7478
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
7479
|
+
|
|
7480
|
+
Thresholds can be defined using one of these input schemes:
|
|
7481
|
+
|
|
7482
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
7483
|
+
thresholds)
|
|
7484
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
7485
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
7486
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
7487
|
+
'critical'
|
|
7488
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
7489
|
+
for the 'warning' level only
|
|
7490
|
+
|
|
7491
|
+
Examples
|
|
7492
|
+
--------
|
|
7493
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
7494
|
+
shown below:
|
|
7495
|
+
|
|
7496
|
+
```python
|
|
7497
|
+
import pointblank as pb
|
|
7498
|
+
import polars as pl
|
|
7499
|
+
|
|
7500
|
+
tbl = pl.DataFrame(
|
|
7501
|
+
{
|
|
7502
|
+
"a": [1, 2, 3, 4, 5],
|
|
7503
|
+
"b": [2, 2, 2, 2, 2],
|
|
7504
|
+
}
|
|
7505
|
+
)
|
|
7506
|
+
|
|
7507
|
+
pb.preview(tbl)
|
|
7508
|
+
```
|
|
7509
|
+
|
|
7510
|
+
Let's validate that the standard deviation of column `a` equals `2`:
|
|
7511
|
+
|
|
7512
|
+
```python
|
|
7513
|
+
validation = (
|
|
7514
|
+
pb.Validate(data=tbl)
|
|
7515
|
+
.col_sd_eq(columns="a", value=2)
|
|
7516
|
+
.interrogate()
|
|
7517
|
+
)
|
|
7518
|
+
|
|
7519
|
+
validation
|
|
7520
|
+
```
|
|
7521
|
+
|
|
7522
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
7523
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
7524
|
+
|
|
7525
|
+
When validating multiple columns, each column gets its own validation step:
|
|
7526
|
+
|
|
7527
|
+
```python
|
|
7528
|
+
validation = (
|
|
7529
|
+
pb.Validate(data=tbl)
|
|
7530
|
+
.col_sd_eq(columns=["a", "b"], value=2)
|
|
7531
|
+
.interrogate()
|
|
7532
|
+
)
|
|
7533
|
+
|
|
7534
|
+
validation
|
|
7535
|
+
```
|
|
7536
|
+
|
|
7537
|
+
Using tolerance for flexible comparisons:
|
|
7538
|
+
|
|
7539
|
+
```python
|
|
7540
|
+
validation = (
|
|
7541
|
+
pb.Validate(data=tbl)
|
|
7542
|
+
.col_sd_eq(columns="a", value=2, tol=1.0)
|
|
7543
|
+
.interrogate()
|
|
7544
|
+
)
|
|
7545
|
+
|
|
7546
|
+
validation
|
|
7547
|
+
```
|
|
7548
|
+
|
|
7549
|
+
rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4904
7550
|
|
|
4905
7551
|
Validate whether rows in the table are distinct.
|
|
4906
7552
|
|
|
@@ -5090,7 +7736,7 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
5090
7736
|
others.
|
|
5091
7737
|
|
|
5092
7738
|
|
|
5093
|
-
rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7739
|
+
rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5094
7740
|
|
|
5095
7741
|
Validate whether row data are complete by having no missing values.
|
|
5096
7742
|
|
|
@@ -5280,7 +7926,7 @@ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
5280
7926
|
others.
|
|
5281
7927
|
|
|
5282
7928
|
|
|
5283
|
-
col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7929
|
+
col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5284
7930
|
|
|
5285
7931
|
Validate whether one or more columns exist in the table.
|
|
5286
7932
|
|
|
@@ -5632,7 +8278,7 @@ col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnS
|
|
|
5632
8278
|
```python
|
|
5633
8279
|
validation = (
|
|
5634
8280
|
pb.Validate(data=tbl)
|
|
5635
|
-
.col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3) # Expect 3 Nulls, allow -10%/+30%
|
|
8281
|
+
.col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30%
|
|
5636
8282
|
.interrogate()
|
|
5637
8283
|
)
|
|
5638
8284
|
|
|
@@ -5643,7 +8289,7 @@ col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnS
|
|
|
5643
8289
|
calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
|
|
5644
8290
|
|
|
5645
8291
|
|
|
5646
|
-
col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
8292
|
+
col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5647
8293
|
|
|
5648
8294
|
Do columns in the table (and their types) match a predefined schema?
|
|
5649
8295
|
|
|
@@ -5803,7 +8449,7 @@ col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'boo
|
|
|
5803
8449
|
since the table columns and their types match the schema.
|
|
5804
8450
|
|
|
5805
8451
|
|
|
5806
|
-
row_count_match(self, count: 'int |
|
|
8452
|
+
row_count_match(self, count: 'int | Any', tol: 'Tolerance' = 0, inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5807
8453
|
|
|
5808
8454
|
Validate whether the row count of the table matches a specified count.
|
|
5809
8455
|
|
|
@@ -5957,7 +8603,7 @@ row_count_match(self, count: 'int | FrameT | Any', tol: 'Tolerance' = 0, inverse
|
|
|
5957
8603
|
|
|
5958
8604
|
|
|
5959
8605
|
|
|
5960
|
-
col_count_match(self, count: 'int |
|
|
8606
|
+
col_count_match(self, count: 'int | Any', inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5961
8607
|
|
|
5962
8608
|
Validate whether the column count of the table matches a specified count.
|
|
5963
8609
|
|
|
@@ -6072,7 +8718,7 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
|
|
|
6072
8718
|
columns in the target table. So, the single test unit passed.
|
|
6073
8719
|
|
|
6074
8720
|
|
|
6075
|
-
tbl_match(self, tbl_compare: '
|
|
8721
|
+
tbl_match(self, tbl_compare: 'Any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6076
8722
|
|
|
6077
8723
|
Validate whether the target table matches a comparison table.
|
|
6078
8724
|
|
|
@@ -6295,7 +8941,7 @@ tbl_match(self, tbl_compare: 'FrameT | Any', pre: 'Callable | None' = None, thre
|
|
|
6295
8941
|
(one value is different in column `c`).
|
|
6296
8942
|
|
|
6297
8943
|
|
|
6298
|
-
conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
8944
|
+
conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6299
8945
|
|
|
6300
8946
|
Perform multiple row-wise validations for joint validity.
|
|
6301
8947
|
|
|
@@ -6494,7 +9140,7 @@ conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds:
|
|
|
6494
9140
|
information on how to use it with different table backends.
|
|
6495
9141
|
|
|
6496
9142
|
|
|
6497
|
-
specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
9143
|
+
specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6498
9144
|
|
|
6499
9145
|
Perform a specialized validation with customized logic.
|
|
6500
9146
|
|
|
@@ -6794,7 +9440,7 @@ specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'in
|
|
|
6794
9440
|
virtually any data quality requirement in your organization.
|
|
6795
9441
|
|
|
6796
9442
|
|
|
6797
|
-
prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
9443
|
+
prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6798
9444
|
|
|
6799
9445
|
Validate rows using AI/LLM-powered analysis.
|
|
6800
9446
|
|
|
@@ -7115,7 +9761,7 @@ many steps). Furthermore, the `col()` function can be used to declare a comparis
|
|
|
7115
9761
|
for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
|
|
7116
9762
|
for comparison.
|
|
7117
9763
|
|
|
7118
|
-
col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals') -> 'Column | ColumnLiteral | ColumnSelectorNarwhals'
|
|
9764
|
+
col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals | nw.selectors.Selector') -> 'Column | ColumnLiteral | ColumnSelectorNarwhals'
|
|
7119
9765
|
|
|
7120
9766
|
Helper function for referencing a column in the input table.
|
|
7121
9767
|
|
|
@@ -8735,7 +11381,7 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
|
|
|
8735
11381
|
`get_first_n=10`.
|
|
8736
11382
|
|
|
8737
11383
|
|
|
8738
|
-
set_tbl(self, tbl: '
|
|
11384
|
+
set_tbl(self, tbl: 'Any', tbl_name: 'str | None' = None, label: 'str | None' = None) -> 'Validate'
|
|
8739
11385
|
|
|
8740
11386
|
Set or replace the table associated with the Validate object.
|
|
8741
11387
|
|
|
@@ -8837,7 +11483,7 @@ set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str |
|
|
|
8837
11483
|
```
|
|
8838
11484
|
|
|
8839
11485
|
|
|
8840
|
-
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None, incl_footer_timings: 'bool' = None, incl_footer_notes: 'bool' = None) -> 'GT'
|
|
11486
|
+
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool | None' = None, incl_footer: 'bool | None' = None, incl_footer_timings: 'bool | None' = None, incl_footer_notes: 'bool | None' = None) -> 'GT'
|
|
8841
11487
|
|
|
8842
11488
|
Validation report as a GT table.
|
|
8843
11489
|
|
|
@@ -9210,7 +11856,7 @@ get_json_report(self, use_fields: 'list[str] | None' = None, exclude_fields: 'li
|
|
|
9210
11856
|
failed validation
|
|
9211
11857
|
|
|
9212
11858
|
|
|
9213
|
-
get_sundered_data(self, type='pass') -> '
|
|
11859
|
+
get_sundered_data(self, type='pass') -> 'Any'
|
|
9214
11860
|
|
|
9215
11861
|
Get the data that passed or failed the validation steps.
|
|
9216
11862
|
|
|
@@ -9246,7 +11892,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
|
|
|
9246
11892
|
|
|
9247
11893
|
Returns
|
|
9248
11894
|
-------
|
|
9249
|
-
|
|
11895
|
+
Any
|
|
9250
11896
|
A table containing the data that passed or failed the validation steps.
|
|
9251
11897
|
|
|
9252
11898
|
Examples
|
|
@@ -9291,7 +11937,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
|
|
|
9291
11937
|
that's what we see in the returned DataFrame.
|
|
9292
11938
|
|
|
9293
11939
|
|
|
9294
|
-
get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = False) -> 'dict[int,
|
|
11940
|
+
get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = False) -> 'dict[int, Any] | Any'
|
|
9295
11941
|
|
|
9296
11942
|
Get the rows that failed for each validation step.
|
|
9297
11943
|
|
|
@@ -9314,7 +11960,7 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
|
|
|
9314
11960
|
|
|
9315
11961
|
Returns
|
|
9316
11962
|
-------
|
|
9317
|
-
dict[int,
|
|
11963
|
+
dict[int, Any] | Any
|
|
9318
11964
|
A dictionary of tables containing the rows that failed in every compatible validation
|
|
9319
11965
|
step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
|
|
9320
11966
|
|
|
@@ -10471,7 +13117,7 @@ datasets included in the package can be accessed via the `load_dataset()` functi
|
|
|
10471
13117
|
`config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
|
|
10472
13118
|
the `assistant()` function to get help with Pointblank.
|
|
10473
13119
|
|
|
10474
|
-
DataScan(data: '
|
|
13120
|
+
DataScan(data: 'Any', tbl_name: 'str | None' = None) -> 'None'
|
|
10475
13121
|
|
|
10476
13122
|
Get a summary of a dataset.
|
|
10477
13123
|
|
|
@@ -10567,7 +13213,7 @@ DataScan(data: 'IntoFrameT', tbl_name: 'str | None' = None) -> 'None'
|
|
|
10567
13213
|
A DataScan object.
|
|
10568
13214
|
|
|
10569
13215
|
|
|
10570
|
-
preview(data: '
|
|
13216
|
+
preview(data: 'Any', columns_subset: 'str | list[str] | Column | None' = None, n_head: 'int' = 5, n_tail: 'int' = 5, limit: 'int' = 50, show_row_numbers: 'bool' = True, max_col_width: 'int' = 250, min_tbl_width: 'int' = 500, incl_header: 'bool | None' = None) -> 'GT'
|
|
10571
13217
|
|
|
10572
13218
|
Display a table preview that shows some rows from the top, some from the bottom.
|
|
10573
13219
|
|
|
@@ -10766,7 +13412,7 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
|
|
|
10766
13412
|
function.
|
|
10767
13413
|
|
|
10768
13414
|
|
|
10769
|
-
col_summary_tbl(data: '
|
|
13415
|
+
col_summary_tbl(data: 'Any', tbl_name: 'str | None' = None) -> 'GT'
|
|
10770
13416
|
|
|
10771
13417
|
Generate a column-level summary table of a dataset.
|
|
10772
13418
|
|
|
@@ -10843,7 +13489,7 @@ col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
|
|
|
10843
13489
|
```
|
|
10844
13490
|
|
|
10845
13491
|
|
|
10846
|
-
missing_vals_tbl(data: '
|
|
13492
|
+
missing_vals_tbl(data: 'Any') -> 'GT'
|
|
10847
13493
|
|
|
10848
13494
|
Display a table that shows the missing values in the input table.
|
|
10849
13495
|
|
|
@@ -10917,7 +13563,7 @@ missing_vals_tbl(data: 'FrameT | Any') -> 'GT'
|
|
|
10917
13563
|
sector. Many columns have no missing values at all, and those sectors are colored light blue.
|
|
10918
13564
|
|
|
10919
13565
|
|
|
10920
|
-
assistant(model: 'str', data: '
|
|
13566
|
+
assistant(model: 'str', data: 'Any' = None, tbl_name: 'str | None' = None, api_key: 'str | None' = None, display: 'str | None' = None) -> 'None'
|
|
10921
13567
|
|
|
10922
13568
|
Chat with the PbA (Pointblank Assistant) about your data validation needs.
|
|
10923
13569
|
|
|
@@ -11061,7 +13707,7 @@ assistant(model: 'str', data: 'FrameT | Any | None' = None, tbl_name: 'str | Non
|
|
|
11061
13707
|
library. The loading preference is Polars first, then Pandas as a fallback.
|
|
11062
13708
|
|
|
11063
13709
|
|
|
11064
|
-
load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> '
|
|
13710
|
+
load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> 'Any'
|
|
11065
13711
|
|
|
11066
13712
|
Load a dataset hosted in the library as specified table type.
|
|
11067
13713
|
|
|
@@ -11082,7 +13728,7 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'glo
|
|
|
11082
13728
|
|
|
11083
13729
|
Returns
|
|
11084
13730
|
-------
|
|
11085
|
-
|
|
13731
|
+
Any
|
|
11086
13732
|
The dataset for the `Validate` object. This could be a Polars DataFrame, a Pandas DataFrame,
|
|
11087
13733
|
or a DuckDB table as an Ibis table.
|
|
11088
13734
|
|
|
@@ -11374,7 +14020,7 @@ from YAML strings or files. The `validate_yaml()` function checks if the YAML co
|
|
|
11374
14020
|
its own validity checks. The `yaml_to_python()` function converts YAML configuration to equivalent
|
|
11375
14021
|
Python code.
|
|
11376
14022
|
|
|
11377
|
-
yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: '
|
|
14023
|
+
yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Any' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate'
|
|
11378
14024
|
Execute a YAML-based validation workflow.
|
|
11379
14025
|
|
|
11380
14026
|
This is the main entry point for YAML-based validation workflows. It takes YAML configuration
|
|
@@ -11863,7 +14509,7 @@ columns or rows in a table. The `get_action_metadata()` function is useful when
|
|
|
11863
14509
|
actions since it returns metadata about the validation step that's triggering the action. Lastly,
|
|
11864
14510
|
the `config()` utility lets us set global configuration parameters.
|
|
11865
14511
|
|
|
11866
|
-
get_column_count(data: '
|
|
14512
|
+
get_column_count(data: 'Any') -> 'int'
|
|
11867
14513
|
|
|
11868
14514
|
Get the number of columns in a table.
|
|
11869
14515
|
|
|
@@ -11978,7 +14624,7 @@ get_column_count(data: 'FrameT | Any') -> 'int'
|
|
|
11978
14624
|
`8` for the `small_table` dataset.
|
|
11979
14625
|
|
|
11980
14626
|
|
|
11981
|
-
get_row_count(data: '
|
|
14627
|
+
get_row_count(data: 'Any') -> 'int'
|
|
11982
14628
|
|
|
11983
14629
|
Get the number of rows in a table.
|
|
11984
14630
|
|
|
@@ -12602,7 +15248,7 @@ send a Slack notification when validation steps exceed failure threshold levels
|
|
|
12602
15248
|
summary of the validation results, including the status, number of steps, passing and failing steps,
|
|
12603
15249
|
table information, and timing details.
|
|
12604
15250
|
|
|
12605
|
-
send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None' = None, summary_msg: 'str | None' = None, debug: 'bool' = False) -> 'Callable'
|
|
15251
|
+
send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None' = None, summary_msg: 'str | None' = None, debug: 'bool' = False) -> 'Callable | None'
|
|
12606
15252
|
|
|
12607
15253
|
Create a Slack notification function using a webhook URL.
|
|
12608
15254
|
|