pointblank 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +207 -6
- pointblank/_constants_translations.py +1302 -0
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +216 -139
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +41 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2957 -50
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +2280 -410
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +15 -8
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/METADATA +7 -2
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/RECORD +30 -28
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/WHEEL +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
|
@@ -11,7 +11,7 @@ failure thresholds (using the `Thresholds` class or through shorthands for this
|
|
|
11
11
|
`Validate` class has numerous methods for defining validation steps and for obtaining
|
|
12
12
|
post-interrogation metrics and data.
|
|
13
13
|
|
|
14
|
-
Validate(data: '
|
|
14
|
+
Validate(data: 'IntoDataFrame', reference: 'IntoFrame | None' = None, tbl_name: 'str | None' = None, label: 'str | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, final_actions: 'FinalActions | None' = None, brief: 'str | bool | None' = None, lang: 'str | None' = None, locale: 'str | None' = None) -> None
|
|
15
15
|
|
|
16
16
|
Workflow for defining a set of validations on a table and interrogating for results.
|
|
17
17
|
|
|
@@ -916,7 +916,7 @@ FinalActions(*args)
|
|
|
916
916
|
used to retrieve the summary of the validation results.
|
|
917
917
|
|
|
918
918
|
|
|
919
|
-
Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: '
|
|
919
|
+
Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'Any | None' = None, **kwargs)
|
|
920
920
|
Definition of a schema object.
|
|
921
921
|
|
|
922
922
|
The schema object defines the structure of a table. Once it is defined, the object can be used
|
|
@@ -1167,7 +1167,7 @@ Definition of a schema object.
|
|
|
1167
1167
|
`Schema` object is used in a validation workflow.
|
|
1168
1168
|
|
|
1169
1169
|
|
|
1170
|
-
DraftValidation(data: '
|
|
1170
|
+
DraftValidation(data: 'Any', model: 'str', api_key: 'str | None' = None, verify_ssl: 'bool' = True) -> None
|
|
1171
1171
|
|
|
1172
1172
|
Draft a validation plan for a given table using an LLM.
|
|
1173
1173
|
|
|
@@ -1382,7 +1382,7 @@ Validation steps can be thought of as sequential validations on the target
|
|
|
1382
1382
|
data. We call `Validate`'s validation methods to build up a validation plan: a collection of steps
|
|
1383
1383
|
that, in the aggregate, provides good validation coverage.
|
|
1384
1384
|
|
|
1385
|
-
col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1385
|
+
col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1386
1386
|
|
|
1387
1387
|
Are column data greater than a fixed value or data in another column?
|
|
1388
1388
|
|
|
@@ -1607,7 +1607,7 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1607
1607
|
- Row 3: `c` is `2` and `b` is `2`.
|
|
1608
1608
|
|
|
1609
1609
|
|
|
1610
|
-
col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1610
|
+
col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1611
1611
|
|
|
1612
1612
|
Are column data less than a fixed value or data in another column?
|
|
1613
1613
|
|
|
@@ -1832,7 +1832,7 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1832
1832
|
- Row 2: `b` is `1` and `c` is `1`.
|
|
1833
1833
|
|
|
1834
1834
|
|
|
1835
|
-
col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1835
|
+
col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1836
1836
|
|
|
1837
1837
|
Are column data greater than or equal to a fixed value or data in another column?
|
|
1838
1838
|
|
|
@@ -2057,7 +2057,7 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2057
2057
|
- Row 4: `b` is `3` and `c` is `4`.
|
|
2058
2058
|
|
|
2059
2059
|
|
|
2060
|
-
col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2060
|
+
col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2061
2061
|
|
|
2062
2062
|
Are column data less than or equal to a fixed value or data in another column?
|
|
2063
2063
|
|
|
@@ -2282,7 +2282,7 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2282
2282
|
- Row 4: `c` is `3` and `b` is `2`.
|
|
2283
2283
|
|
|
2284
2284
|
|
|
2285
|
-
col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2285
|
+
col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2286
2286
|
|
|
2287
2287
|
Are column data equal to a fixed value or data in another column?
|
|
2288
2288
|
|
|
@@ -2505,7 +2505,7 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2505
2505
|
- Row 5: `a` is `5` and `b` is `4`.
|
|
2506
2506
|
|
|
2507
2507
|
|
|
2508
|
-
col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2508
|
+
col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2509
2509
|
|
|
2510
2510
|
Are column data not equal to a fixed value or data in another column?
|
|
2511
2511
|
|
|
@@ -2726,7 +2726,7 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2726
2726
|
0 and 4, where `a` is `5` and `b` is `5` in both cases (i.e., they are equal to each other).
|
|
2727
2727
|
|
|
2728
2728
|
|
|
2729
|
-
col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2729
|
+
col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2730
2730
|
|
|
2731
2731
|
Do column data lie between two specified values or data in other columns?
|
|
2732
2732
|
|
|
@@ -2971,7 +2971,7 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2971
2971
|
- Row 4: `b` is `8` but the bounds are `3` (`a`) and `7` (`c`).
|
|
2972
2972
|
|
|
2973
2973
|
|
|
2974
|
-
col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2974
|
+
col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2975
2975
|
|
|
2976
2976
|
Do column data lie outside of two specified values or data in other columns?
|
|
2977
2977
|
|
|
@@ -3216,7 +3216,7 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
3216
3216
|
- Row 5: `b` is `6` and the bounds are `5` (`a`) and `7` (`c`).
|
|
3217
3217
|
|
|
3218
3218
|
|
|
3219
|
-
col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3219
|
+
col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3220
3220
|
|
|
3221
3221
|
Validate whether column values are in a set of values.
|
|
3222
3222
|
|
|
@@ -3463,7 +3463,7 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
|
|
|
3463
3463
|
specified set.
|
|
3464
3464
|
|
|
3465
3465
|
|
|
3466
|
-
col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3466
|
+
col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3467
3467
|
|
|
3468
3468
|
Validate whether column values are not in a set of values.
|
|
3469
3469
|
|
|
@@ -3687,7 +3687,7 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
3687
3687
|
statuses in the `InvalidStatus` enum.
|
|
3688
3688
|
|
|
3689
3689
|
|
|
3690
|
-
col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3690
|
+
col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3691
3691
|
|
|
3692
3692
|
Are column data increasing by row?
|
|
3693
3693
|
|
|
@@ -3815,7 +3815,7 @@ col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
3815
3815
|
```
|
|
3816
3816
|
|
|
3817
3817
|
|
|
3818
|
-
col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3818
|
+
col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3819
3819
|
|
|
3820
3820
|
Are column data decreasing by row?
|
|
3821
3821
|
|
|
@@ -3943,7 +3943,7 @@ col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
3943
3943
|
```
|
|
3944
3944
|
|
|
3945
3945
|
|
|
3946
|
-
col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3946
|
+
col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3947
3947
|
|
|
3948
3948
|
Validate whether values in a column are Null.
|
|
3949
3949
|
|
|
@@ -4129,7 +4129,7 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
|
|
|
4129
4129
|
two non-Null values in column `b`.
|
|
4130
4130
|
|
|
4131
4131
|
|
|
4132
|
-
col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4132
|
+
col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4133
4133
|
|
|
4134
4134
|
Validate whether values in a column are not Null.
|
|
4135
4135
|
|
|
@@ -4315,7 +4315,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
|
|
|
4315
4315
|
two Null values in column `b`.
|
|
4316
4316
|
|
|
4317
4317
|
|
|
4318
|
-
col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4318
|
+
col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4319
4319
|
|
|
4320
4320
|
Validate whether column values match a regular expression pattern.
|
|
4321
4321
|
|
|
@@ -4511,7 +4511,7 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
|
|
|
4511
4511
|
string values of rows 1 and 2 in column `b`.
|
|
4512
4512
|
|
|
4513
4513
|
|
|
4514
|
-
col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4514
|
+
col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4515
4515
|
|
|
4516
4516
|
Validate whether column values fit within a specification.
|
|
4517
4517
|
|
|
@@ -4729,7 +4729,7 @@ col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
4729
4729
|
The validation table shows that one test unit failed (the invalid email address in row 3).
|
|
4730
4730
|
|
|
4731
4731
|
|
|
4732
|
-
col_vals_expr(self, expr: '
|
|
4732
|
+
col_vals_expr(self, expr: 'Any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4733
4733
|
|
|
4734
4734
|
Validate column values using a custom expression.
|
|
4735
4735
|
|
|
@@ -4900,7 +4900,2653 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
|
|
|
4900
4900
|
by using `col_vals_expr()`. All test units passed, with no failing test units.
|
|
4901
4901
|
|
|
4902
4902
|
|
|
4903
|
-
|
|
4903
|
+
col_sum_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4904
|
+
Does the column sum satisfy a greater than comparison?
|
|
4905
|
+
|
|
4906
|
+
The `col_sum_gt()` validation method checks whether the sum of values in a column
|
|
4907
|
+
is greater than a specified `value=`. This is an aggregation-based validation where the entire
|
|
4908
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
4909
|
+
comparison used in this function is `sum(column) > value`.
|
|
4910
|
+
|
|
4911
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
4912
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
4913
|
+
the comparison) or fails completely.
|
|
4914
|
+
|
|
4915
|
+
Parameters
|
|
4916
|
+
----------
|
|
4917
|
+
columns
|
|
4918
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
4919
|
+
there will be a separate validation step generated for each column. The columns must
|
|
4920
|
+
contain numeric data for the sum to be computed.
|
|
4921
|
+
value
|
|
4922
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
4923
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
4924
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
4925
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
4926
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
4927
|
+
`ref(column_name)` when reference data is set).
|
|
4928
|
+
tol
|
|
4929
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
4930
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
4931
|
+
a sum that differs from the target by up to `0.5` will still pass. In other words, for
|
|
4932
|
+
`col_sum_gt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
4933
|
+
target value and still pass validation.
|
|
4934
|
+
thresholds
|
|
4935
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
4936
|
+
failing test units are detected. Since this is an aggregation-based validation with only
|
|
4937
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
4938
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
4939
|
+
acceptable.
|
|
4940
|
+
brief
|
|
4941
|
+
An optional brief description of the validation step that will be displayed in the
|
|
4942
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
4943
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
4944
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
4945
|
+
won't be a brief.
|
|
4946
|
+
actions
|
|
4947
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
4948
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
4949
|
+
define the actions.
|
|
4950
|
+
active
|
|
4951
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
4952
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
4953
|
+
for the steps unchanged).
|
|
4954
|
+
|
|
4955
|
+
Returns
|
|
4956
|
+
-------
|
|
4957
|
+
Validate
|
|
4958
|
+
The `Validate` object with the added validation step.
|
|
4959
|
+
|
|
4960
|
+
Using Reference Data
|
|
4961
|
+
--------------------
|
|
4962
|
+
The `col_sum_gt()` method supports comparing column aggregations against reference data. This
|
|
4963
|
+
is useful for validating that statistical properties remain consistent across different
|
|
4964
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
4965
|
+
|
|
4966
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
4967
|
+
|
|
4968
|
+
```python
|
|
4969
|
+
validation = (
|
|
4970
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
4971
|
+
.col_sum_gt(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
4972
|
+
.interrogate()
|
|
4973
|
+
)
|
|
4974
|
+
```
|
|
4975
|
+
|
|
4976
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
4977
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
4978
|
+
the `ref()` helper:
|
|
4979
|
+
|
|
4980
|
+
```python
|
|
4981
|
+
.col_sum_gt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
4982
|
+
```
|
|
4983
|
+
|
|
4984
|
+
Understanding Tolerance
|
|
4985
|
+
-----------------------
|
|
4986
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
4987
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
4988
|
+
|
|
4989
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
4990
|
+
`col_sum_gt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
4991
|
+
target value and still pass validation.
|
|
4992
|
+
|
|
4993
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
4994
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
4995
|
+
shifts the comparison boundary.
|
|
4996
|
+
|
|
4997
|
+
Thresholds
|
|
4998
|
+
----------
|
|
4999
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5000
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5001
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5002
|
+
|
|
5003
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5004
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5005
|
+
typically set as absolute counts:
|
|
5006
|
+
|
|
5007
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5008
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5009
|
+
|
|
5010
|
+
Thresholds can be defined using one of these input schemes:
|
|
5011
|
+
|
|
5012
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5013
|
+
thresholds)
|
|
5014
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5015
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5016
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
5017
|
+
'critical'
|
|
5018
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5019
|
+
for the 'warning' level only
|
|
5020
|
+
|
|
5021
|
+
Examples
|
|
5022
|
+
--------
|
|
5023
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5024
|
+
shown below:
|
|
5025
|
+
|
|
5026
|
+
```python
|
|
5027
|
+
import pointblank as pb
|
|
5028
|
+
import polars as pl
|
|
5029
|
+
|
|
5030
|
+
tbl = pl.DataFrame(
|
|
5031
|
+
{
|
|
5032
|
+
"a": [1, 2, 3, 4, 5],
|
|
5033
|
+
"b": [2, 2, 2, 2, 2],
|
|
5034
|
+
}
|
|
5035
|
+
)
|
|
5036
|
+
|
|
5037
|
+
pb.preview(tbl)
|
|
5038
|
+
```
|
|
5039
|
+
|
|
5040
|
+
Let's validate that the sum of column `a` is greater than `15`:
|
|
5041
|
+
|
|
5042
|
+
```python
|
|
5043
|
+
validation = (
|
|
5044
|
+
pb.Validate(data=tbl)
|
|
5045
|
+
.col_sum_gt(columns="a", value=15)
|
|
5046
|
+
.interrogate()
|
|
5047
|
+
)
|
|
5048
|
+
|
|
5049
|
+
validation
|
|
5050
|
+
```
|
|
5051
|
+
|
|
5052
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5053
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5054
|
+
|
|
5055
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5056
|
+
|
|
5057
|
+
```python
|
|
5058
|
+
validation = (
|
|
5059
|
+
pb.Validate(data=tbl)
|
|
5060
|
+
.col_sum_gt(columns=["a", "b"], value=15)
|
|
5061
|
+
.interrogate()
|
|
5062
|
+
)
|
|
5063
|
+
|
|
5064
|
+
validation
|
|
5065
|
+
```
|
|
5066
|
+
|
|
5067
|
+
Using tolerance for flexible comparisons:
|
|
5068
|
+
|
|
5069
|
+
```python
|
|
5070
|
+
validation = (
|
|
5071
|
+
pb.Validate(data=tbl)
|
|
5072
|
+
.col_sum_gt(columns="a", value=15, tol=1.0)
|
|
5073
|
+
.interrogate()
|
|
5074
|
+
)
|
|
5075
|
+
|
|
5076
|
+
validation
|
|
5077
|
+
```
|
|
5078
|
+
|
|
5079
|
+
col_sum_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5080
|
+
Does the column sum satisfy a less than comparison?
|
|
5081
|
+
|
|
5082
|
+
The `col_sum_lt()` validation method checks whether the sum of values in a column
|
|
5083
|
+
is less than a specified `value=`. This is an aggregation-based validation where the entire
|
|
5084
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
5085
|
+
comparison used in this function is `sum(column) < value`.
|
|
5086
|
+
|
|
5087
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5088
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5089
|
+
the comparison) or fails completely.
|
|
5090
|
+
|
|
5091
|
+
Parameters
|
|
5092
|
+
----------
|
|
5093
|
+
columns
|
|
5094
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5095
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5096
|
+
contain numeric data for the sum to be computed.
|
|
5097
|
+
value
|
|
5098
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
5099
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5100
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5101
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5102
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5103
|
+
`ref(column_name)` when reference data is set).
|
|
5104
|
+
tol
|
|
5105
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5106
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5107
|
+
a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5108
|
+
`col_sum_lt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
5109
|
+
target value and still pass validation.
|
|
5110
|
+
thresholds
|
|
5111
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5112
|
+
failing test units exceed the set levels. Since this is an aggregation-based validation with only
|
|
5113
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5114
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5115
|
+
acceptable.
|
|
5116
|
+
brief
|
|
5117
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5118
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5119
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5120
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5121
|
+
won't be a brief.
|
|
5122
|
+
actions
|
|
5123
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5124
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5125
|
+
define the actions.
|
|
5126
|
+
active
|
|
5127
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5128
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5129
|
+
for the steps unchanged).
|
|
5130
|
+
|
|
5131
|
+
Returns
|
|
5132
|
+
-------
|
|
5133
|
+
Validate
|
|
5134
|
+
The `Validate` object with the added validation step.
|
|
5135
|
+
|
|
5136
|
+
Using Reference Data
|
|
5137
|
+
--------------------
|
|
5138
|
+
The `col_sum_lt()` method supports comparing column aggregations against reference data. This
|
|
5139
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5140
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5141
|
+
|
|
5142
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5143
|
+
|
|
5144
|
+
```python
|
|
5145
|
+
validation = (
|
|
5146
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5147
|
+
.col_sum_lt(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
5148
|
+
.interrogate()
|
|
5149
|
+
)
|
|
5150
|
+
```
|
|
5151
|
+
|
|
5152
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5153
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5154
|
+
the `ref()` helper:
|
|
5155
|
+
|
|
5156
|
+
```python
|
|
5157
|
+
.col_sum_lt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5158
|
+
```
|
|
5159
|
+
|
|
5160
|
+
Understanding Tolerance
|
|
5161
|
+
-----------------------
|
|
5162
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5163
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5164
|
+
|
|
5165
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5166
|
+
`col_sum_lt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
5167
|
+
target value and still pass validation.
|
|
5168
|
+
|
|
5169
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5170
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5171
|
+
shifts the comparison boundary.
|
|
5172
|
+
|
|
5173
|
+
Thresholds
|
|
5174
|
+
----------
|
|
5175
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5176
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5177
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5178
|
+
|
|
5179
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5180
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5181
|
+
typically set as absolute counts:
|
|
5182
|
+
|
|
5183
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5184
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5185
|
+
|
|
5186
|
+
Thresholds can be defined using one of these input schemes:
|
|
5187
|
+
|
|
5188
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5189
|
+
thresholds)
|
|
5190
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5191
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5192
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5193
|
+
'critical'
|
|
5194
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5195
|
+
for the 'warning' level only
|
|
5196
|
+
|
|
5197
|
+
Examples
|
|
5198
|
+
--------
|
|
5199
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5200
|
+
shown below:
|
|
5201
|
+
|
|
5202
|
+
```python
|
|
5203
|
+
import pointblank as pb
|
|
5204
|
+
import polars as pl
|
|
5205
|
+
|
|
5206
|
+
tbl = pl.DataFrame(
|
|
5207
|
+
{
|
|
5208
|
+
"a": [1, 2, 3, 4, 5],
|
|
5209
|
+
"b": [2, 2, 2, 2, 2],
|
|
5210
|
+
}
|
|
5211
|
+
)
|
|
5212
|
+
|
|
5213
|
+
pb.preview(tbl)
|
|
5214
|
+
```
|
|
5215
|
+
|
|
5216
|
+
Let's validate that the sum of column `a` is less than `15`:
|
|
5217
|
+
|
|
5218
|
+
```python
|
|
5219
|
+
validation = (
|
|
5220
|
+
pb.Validate(data=tbl)
|
|
5221
|
+
.col_sum_lt(columns="a", value=15)
|
|
5222
|
+
.interrogate()
|
|
5223
|
+
)
|
|
5224
|
+
|
|
5225
|
+
validation
|
|
5226
|
+
```
|
|
5227
|
+
|
|
5228
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5229
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5230
|
+
|
|
5231
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5232
|
+
|
|
5233
|
+
```python
|
|
5234
|
+
validation = (
|
|
5235
|
+
pb.Validate(data=tbl)
|
|
5236
|
+
.col_sum_lt(columns=["a", "b"], value=15)
|
|
5237
|
+
.interrogate()
|
|
5238
|
+
)
|
|
5239
|
+
|
|
5240
|
+
validation
|
|
5241
|
+
```
|
|
5242
|
+
|
|
5243
|
+
Using tolerance for flexible comparisons:
|
|
5244
|
+
|
|
5245
|
+
```python
|
|
5246
|
+
validation = (
|
|
5247
|
+
pb.Validate(data=tbl)
|
|
5248
|
+
.col_sum_lt(columns="a", value=15, tol=1.0)
|
|
5249
|
+
.interrogate()
|
|
5250
|
+
)
|
|
5251
|
+
|
|
5252
|
+
validation
|
|
5253
|
+
```
|
|
5254
|
+
|
|
5255
|
+
col_sum_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5256
|
+
Does the column sum satisfy a greater than or equal to comparison?
|
|
5257
|
+
|
|
5258
|
+
The `col_sum_ge()` validation method checks whether the sum of values in a column
|
|
5259
|
+
is at least a specified `value=`. This is an aggregation-based validation where the entire
|
|
5260
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
5261
|
+
comparison used in this function is `sum(column) >= value`.
|
|
5262
|
+
|
|
5263
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5264
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5265
|
+
the comparison) or fails completely.
|
|
5266
|
+
|
|
5267
|
+
Parameters
|
|
5268
|
+
----------
|
|
5269
|
+
columns
|
|
5270
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5271
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5272
|
+
contain numeric data for the sum to be computed.
|
|
5273
|
+
value
|
|
5274
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
5275
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5276
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5277
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5278
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5279
|
+
`ref(column_name)` when reference data is set).
|
|
5280
|
+
tol
|
|
5281
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5282
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5283
|
+
a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5284
|
+
`col_sum_ge()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
5285
|
+
target value and still pass validation.
|
|
5286
|
+
thresholds
|
|
5287
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5288
|
+
failing test units exceed the set levels. Since this is an aggregation-based validation with only
|
|
5289
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5290
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5291
|
+
acceptable.
|
|
5292
|
+
brief
|
|
5293
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5294
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5295
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5296
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5297
|
+
won't be a brief.
|
|
5298
|
+
actions
|
|
5299
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5300
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5301
|
+
define the actions.
|
|
5302
|
+
active
|
|
5303
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5304
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5305
|
+
for the steps unchanged).
|
|
5306
|
+
|
|
5307
|
+
Returns
|
|
5308
|
+
-------
|
|
5309
|
+
Validate
|
|
5310
|
+
The `Validate` object with the added validation step.
|
|
5311
|
+
|
|
5312
|
+
Using Reference Data
|
|
5313
|
+
--------------------
|
|
5314
|
+
The `col_sum_ge()` method supports comparing column aggregations against reference data. This
|
|
5315
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5316
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5317
|
+
|
|
5318
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5319
|
+
|
|
5320
|
+
```python
|
|
5321
|
+
validation = (
|
|
5322
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5323
|
+
.col_sum_ge(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
5324
|
+
.interrogate()
|
|
5325
|
+
)
|
|
5326
|
+
```
|
|
5327
|
+
|
|
5328
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5329
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5330
|
+
the `ref()` helper:
|
|
5331
|
+
|
|
5332
|
+
```python
|
|
5333
|
+
.col_sum_ge(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5334
|
+
```
|
|
5335
|
+
|
|
5336
|
+
Understanding Tolerance
|
|
5337
|
+
-----------------------
|
|
5338
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5339
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5340
|
+
|
|
5341
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5342
|
+
`col_sum_ge()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
5343
|
+
target value and still pass validation.
|
|
5344
|
+
|
|
5345
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5346
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5347
|
+
shifts the comparison boundary.
|
|
5348
|
+
|
|
5349
|
+
Thresholds
|
|
5350
|
+
----------
|
|
5351
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5352
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5353
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5354
|
+
|
|
5355
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5356
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5357
|
+
typically set as absolute counts:
|
|
5358
|
+
|
|
5359
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5360
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5361
|
+
|
|
5362
|
+
Thresholds can be defined using one of these input schemes:
|
|
5363
|
+
|
|
5364
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5365
|
+
thresholds)
|
|
5366
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5367
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5368
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5369
|
+
'critical'
|
|
5370
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5371
|
+
for the 'warning' level only
|
|
5372
|
+
|
|
5373
|
+
Examples
|
|
5374
|
+
--------
|
|
5375
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5376
|
+
shown below:
|
|
5377
|
+
|
|
5378
|
+
```python
|
|
5379
|
+
import pointblank as pb
|
|
5380
|
+
import polars as pl
|
|
5381
|
+
|
|
5382
|
+
tbl = pl.DataFrame(
|
|
5383
|
+
{
|
|
5384
|
+
"a": [1, 2, 3, 4, 5],
|
|
5385
|
+
"b": [2, 2, 2, 2, 2],
|
|
5386
|
+
}
|
|
5387
|
+
)
|
|
5388
|
+
|
|
5389
|
+
pb.preview(tbl)
|
|
5390
|
+
```
|
|
5391
|
+
|
|
5392
|
+
Let's validate that the sum of column `a` is at least `15`:
|
|
5393
|
+
|
|
5394
|
+
```python
|
|
5395
|
+
validation = (
|
|
5396
|
+
pb.Validate(data=tbl)
|
|
5397
|
+
.col_sum_ge(columns="a", value=15)
|
|
5398
|
+
.interrogate()
|
|
5399
|
+
)
|
|
5400
|
+
|
|
5401
|
+
validation
|
|
5402
|
+
```
|
|
5403
|
+
|
|
5404
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5405
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5406
|
+
|
|
5407
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5408
|
+
|
|
5409
|
+
```python
|
|
5410
|
+
validation = (
|
|
5411
|
+
pb.Validate(data=tbl)
|
|
5412
|
+
.col_sum_ge(columns=["a", "b"], value=15)
|
|
5413
|
+
.interrogate()
|
|
5414
|
+
)
|
|
5415
|
+
|
|
5416
|
+
validation
|
|
5417
|
+
```
|
|
5418
|
+
|
|
5419
|
+
Using tolerance for flexible comparisons:
|
|
5420
|
+
|
|
5421
|
+
```python
|
|
5422
|
+
validation = (
|
|
5423
|
+
pb.Validate(data=tbl)
|
|
5424
|
+
.col_sum_ge(columns="a", value=15, tol=1.0)
|
|
5425
|
+
.interrogate()
|
|
5426
|
+
)
|
|
5427
|
+
|
|
5428
|
+
validation
|
|
5429
|
+
```
|
|
5430
|
+
|
|
5431
|
+
col_sum_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5432
|
+
Does the column sum satisfy a less than or equal to comparison?
|
|
5433
|
+
|
|
5434
|
+
The `col_sum_le()` validation method checks whether the sum of values in a column
|
|
5435
|
+
is at most a specified `value=`. This is an aggregation-based validation where the entire
|
|
5436
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
5437
|
+
comparison used in this function is `sum(column) <= value`.
|
|
5438
|
+
|
|
5439
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5440
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5441
|
+
the comparison) or fails completely.
|
|
5442
|
+
|
|
5443
|
+
Parameters
|
|
5444
|
+
----------
|
|
5445
|
+
columns
|
|
5446
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5447
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5448
|
+
contain numeric data for the sum to be computed.
|
|
5449
|
+
value
|
|
5450
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
5451
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5452
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5453
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5454
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5455
|
+
`ref(column_name)` when reference data is set).
|
|
5456
|
+
tol
|
|
5457
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5458
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5459
|
+
a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5460
|
+
`col_sum_le()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
5461
|
+
target value and still pass validation.
|
|
5462
|
+
thresholds
|
|
5463
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5464
|
+
failing test units exceed the set levels. Since this is an aggregation-based validation with only
|
|
5465
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5466
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5467
|
+
acceptable.
|
|
5468
|
+
brief
|
|
5469
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5470
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5471
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5472
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5473
|
+
won't be a brief.
|
|
5474
|
+
actions
|
|
5475
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5476
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5477
|
+
define the actions.
|
|
5478
|
+
active
|
|
5479
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5480
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5481
|
+
for the steps unchanged).
|
|
5482
|
+
|
|
5483
|
+
Returns
|
|
5484
|
+
-------
|
|
5485
|
+
Validate
|
|
5486
|
+
The `Validate` object with the added validation step.
|
|
5487
|
+
|
|
5488
|
+
Using Reference Data
|
|
5489
|
+
--------------------
|
|
5490
|
+
The `col_sum_le()` method supports comparing column aggregations against reference data. This
|
|
5491
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5492
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5493
|
+
|
|
5494
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5495
|
+
|
|
5496
|
+
```python
|
|
5497
|
+
validation = (
|
|
5498
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5499
|
+
.col_sum_le(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
5500
|
+
.interrogate()
|
|
5501
|
+
)
|
|
5502
|
+
```
|
|
5503
|
+
|
|
5504
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5505
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5506
|
+
the `ref()` helper:
|
|
5507
|
+
|
|
5508
|
+
```python
|
|
5509
|
+
.col_sum_le(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5510
|
+
```
|
|
5511
|
+
|
|
5512
|
+
Understanding Tolerance
|
|
5513
|
+
-----------------------
|
|
5514
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5515
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5516
|
+
|
|
5517
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5518
|
+
`col_sum_le()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the
|
|
5519
|
+
target value and still pass validation.
|
|
5520
|
+
|
|
5521
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5522
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5523
|
+
shifts the comparison boundary.
|
|
5524
|
+
|
|
5525
|
+
Thresholds
|
|
5526
|
+
----------
|
|
5527
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5528
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5529
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5530
|
+
|
|
5531
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5532
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5533
|
+
typically set as absolute counts:
|
|
5534
|
+
|
|
5535
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5536
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5537
|
+
|
|
5538
|
+
Thresholds can be defined using one of these input schemes:
|
|
5539
|
+
|
|
5540
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5541
|
+
thresholds)
|
|
5542
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5543
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5544
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5545
|
+
'critical'
|
|
5546
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5547
|
+
for the 'warning' level only
|
|
5548
|
+
|
|
5549
|
+
Examples
|
|
5550
|
+
--------
|
|
5551
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5552
|
+
shown below:
|
|
5553
|
+
|
|
5554
|
+
```python
|
|
5555
|
+
import pointblank as pb
|
|
5556
|
+
import polars as pl
|
|
5557
|
+
|
|
5558
|
+
tbl = pl.DataFrame(
|
|
5559
|
+
{
|
|
5560
|
+
"a": [1, 2, 3, 4, 5],
|
|
5561
|
+
"b": [2, 2, 2, 2, 2],
|
|
5562
|
+
}
|
|
5563
|
+
)
|
|
5564
|
+
|
|
5565
|
+
pb.preview(tbl)
|
|
5566
|
+
```
|
|
5567
|
+
|
|
5568
|
+
Let's validate that the sum of column `a` is at most `15`:
|
|
5569
|
+
|
|
5570
|
+
```python
|
|
5571
|
+
validation = (
|
|
5572
|
+
pb.Validate(data=tbl)
|
|
5573
|
+
.col_sum_le(columns="a", value=15)
|
|
5574
|
+
.interrogate()
|
|
5575
|
+
)
|
|
5576
|
+
|
|
5577
|
+
validation
|
|
5578
|
+
```
|
|
5579
|
+
|
|
5580
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5581
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5582
|
+
|
|
5583
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5584
|
+
|
|
5585
|
+
```python
|
|
5586
|
+
validation = (
|
|
5587
|
+
pb.Validate(data=tbl)
|
|
5588
|
+
.col_sum_le(columns=["a", "b"], value=15)
|
|
5589
|
+
.interrogate()
|
|
5590
|
+
)
|
|
5591
|
+
|
|
5592
|
+
validation
|
|
5593
|
+
```
|
|
5594
|
+
|
|
5595
|
+
Using tolerance for flexible comparisons:
|
|
5596
|
+
|
|
5597
|
+
```python
|
|
5598
|
+
validation = (
|
|
5599
|
+
pb.Validate(data=tbl)
|
|
5600
|
+
.col_sum_le(columns="a", value=15, tol=1.0)
|
|
5601
|
+
.interrogate()
|
|
5602
|
+
)
|
|
5603
|
+
|
|
5604
|
+
validation
|
|
5605
|
+
```
|
|
5606
|
+
|
|
5607
|
+
col_sum_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5608
|
+
Does the column sum satisfy an equal to comparison?
|
|
5609
|
+
|
|
5610
|
+
The `col_sum_eq()` validation method checks whether the sum of values in a column
|
|
5611
|
+
equals a specified `value=`. This is an aggregation-based validation where the entire
|
|
5612
|
+
column is reduced to a single sum value that is then compared against the target. The
|
|
5613
|
+
comparison used in this function is `sum(column) == value`.
|
|
5614
|
+
|
|
5615
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5616
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5617
|
+
the comparison) or fails completely.
|
|
5618
|
+
|
|
5619
|
+
Parameters
|
|
5620
|
+
----------
|
|
5621
|
+
columns
|
|
5622
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5623
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5624
|
+
contain numeric data for the sum to be computed.
|
|
5625
|
+
value
|
|
5626
|
+
The value to compare the column sum against. This can be: (1) a numeric literal
|
|
5627
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5628
|
+
whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5629
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5630
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5631
|
+
`ref(column_name)` when reference data is set).
|
|
5632
|
+
tol
|
|
5633
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5634
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5635
|
+
a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_sum_eq()` since exact equality
|
|
5636
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
5637
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
5638
|
+
floating-point arithmetic.
|
|
5639
|
+
thresholds
|
|
5640
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5641
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
5642
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5643
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5644
|
+
acceptable.
|
|
5645
|
+
brief
|
|
5646
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5647
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5648
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5649
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5650
|
+
won't be a brief.
|
|
5651
|
+
actions
|
|
5652
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5653
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5654
|
+
define the actions.
|
|
5655
|
+
active
|
|
5656
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5657
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5658
|
+
for the steps unchanged).
|
|
5659
|
+
|
|
5660
|
+
Returns
|
|
5661
|
+
-------
|
|
5662
|
+
Validate
|
|
5663
|
+
The `Validate` object with the added validation step.
|
|
5664
|
+
|
|
5665
|
+
Using Reference Data
|
|
5666
|
+
--------------------
|
|
5667
|
+
The `col_sum_eq()` method supports comparing column aggregations against reference data. This
|
|
5668
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5669
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5670
|
+
|
|
5671
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5672
|
+
|
|
5673
|
+
```python
|
|
5674
|
+
validation = (
|
|
5675
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5676
|
+
.col_sum_eq(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
|
|
5677
|
+
.interrogate()
|
|
5678
|
+
)
|
|
5679
|
+
```
|
|
5680
|
+
|
|
5681
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5682
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5683
|
+
the `ref()` helper:
|
|
5684
|
+
|
|
5685
|
+
```python
|
|
5686
|
+
.col_sum_eq(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5687
|
+
```
|
|
5688
|
+
|
|
5689
|
+
Understanding Tolerance
|
|
5690
|
+
-----------------------
|
|
5691
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5692
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5693
|
+
|
|
5694
|
+
The `tol=` parameter is particularly useful with `col_sum_eq()` since exact equality
|
|
5695
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
5696
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
5697
|
+
floating-point arithmetic.
|
|
5698
|
+
|
|
5699
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5700
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5701
|
+
shifts the comparison boundary.
|
|
5702
|
+
|
|
5703
|
+
Thresholds
|
|
5704
|
+
----------
|
|
5705
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5706
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5707
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5708
|
+
|
|
5709
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5710
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5711
|
+
typically set as absolute counts:
|
|
5712
|
+
|
|
5713
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5714
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5715
|
+
|
|
5716
|
+
Thresholds can be defined using one of these input schemes:
|
|
5717
|
+
|
|
5718
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5719
|
+
thresholds)
|
|
5720
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5721
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5722
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5723
|
+
'critical'
|
|
5724
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5725
|
+
for the 'warning' level only
|
|
5726
|
+
|
|
5727
|
+
Examples
|
|
5728
|
+
--------
|
|
5729
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5730
|
+
shown below:
|
|
5731
|
+
|
|
5732
|
+
```python
|
|
5733
|
+
import pointblank as pb
|
|
5734
|
+
import polars as pl
|
|
5735
|
+
|
|
5736
|
+
tbl = pl.DataFrame(
|
|
5737
|
+
{
|
|
5738
|
+
"a": [1, 2, 3, 4, 5],
|
|
5739
|
+
"b": [2, 2, 2, 2, 2],
|
|
5740
|
+
}
|
|
5741
|
+
)
|
|
5742
|
+
|
|
5743
|
+
pb.preview(tbl)
|
|
5744
|
+
```
|
|
5745
|
+
|
|
5746
|
+
Let's validate that the sum of column `a` equals `15`:
|
|
5747
|
+
|
|
5748
|
+
```python
|
|
5749
|
+
validation = (
|
|
5750
|
+
pb.Validate(data=tbl)
|
|
5751
|
+
.col_sum_eq(columns="a", value=15)
|
|
5752
|
+
.interrogate()
|
|
5753
|
+
)
|
|
5754
|
+
|
|
5755
|
+
validation
|
|
5756
|
+
```
|
|
5757
|
+
|
|
5758
|
+
The validation result shows whether the sum comparison passed or failed. Since this
|
|
5759
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5760
|
+
|
|
5761
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5762
|
+
|
|
5763
|
+
```python
|
|
5764
|
+
validation = (
|
|
5765
|
+
pb.Validate(data=tbl)
|
|
5766
|
+
.col_sum_eq(columns=["a", "b"], value=15)
|
|
5767
|
+
.interrogate()
|
|
5768
|
+
)
|
|
5769
|
+
|
|
5770
|
+
validation
|
|
5771
|
+
```
|
|
5772
|
+
|
|
5773
|
+
Using tolerance for flexible comparisons:
|
|
5774
|
+
|
|
5775
|
+
```python
|
|
5776
|
+
validation = (
|
|
5777
|
+
pb.Validate(data=tbl)
|
|
5778
|
+
.col_sum_eq(columns="a", value=15, tol=1.0)
|
|
5779
|
+
.interrogate()
|
|
5780
|
+
)
|
|
5781
|
+
|
|
5782
|
+
validation
|
|
5783
|
+
```
|
|
5784
|
+
|
|
5785
|
+
col_avg_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5786
|
+
Does the column average satisfy a greater than comparison?
|
|
5787
|
+
|
|
5788
|
+
The `col_avg_gt()` validation method checks whether the average of values in a column
|
|
5789
|
+
is greater than a specified `value=`. This is an aggregation-based validation where the entire
|
|
5790
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
5791
|
+
comparison used in this function is `average(column) > value`.
|
|
5792
|
+
|
|
5793
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5794
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5795
|
+
the comparison) or fails completely.
|
|
5796
|
+
|
|
5797
|
+
Parameters
|
|
5798
|
+
----------
|
|
5799
|
+
columns
|
|
5800
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5801
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5802
|
+
contain numeric data for the average to be computed.
|
|
5803
|
+
value
|
|
5804
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
5805
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5806
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5807
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5808
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5809
|
+
`ref(column_name)` when reference data is set).
|
|
5810
|
+
tol
|
|
5811
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5812
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5813
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5814
|
+
`col_avg_gt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
5815
|
+
target value and still pass validation.
|
|
5816
|
+
thresholds
|
|
5817
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5818
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
5819
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5820
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5821
|
+
acceptable.
|
|
5822
|
+
brief
|
|
5823
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5824
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5825
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5826
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5827
|
+
won't be a brief.
|
|
5828
|
+
actions
|
|
5829
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5830
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5831
|
+
define the actions.
|
|
5832
|
+
active
|
|
5833
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5834
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5835
|
+
for the steps unchanged).
|
|
5836
|
+
|
|
5837
|
+
Returns
|
|
5838
|
+
-------
|
|
5839
|
+
Validate
|
|
5840
|
+
The `Validate` object with the added validation step.
|
|
5841
|
+
|
|
5842
|
+
Using Reference Data
|
|
5843
|
+
--------------------
|
|
5844
|
+
The `col_avg_gt()` method supports comparing column aggregations against reference data. This
|
|
5845
|
+
is useful for validating that statistical properties remain consistent across different
|
|
5846
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
5847
|
+
|
|
5848
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
5849
|
+
|
|
5850
|
+
```python
|
|
5851
|
+
validation = (
|
|
5852
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
5853
|
+
.col_avg_gt(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
5854
|
+
.interrogate()
|
|
5855
|
+
)
|
|
5856
|
+
```
|
|
5857
|
+
|
|
5858
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
5859
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
5860
|
+
the `ref()` helper:
|
|
5861
|
+
|
|
5862
|
+
```python
|
|
5863
|
+
.col_avg_gt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
5864
|
+
```
|
|
5865
|
+
|
|
5866
|
+
Understanding Tolerance
|
|
5867
|
+
-----------------------
|
|
5868
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
5869
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
5870
|
+
|
|
5871
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5872
|
+
`col_avg_gt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
5873
|
+
target value and still pass validation.
|
|
5874
|
+
|
|
5875
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
5876
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
5877
|
+
shifts the comparison boundary.
|
|
5878
|
+
|
|
5879
|
+
Thresholds
|
|
5880
|
+
----------
|
|
5881
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5882
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5883
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5884
|
+
|
|
5885
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
5886
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
5887
|
+
typically set as absolute counts:
|
|
5888
|
+
|
|
5889
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
5890
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
5891
|
+
|
|
5892
|
+
Thresholds can be defined using one of these input schemes:
|
|
5893
|
+
|
|
5894
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5895
|
+
thresholds)
|
|
5896
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5897
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5898
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5899
|
+
'critical'
|
|
5900
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5901
|
+
for the 'warning' level only
|
|
5902
|
+
|
|
5903
|
+
Examples
|
|
5904
|
+
--------
|
|
5905
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
5906
|
+
shown below:
|
|
5907
|
+
|
|
5908
|
+
```python
|
|
5909
|
+
import pointblank as pb
|
|
5910
|
+
import polars as pl
|
|
5911
|
+
|
|
5912
|
+
tbl = pl.DataFrame(
|
|
5913
|
+
{
|
|
5914
|
+
"a": [1, 2, 3, 4, 5],
|
|
5915
|
+
"b": [2, 2, 2, 2, 2],
|
|
5916
|
+
}
|
|
5917
|
+
)
|
|
5918
|
+
|
|
5919
|
+
pb.preview(tbl)
|
|
5920
|
+
```
|
|
5921
|
+
|
|
5922
|
+
Let's validate that the average of column `a` is greater than `3`:
|
|
5923
|
+
|
|
5924
|
+
```python
|
|
5925
|
+
validation = (
|
|
5926
|
+
pb.Validate(data=tbl)
|
|
5927
|
+
.col_avg_gt(columns="a", value=3)
|
|
5928
|
+
.interrogate()
|
|
5929
|
+
)
|
|
5930
|
+
|
|
5931
|
+
validation
|
|
5932
|
+
```
|
|
5933
|
+
|
|
5934
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
5935
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
5936
|
+
|
|
5937
|
+
When validating multiple columns, each column gets its own validation step:
|
|
5938
|
+
|
|
5939
|
+
```python
|
|
5940
|
+
validation = (
|
|
5941
|
+
pb.Validate(data=tbl)
|
|
5942
|
+
.col_avg_gt(columns=["a", "b"], value=3)
|
|
5943
|
+
.interrogate()
|
|
5944
|
+
)
|
|
5945
|
+
|
|
5946
|
+
validation
|
|
5947
|
+
```
|
|
5948
|
+
|
|
5949
|
+
Using tolerance for flexible comparisons:
|
|
5950
|
+
|
|
5951
|
+
```python
|
|
5952
|
+
validation = (
|
|
5953
|
+
pb.Validate(data=tbl)
|
|
5954
|
+
.col_avg_gt(columns="a", value=3, tol=1.0)
|
|
5955
|
+
.interrogate()
|
|
5956
|
+
)
|
|
5957
|
+
|
|
5958
|
+
validation
|
|
5959
|
+
```
|
|
5960
|
+
|
|
5961
|
+
col_avg_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5962
|
+
Does the column average satisfy a less than comparison?
|
|
5963
|
+
|
|
5964
|
+
The `col_avg_lt()` validation method checks whether the average of values in a column
|
|
5965
|
+
is less than a specified `value=`. This is an aggregation-based validation where the entire
|
|
5966
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
5967
|
+
comparison used in this function is `average(column) < value`.
|
|
5968
|
+
|
|
5969
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
5970
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
5971
|
+
the comparison) or fails completely.
|
|
5972
|
+
|
|
5973
|
+
Parameters
|
|
5974
|
+
----------
|
|
5975
|
+
columns
|
|
5976
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
5977
|
+
there will be a separate validation step generated for each column. The columns must
|
|
5978
|
+
contain numeric data for the average to be computed.
|
|
5979
|
+
value
|
|
5980
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
5981
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
5982
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
5983
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
5984
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
5985
|
+
`ref(column_name)` when reference data is set).
|
|
5986
|
+
tol
|
|
5987
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
5988
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
5989
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
5990
|
+
`col_avg_lt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
5991
|
+
target value and still pass validation.
|
|
5992
|
+
thresholds
|
|
5993
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
5994
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
5995
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
5996
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
5997
|
+
acceptable.
|
|
5998
|
+
brief
|
|
5999
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6000
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6001
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6002
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6003
|
+
won't be a brief.
|
|
6004
|
+
actions
|
|
6005
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6006
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6007
|
+
define the actions.
|
|
6008
|
+
active
|
|
6009
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6010
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6011
|
+
for the steps unchanged).
|
|
6012
|
+
|
|
6013
|
+
Returns
|
|
6014
|
+
-------
|
|
6015
|
+
Validate
|
|
6016
|
+
The `Validate` object with the added validation step.
|
|
6017
|
+
|
|
6018
|
+
Using Reference Data
|
|
6019
|
+
--------------------
|
|
6020
|
+
The `col_avg_lt()` method supports comparing column aggregations against reference data. This
|
|
6021
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6022
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6023
|
+
|
|
6024
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6025
|
+
|
|
6026
|
+
```python
|
|
6027
|
+
validation = (
|
|
6028
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6029
|
+
.col_avg_lt(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
6030
|
+
.interrogate()
|
|
6031
|
+
)
|
|
6032
|
+
```
|
|
6033
|
+
|
|
6034
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6035
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6036
|
+
the `ref()` helper:
|
|
6037
|
+
|
|
6038
|
+
```python
|
|
6039
|
+
.col_avg_lt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6040
|
+
```
|
|
6041
|
+
|
|
6042
|
+
Understanding Tolerance
|
|
6043
|
+
-----------------------
|
|
6044
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6045
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6046
|
+
|
|
6047
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6048
|
+
`col_avg_lt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6049
|
+
target value and still pass validation.
|
|
6050
|
+
|
|
6051
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6052
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6053
|
+
shifts the comparison boundary.
|
|
6054
|
+
|
|
6055
|
+
Thresholds
|
|
6056
|
+
----------
|
|
6057
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6058
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6059
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6060
|
+
|
|
6061
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6062
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6063
|
+
typically set as absolute counts:
|
|
6064
|
+
|
|
6065
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6066
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6067
|
+
|
|
6068
|
+
Thresholds can be defined using one of these input schemes:
|
|
6069
|
+
|
|
6070
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6071
|
+
thresholds)
|
|
6072
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6073
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6074
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
6075
|
+
'critical'
|
|
6076
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6077
|
+
for the 'warning' level only
|
|
6078
|
+
|
|
6079
|
+
Examples
|
|
6080
|
+
--------
|
|
6081
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6082
|
+
shown below:
|
|
6083
|
+
|
|
6084
|
+
```python
|
|
6085
|
+
import pointblank as pb
|
|
6086
|
+
import polars as pl
|
|
6087
|
+
|
|
6088
|
+
tbl = pl.DataFrame(
|
|
6089
|
+
{
|
|
6090
|
+
"a": [1, 2, 3, 4, 5],
|
|
6091
|
+
"b": [2, 2, 2, 2, 2],
|
|
6092
|
+
}
|
|
6093
|
+
)
|
|
6094
|
+
|
|
6095
|
+
pb.preview(tbl)
|
|
6096
|
+
```
|
|
6097
|
+
|
|
6098
|
+
Let's validate that the average of column `a` is less than `3`:
|
|
6099
|
+
|
|
6100
|
+
```python
|
|
6101
|
+
validation = (
|
|
6102
|
+
pb.Validate(data=tbl)
|
|
6103
|
+
.col_avg_lt(columns="a", value=3)
|
|
6104
|
+
.interrogate()
|
|
6105
|
+
)
|
|
6106
|
+
|
|
6107
|
+
validation
|
|
6108
|
+
```
|
|
6109
|
+
|
|
6110
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
6111
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6112
|
+
|
|
6113
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6114
|
+
|
|
6115
|
+
```python
|
|
6116
|
+
validation = (
|
|
6117
|
+
pb.Validate(data=tbl)
|
|
6118
|
+
.col_avg_lt(columns=["a", "b"], value=3)
|
|
6119
|
+
.interrogate()
|
|
6120
|
+
)
|
|
6121
|
+
|
|
6122
|
+
validation
|
|
6123
|
+
```
|
|
6124
|
+
|
|
6125
|
+
Using tolerance for flexible comparisons:
|
|
6126
|
+
|
|
6127
|
+
```python
|
|
6128
|
+
validation = (
|
|
6129
|
+
pb.Validate(data=tbl)
|
|
6130
|
+
.col_avg_lt(columns="a", value=3, tol=1.0)
|
|
6131
|
+
.interrogate()
|
|
6132
|
+
)
|
|
6133
|
+
|
|
6134
|
+
validation
|
|
6135
|
+
```
|
|
6136
|
+
|
|
6137
|
+
col_avg_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6138
|
+
Does the column average satisfy a greater than or equal to comparison?
|
|
6139
|
+
|
|
6140
|
+
The `col_avg_ge()` validation method checks whether the average of values in a column
|
|
6141
|
+
is at least a specified `value=`. This is an aggregation-based validation where the entire
|
|
6142
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
6143
|
+
comparison used in this function is `average(column) >= value`.
|
|
6144
|
+
|
|
6145
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6146
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6147
|
+
the comparison) or fails completely.
|
|
6148
|
+
|
|
6149
|
+
Parameters
|
|
6150
|
+
----------
|
|
6151
|
+
columns
|
|
6152
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6153
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6154
|
+
contain numeric data for the average to be computed.
|
|
6155
|
+
value
|
|
6156
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
6157
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6158
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6159
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6160
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6161
|
+
`ref(column_name)` when reference data is set).
|
|
6162
|
+
tol
|
|
6163
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6164
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6165
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6166
|
+
`col_avg_ge()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6167
|
+
target value and still pass validation.
|
|
6168
|
+
thresholds
|
|
6169
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6170
|
+
failing test units are detected. Since this is an aggregation-based validation with only
|
|
6171
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6172
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6173
|
+
acceptable.
|
|
6174
|
+
brief
|
|
6175
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6176
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6177
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6178
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6179
|
+
won't be a brief.
|
|
6180
|
+
actions
|
|
6181
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6182
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6183
|
+
define the actions.
|
|
6184
|
+
active
|
|
6185
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6186
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6187
|
+
for the steps unchanged).
|
|
6188
|
+
|
|
6189
|
+
Returns
|
|
6190
|
+
-------
|
|
6191
|
+
Validate
|
|
6192
|
+
The `Validate` object with the added validation step.
|
|
6193
|
+
|
|
6194
|
+
Using Reference Data
|
|
6195
|
+
--------------------
|
|
6196
|
+
The `col_avg_ge()` method supports comparing column aggregations against reference data. This
|
|
6197
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6198
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6199
|
+
|
|
6200
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6201
|
+
|
|
6202
|
+
```python
|
|
6203
|
+
validation = (
|
|
6204
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6205
|
+
.col_avg_ge(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
6206
|
+
.interrogate()
|
|
6207
|
+
)
|
|
6208
|
+
```
|
|
6209
|
+
|
|
6210
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6211
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6212
|
+
the `ref()` helper:
|
|
6213
|
+
|
|
6214
|
+
```python
|
|
6215
|
+
.col_avg_ge(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6216
|
+
```
|
|
6217
|
+
|
|
6218
|
+
Understanding Tolerance
|
|
6219
|
+
-----------------------
|
|
6220
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6221
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6222
|
+
|
|
6223
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6224
|
+
`col_avg_ge()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6225
|
+
target value and still pass validation.
|
|
6226
|
+
|
|
6227
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6228
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6229
|
+
shifts the comparison boundary.
|
|
6230
|
+
|
|
6231
|
+
Thresholds
|
|
6232
|
+
----------
|
|
6233
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6234
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6235
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6236
|
+
|
|
6237
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6238
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6239
|
+
typically set as absolute counts:
|
|
6240
|
+
|
|
6241
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6242
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6243
|
+
|
|
6244
|
+
Thresholds can be defined using one of these input schemes:
|
|
6245
|
+
|
|
6246
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6247
|
+
thresholds)
|
|
6248
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6249
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6250
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6251
|
+
'critical'
|
|
6252
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6253
|
+
for the 'warning' level only
|
|
6254
|
+
|
|
6255
|
+
Examples
|
|
6256
|
+
--------
|
|
6257
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6258
|
+
shown below:
|
|
6259
|
+
|
|
6260
|
+
```python
|
|
6261
|
+
import pointblank as pb
|
|
6262
|
+
import polars as pl
|
|
6263
|
+
|
|
6264
|
+
tbl = pl.DataFrame(
|
|
6265
|
+
{
|
|
6266
|
+
"a": [1, 2, 3, 4, 5],
|
|
6267
|
+
"b": [2, 2, 2, 2, 2],
|
|
6268
|
+
}
|
|
6269
|
+
)
|
|
6270
|
+
|
|
6271
|
+
pb.preview(tbl)
|
|
6272
|
+
```
|
|
6273
|
+
|
|
6274
|
+
Let's validate that the average of column `a` is at least `3`:
|
|
6275
|
+
|
|
6276
|
+
```python
|
|
6277
|
+
validation = (
|
|
6278
|
+
pb.Validate(data=tbl)
|
|
6279
|
+
.col_avg_ge(columns="a", value=3)
|
|
6280
|
+
.interrogate()
|
|
6281
|
+
)
|
|
6282
|
+
|
|
6283
|
+
validation
|
|
6284
|
+
```
|
|
6285
|
+
|
|
6286
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
6287
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6288
|
+
|
|
6289
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6290
|
+
|
|
6291
|
+
```python
|
|
6292
|
+
validation = (
|
|
6293
|
+
pb.Validate(data=tbl)
|
|
6294
|
+
.col_avg_ge(columns=["a", "b"], value=3)
|
|
6295
|
+
.interrogate()
|
|
6296
|
+
)
|
|
6297
|
+
|
|
6298
|
+
validation
|
|
6299
|
+
```
|
|
6300
|
+
|
|
6301
|
+
Using tolerance for flexible comparisons:
|
|
6302
|
+
|
|
6303
|
+
```python
|
|
6304
|
+
validation = (
|
|
6305
|
+
pb.Validate(data=tbl)
|
|
6306
|
+
.col_avg_ge(columns="a", value=3, tol=1.0)
|
|
6307
|
+
.interrogate()
|
|
6308
|
+
)
|
|
6309
|
+
|
|
6310
|
+
validation
|
|
6311
|
+
```
|
|
6312
|
+
|
|
6313
|
+
col_avg_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6314
|
+
Does the column average satisfy a less than or equal to comparison?
|
|
6315
|
+
|
|
6316
|
+
The `col_avg_le()` validation method checks whether the average of values in a column
|
|
6317
|
+
is at most a specified `value=`. This is an aggregation-based validation where the entire
|
|
6318
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
6319
|
+
comparison used in this function is `average(column) <= value`.
|
|
6320
|
+
|
|
6321
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6322
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6323
|
+
the comparison) or fails completely.
|
|
6324
|
+
|
|
6325
|
+
Parameters
|
|
6326
|
+
----------
|
|
6327
|
+
columns
|
|
6328
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6329
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6330
|
+
contain numeric data for the average to be computed.
|
|
6331
|
+
value
|
|
6332
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
6333
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6334
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6335
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6336
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6337
|
+
`ref(column_name)` when reference data is set).
|
|
6338
|
+
tol
|
|
6339
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6340
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6341
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6342
|
+
`col_avg_le()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6343
|
+
target value and still pass validation.
|
|
6344
|
+
thresholds
|
|
6345
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6346
|
+
failing test units are detected. Since this is an aggregation-based validation with only
|
|
6347
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6348
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6349
|
+
acceptable.
|
|
6350
|
+
brief
|
|
6351
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6352
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6353
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6354
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6355
|
+
won't be a brief.
|
|
6356
|
+
actions
|
|
6357
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6358
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6359
|
+
define the actions.
|
|
6360
|
+
active
|
|
6361
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6362
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6363
|
+
for the steps unchanged).
|
|
6364
|
+
|
|
6365
|
+
Returns
|
|
6366
|
+
-------
|
|
6367
|
+
Validate
|
|
6368
|
+
The `Validate` object with the added validation step.
|
|
6369
|
+
|
|
6370
|
+
Using Reference Data
|
|
6371
|
+
--------------------
|
|
6372
|
+
The `col_avg_le()` method supports comparing column aggregations against reference data. This
|
|
6373
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6374
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6375
|
+
|
|
6376
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6377
|
+
|
|
6378
|
+
```python
|
|
6379
|
+
validation = (
|
|
6380
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6381
|
+
.col_avg_le(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
6382
|
+
.interrogate()
|
|
6383
|
+
)
|
|
6384
|
+
```
|
|
6385
|
+
|
|
6386
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6387
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6388
|
+
the `ref()` helper:
|
|
6389
|
+
|
|
6390
|
+
```python
|
|
6391
|
+
.col_avg_le(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6392
|
+
```
|
|
6393
|
+
|
|
6394
|
+
Understanding Tolerance
|
|
6395
|
+
-----------------------
|
|
6396
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6397
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6398
|
+
|
|
6399
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6400
|
+
`col_avg_le()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the
|
|
6401
|
+
target value and still pass validation.
|
|
6402
|
+
|
|
6403
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6404
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6405
|
+
shifts the comparison boundary.
|
|
6406
|
+
|
|
6407
|
+
Thresholds
|
|
6408
|
+
----------
|
|
6409
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6410
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6411
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6412
|
+
|
|
6413
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6414
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6415
|
+
typically set as absolute counts:
|
|
6416
|
+
|
|
6417
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6418
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6419
|
+
|
|
6420
|
+
Thresholds can be defined using one of these input schemes:
|
|
6421
|
+
|
|
6422
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6423
|
+
thresholds)
|
|
6424
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6425
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6426
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6427
|
+
'critical'
|
|
6428
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6429
|
+
for the 'warning' level only
|
|
6430
|
+
|
|
6431
|
+
Examples
|
|
6432
|
+
--------
|
|
6433
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6434
|
+
shown below:
|
|
6435
|
+
|
|
6436
|
+
```python
|
|
6437
|
+
import pointblank as pb
|
|
6438
|
+
import polars as pl
|
|
6439
|
+
|
|
6440
|
+
tbl = pl.DataFrame(
|
|
6441
|
+
{
|
|
6442
|
+
"a": [1, 2, 3, 4, 5],
|
|
6443
|
+
"b": [2, 2, 2, 2, 2],
|
|
6444
|
+
}
|
|
6445
|
+
)
|
|
6446
|
+
|
|
6447
|
+
pb.preview(tbl)
|
|
6448
|
+
```
|
|
6449
|
+
|
|
6450
|
+
Let's validate that the average of column `a` is at most `3`:
|
|
6451
|
+
|
|
6452
|
+
```python
|
|
6453
|
+
validation = (
|
|
6454
|
+
pb.Validate(data=tbl)
|
|
6455
|
+
.col_avg_le(columns="a", value=3)
|
|
6456
|
+
.interrogate()
|
|
6457
|
+
)
|
|
6458
|
+
|
|
6459
|
+
validation
|
|
6460
|
+
```
|
|
6461
|
+
|
|
6462
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
6463
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6464
|
+
|
|
6465
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6466
|
+
|
|
6467
|
+
```python
|
|
6468
|
+
validation = (
|
|
6469
|
+
pb.Validate(data=tbl)
|
|
6470
|
+
.col_avg_le(columns=["a", "b"], value=3)
|
|
6471
|
+
.interrogate()
|
|
6472
|
+
)
|
|
6473
|
+
|
|
6474
|
+
validation
|
|
6475
|
+
```
|
|
6476
|
+
|
|
6477
|
+
Using tolerance for flexible comparisons:
|
|
6478
|
+
|
|
6479
|
+
```python
|
|
6480
|
+
validation = (
|
|
6481
|
+
pb.Validate(data=tbl)
|
|
6482
|
+
.col_avg_le(columns="a", value=3, tol=1.0)
|
|
6483
|
+
.interrogate()
|
|
6484
|
+
)
|
|
6485
|
+
|
|
6486
|
+
validation
|
|
6487
|
+
```
|
|
6488
|
+
|
|
6489
|
+
col_avg_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6490
|
+
Does the column average satisfy an equal to comparison?
|
|
6491
|
+
|
|
6492
|
+
The `col_avg_eq()` validation method checks whether the average of values in a column
|
|
6493
|
+
equals a specified `value=`. This is an aggregation-based validation where the entire
|
|
6494
|
+
column is reduced to a single average value that is then compared against the target. The
|
|
6495
|
+
comparison used in this function is `average(column) == value`.
|
|
6496
|
+
|
|
6497
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6498
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6499
|
+
the comparison) or fails completely.
|
|
6500
|
+
|
|
6501
|
+
Parameters
|
|
6502
|
+
----------
|
|
6503
|
+
columns
|
|
6504
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6505
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6506
|
+
contain numeric data for the average to be computed.
|
|
6507
|
+
value
|
|
6508
|
+
The value to compare the column average against. This can be: (1) a numeric literal
|
|
6509
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6510
|
+
whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6511
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6512
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6513
|
+
`ref(column_name)` when reference data is set).
|
|
6514
|
+
tol
|
|
6515
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6516
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6517
|
+
an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_avg_eq()` since exact equality
|
|
6518
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
6519
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
6520
|
+
floating-point arithmetic.
|
|
6521
|
+
thresholds
|
|
6522
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6523
|
+
failing test units are detected. Since this is an aggregation-based validation with only
|
|
6524
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6525
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6526
|
+
acceptable.
|
|
6527
|
+
brief
|
|
6528
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6529
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6530
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6531
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6532
|
+
won't be a brief.
|
|
6533
|
+
actions
|
|
6534
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6535
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6536
|
+
define the actions.
|
|
6537
|
+
active
|
|
6538
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6539
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6540
|
+
for the steps unchanged).
|
|
6541
|
+
|
|
6542
|
+
Returns
|
|
6543
|
+
-------
|
|
6544
|
+
Validate
|
|
6545
|
+
The `Validate` object with the added validation step.
|
|
6546
|
+
|
|
6547
|
+
Using Reference Data
|
|
6548
|
+
--------------------
|
|
6549
|
+
The `col_avg_eq()` method supports comparing column aggregations against reference data. This
|
|
6550
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6551
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6552
|
+
|
|
6553
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6554
|
+
|
|
6555
|
+
```python
|
|
6556
|
+
validation = (
|
|
6557
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6558
|
+
.col_avg_eq(columns="revenue")  # Compares avg(current.revenue) vs avg(baseline.revenue)
|
|
6559
|
+
.interrogate()
|
|
6560
|
+
)
|
|
6561
|
+
```
|
|
6562
|
+
|
|
6563
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6564
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6565
|
+
the `ref()` helper:
|
|
6566
|
+
|
|
6567
|
+
```python
|
|
6568
|
+
.col_avg_eq(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6569
|
+
```
|
|
6570
|
+
|
|
6571
|
+
Understanding Tolerance
|
|
6572
|
+
-----------------------
|
|
6573
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6574
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6575
|
+
|
|
6576
|
+
The `tol=` parameter is particularly useful with `col_avg_eq()` since exact equality
|
|
6577
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
6578
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
6579
|
+
floating-point arithmetic.
|
|
6580
|
+
|
|
6581
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6582
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6583
|
+
shifts the comparison boundary.
|
|
6584
|
+
|
|
6585
|
+
Thresholds
|
|
6586
|
+
----------
|
|
6587
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6588
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6589
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6590
|
+
|
|
6591
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6592
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6593
|
+
typically set as absolute counts:
|
|
6594
|
+
|
|
6595
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6596
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6597
|
+
|
|
6598
|
+
Thresholds can be defined using one of these input schemes:
|
|
6599
|
+
|
|
6600
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6601
|
+
thresholds)
|
|
6602
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6603
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6604
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6605
|
+
'critical'
|
|
6606
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6607
|
+
for the 'warning' level only
|
|
6608
|
+
|
|
6609
|
+
Examples
|
|
6610
|
+
--------
|
|
6611
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6612
|
+
shown below:
|
|
6613
|
+
|
|
6614
|
+
```python
|
|
6615
|
+
import pointblank as pb
|
|
6616
|
+
import polars as pl
|
|
6617
|
+
|
|
6618
|
+
tbl = pl.DataFrame(
|
|
6619
|
+
{
|
|
6620
|
+
"a": [1, 2, 3, 4, 5],
|
|
6621
|
+
"b": [2, 2, 2, 2, 2],
|
|
6622
|
+
}
|
|
6623
|
+
)
|
|
6624
|
+
|
|
6625
|
+
pb.preview(tbl)
|
|
6626
|
+
```
|
|
6627
|
+
|
|
6628
|
+
Let's validate that the average of column `a` equals `3`:
|
|
6629
|
+
|
|
6630
|
+
```python
|
|
6631
|
+
validation = (
|
|
6632
|
+
pb.Validate(data=tbl)
|
|
6633
|
+
.col_avg_eq(columns="a", value=3)
|
|
6634
|
+
.interrogate()
|
|
6635
|
+
)
|
|
6636
|
+
|
|
6637
|
+
validation
|
|
6638
|
+
```
|
|
6639
|
+
|
|
6640
|
+
The validation result shows whether the average comparison passed or failed. Since this
|
|
6641
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6642
|
+
|
|
6643
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6644
|
+
|
|
6645
|
+
```python
|
|
6646
|
+
validation = (
|
|
6647
|
+
pb.Validate(data=tbl)
|
|
6648
|
+
.col_avg_eq(columns=["a", "b"], value=3)
|
|
6649
|
+
.interrogate()
|
|
6650
|
+
)
|
|
6651
|
+
|
|
6652
|
+
validation
|
|
6653
|
+
```
|
|
6654
|
+
|
|
6655
|
+
Using tolerance for flexible comparisons:
|
|
6656
|
+
|
|
6657
|
+
```python
|
|
6658
|
+
validation = (
|
|
6659
|
+
pb.Validate(data=tbl)
|
|
6660
|
+
.col_avg_eq(columns="a", value=3, tol=1.0)
|
|
6661
|
+
.interrogate()
|
|
6662
|
+
)
|
|
6663
|
+
|
|
6664
|
+
validation
|
|
6665
|
+
```
|
|
6666
|
+
|
|
6667
|
+
col_sd_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6668
|
+
Does the column standard deviation satisfy a greater than comparison?
|
|
6669
|
+
|
|
6670
|
+
The `col_sd_gt()` validation method checks whether the standard deviation of values in a column
|
|
6671
|
+
is greater than a specified `value=`. This is an aggregation-based validation where the entire
|
|
6672
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
6673
|
+
comparison used in this function is `standard deviation(column) > value`.
|
|
6674
|
+
|
|
6675
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6676
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6677
|
+
the comparison) or fails completely.
|
|
6678
|
+
|
|
6679
|
+
Parameters
|
|
6680
|
+
----------
|
|
6681
|
+
columns
|
|
6682
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6683
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6684
|
+
contain numeric data for the standard deviation to be computed.
|
|
6685
|
+
value
|
|
6686
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
6687
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6688
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6689
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6690
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6691
|
+
`ref(column_name)` when reference data is set).
|
|
6692
|
+
tol
|
|
6693
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6694
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6695
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6696
|
+
`col_sd_gt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
6697
|
+
target value and still pass validation.
|
|
6698
|
+
thresholds
|
|
6699
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6700
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
6701
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6702
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6703
|
+
acceptable.
|
|
6704
|
+
brief
|
|
6705
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6706
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6707
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6708
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6709
|
+
won't be a brief.
|
|
6710
|
+
actions
|
|
6711
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6712
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6713
|
+
define the actions.
|
|
6714
|
+
active
|
|
6715
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6716
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6717
|
+
for the steps unchanged).
|
|
6718
|
+
|
|
6719
|
+
Returns
|
|
6720
|
+
-------
|
|
6721
|
+
Validate
|
|
6722
|
+
The `Validate` object with the added validation step.
|
|
6723
|
+
|
|
6724
|
+
Using Reference Data
|
|
6725
|
+
--------------------
|
|
6726
|
+
The `col_sd_gt()` method supports comparing column aggregations against reference data. This
|
|
6727
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6728
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6729
|
+
|
|
6730
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6731
|
+
|
|
6732
|
+
```python
|
|
6733
|
+
validation = (
|
|
6734
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6735
|
+
.col_sd_gt(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
6736
|
+
.interrogate()
|
|
6737
|
+
)
|
|
6738
|
+
```
|
|
6739
|
+
|
|
6740
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6741
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6742
|
+
the `ref()` helper:
|
|
6743
|
+
|
|
6744
|
+
```python
|
|
6745
|
+
.col_sd_gt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6746
|
+
```
|
|
6747
|
+
|
|
6748
|
+
Understanding Tolerance
|
|
6749
|
+
-----------------------
|
|
6750
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6751
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6752
|
+
|
|
6753
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6754
|
+
`col_sd_gt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
6755
|
+
target value and still pass validation.
|
|
6756
|
+
|
|
6757
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6758
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6759
|
+
shifts the comparison boundary.
|
|
6760
|
+
|
|
6761
|
+
Thresholds
|
|
6762
|
+
----------
|
|
6763
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6764
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6765
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6766
|
+
|
|
6767
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6768
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6769
|
+
typically set as absolute counts:
|
|
6770
|
+
|
|
6771
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6772
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6773
|
+
|
|
6774
|
+
Thresholds can be defined using one of these input schemes:
|
|
6775
|
+
|
|
6776
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6777
|
+
thresholds)
|
|
6778
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6779
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6780
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6781
|
+
'critical'
|
|
6782
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6783
|
+
for the 'warning' level only
|
|
6784
|
+
|
|
6785
|
+
Examples
|
|
6786
|
+
--------
|
|
6787
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6788
|
+
shown below:
|
|
6789
|
+
|
|
6790
|
+
```python
|
|
6791
|
+
import pointblank as pb
|
|
6792
|
+
import polars as pl
|
|
6793
|
+
|
|
6794
|
+
tbl = pl.DataFrame(
|
|
6795
|
+
{
|
|
6796
|
+
"a": [1, 2, 3, 4, 5],
|
|
6797
|
+
"b": [2, 2, 2, 2, 2],
|
|
6798
|
+
}
|
|
6799
|
+
)
|
|
6800
|
+
|
|
6801
|
+
pb.preview(tbl)
|
|
6802
|
+
```
|
|
6803
|
+
|
|
6804
|
+
Let's validate that the standard deviation of column `a` is greater than `2`:
|
|
6805
|
+
|
|
6806
|
+
```python
|
|
6807
|
+
validation = (
|
|
6808
|
+
pb.Validate(data=tbl)
|
|
6809
|
+
.col_sd_gt(columns="a", value=2)
|
|
6810
|
+
.interrogate()
|
|
6811
|
+
)
|
|
6812
|
+
|
|
6813
|
+
validation
|
|
6814
|
+
```
|
|
6815
|
+
|
|
6816
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
6817
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6818
|
+
|
|
6819
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6820
|
+
|
|
6821
|
+
```python
|
|
6822
|
+
validation = (
|
|
6823
|
+
pb.Validate(data=tbl)
|
|
6824
|
+
.col_sd_gt(columns=["a", "b"], value=2)
|
|
6825
|
+
.interrogate()
|
|
6826
|
+
)
|
|
6827
|
+
|
|
6828
|
+
validation
|
|
6829
|
+
```
|
|
6830
|
+
|
|
6831
|
+
Using tolerance for flexible comparisons:
|
|
6832
|
+
|
|
6833
|
+
```python
|
|
6834
|
+
validation = (
|
|
6835
|
+
pb.Validate(data=tbl)
|
|
6836
|
+
.col_sd_gt(columns="a", value=2, tol=1.0)
|
|
6837
|
+
.interrogate()
|
|
6838
|
+
)
|
|
6839
|
+
|
|
6840
|
+
validation
|
|
6841
|
+
```
|
|
6842
|
+
|
|
6843
|
+
col_sd_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6844
|
+
Does the column standard deviation satisfy a less than comparison?
|
|
6845
|
+
|
|
6846
|
+
The `col_sd_lt()` validation method checks whether the standard deviation of values in a column
|
|
6847
|
+
is less than a specified `value=`. This is an aggregation-based validation where the entire
|
|
6848
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
6849
|
+
comparison used in this function is `standard deviation(column) < value`.
|
|
6850
|
+
|
|
6851
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
6852
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
6853
|
+
the comparison) or fails completely.
|
|
6854
|
+
|
|
6855
|
+
Parameters
|
|
6856
|
+
----------
|
|
6857
|
+
columns
|
|
6858
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
6859
|
+
there will be a separate validation step generated for each column. The columns must
|
|
6860
|
+
contain numeric data for the standard deviation to be computed.
|
|
6861
|
+
value
|
|
6862
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
6863
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
6864
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
6865
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
6866
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
6867
|
+
`ref(column_name)` when reference data is set).
|
|
6868
|
+
tol
|
|
6869
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
6870
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
6871
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6872
|
+
`col_sd_lt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
6873
|
+
target value and still pass validation.
|
|
6874
|
+
thresholds
|
|
6875
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
6876
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
6877
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
6878
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
6879
|
+
acceptable.
|
|
6880
|
+
brief
|
|
6881
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6882
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6883
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6884
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6885
|
+
won't be a brief.
|
|
6886
|
+
actions
|
|
6887
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6888
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6889
|
+
define the actions.
|
|
6890
|
+
active
|
|
6891
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6892
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6893
|
+
for the steps unchanged).
|
|
6894
|
+
|
|
6895
|
+
Returns
|
|
6896
|
+
-------
|
|
6897
|
+
Validate
|
|
6898
|
+
The `Validate` object with the added validation step.
|
|
6899
|
+
|
|
6900
|
+
Using Reference Data
|
|
6901
|
+
--------------------
|
|
6902
|
+
The `col_sd_lt()` method supports comparing column aggregations against reference data. This
|
|
6903
|
+
is useful for validating that statistical properties remain consistent across different
|
|
6904
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
6905
|
+
|
|
6906
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
6907
|
+
|
|
6908
|
+
```python
|
|
6909
|
+
validation = (
|
|
6910
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
6911
|
+
.col_sd_lt(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
6912
|
+
.interrogate()
|
|
6913
|
+
)
|
|
6914
|
+
```
|
|
6915
|
+
|
|
6916
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
6917
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
6918
|
+
the `ref()` helper:
|
|
6919
|
+
|
|
6920
|
+
```python
|
|
6921
|
+
.col_sd_lt(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
6922
|
+
```
|
|
6923
|
+
|
|
6924
|
+
Understanding Tolerance
|
|
6925
|
+
-----------------------
|
|
6926
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
6927
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
6928
|
+
|
|
6929
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
6930
|
+
`col_sd_lt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
6931
|
+
target value and still pass validation.
|
|
6932
|
+
|
|
6933
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
6934
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
6935
|
+
shifts the comparison boundary.
|
|
6936
|
+
|
|
6937
|
+
Thresholds
|
|
6938
|
+
----------
|
|
6939
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6940
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6941
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6942
|
+
|
|
6943
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
6944
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
6945
|
+
typically set as absolute counts:
|
|
6946
|
+
|
|
6947
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
6948
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
6949
|
+
|
|
6950
|
+
Thresholds can be defined using one of these input schemes:
|
|
6951
|
+
|
|
6952
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6953
|
+
thresholds)
|
|
6954
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6955
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6956
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6957
|
+
'critical'
|
|
6958
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6959
|
+
for the 'warning' level only
|
|
6960
|
+
|
|
6961
|
+
Examples
|
|
6962
|
+
--------
|
|
6963
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
6964
|
+
shown below:
|
|
6965
|
+
|
|
6966
|
+
```python
|
|
6967
|
+
import pointblank as pb
|
|
6968
|
+
import polars as pl
|
|
6969
|
+
|
|
6970
|
+
tbl = pl.DataFrame(
|
|
6971
|
+
{
|
|
6972
|
+
"a": [1, 2, 3, 4, 5],
|
|
6973
|
+
"b": [2, 2, 2, 2, 2],
|
|
6974
|
+
}
|
|
6975
|
+
)
|
|
6976
|
+
|
|
6977
|
+
pb.preview(tbl)
|
|
6978
|
+
```
|
|
6979
|
+
|
|
6980
|
+
Let's validate that the standard deviation of column `a` is less than `2`:
|
|
6981
|
+
|
|
6982
|
+
```python
|
|
6983
|
+
validation = (
|
|
6984
|
+
pb.Validate(data=tbl)
|
|
6985
|
+
.col_sd_lt(columns="a", value=2)
|
|
6986
|
+
.interrogate()
|
|
6987
|
+
)
|
|
6988
|
+
|
|
6989
|
+
validation
|
|
6990
|
+
```
|
|
6991
|
+
|
|
6992
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
6993
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
6994
|
+
|
|
6995
|
+
When validating multiple columns, each column gets its own validation step:
|
|
6996
|
+
|
|
6997
|
+
```python
|
|
6998
|
+
validation = (
|
|
6999
|
+
pb.Validate(data=tbl)
|
|
7000
|
+
.col_sd_lt(columns=["a", "b"], value=2)
|
|
7001
|
+
.interrogate()
|
|
7002
|
+
)
|
|
7003
|
+
|
|
7004
|
+
validation
|
|
7005
|
+
```
|
|
7006
|
+
|
|
7007
|
+
Using tolerance for flexible comparisons:
|
|
7008
|
+
|
|
7009
|
+
```python
|
|
7010
|
+
validation = (
|
|
7011
|
+
pb.Validate(data=tbl)
|
|
7012
|
+
.col_sd_lt(columns="a", value=2, tol=1.0)
|
|
7013
|
+
.interrogate()
|
|
7014
|
+
)
|
|
7015
|
+
|
|
7016
|
+
validation
|
|
7017
|
+
```
|
|
7018
|
+
|
|
7019
|
+
col_sd_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7020
|
+
Does the column standard deviation satisfy a greater than or equal to comparison?
|
|
7021
|
+
|
|
7022
|
+
The `col_sd_ge()` validation method checks whether the standard deviation of values in a column
|
|
7023
|
+
is at least a specified `value=`. This is an aggregation-based validation where the entire
|
|
7024
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
7025
|
+
comparison used in this function is `standard deviation(column) >= value`.
|
|
7026
|
+
|
|
7027
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
7028
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
7029
|
+
the comparison) or fails completely.
|
|
7030
|
+
|
|
7031
|
+
Parameters
|
|
7032
|
+
----------
|
|
7033
|
+
columns
|
|
7034
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
7035
|
+
there will be a separate validation step generated for each column. The columns must
|
|
7036
|
+
contain numeric data for the standard deviation to be computed.
|
|
7037
|
+
value
|
|
7038
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
7039
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
7040
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
7041
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
7042
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
7043
|
+
`ref(column_name)` when reference data is set).
|
|
7044
|
+
tol
|
|
7045
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
7046
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
7047
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
7048
|
+
`col_sd_ge()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
7049
|
+
target value and still pass validation.
|
|
7050
|
+
thresholds
|
|
7051
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
7052
|
+
failing test units are found. Since this is an aggregation-based validation with only
|
|
7053
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
7054
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
7055
|
+
acceptable.
|
|
7056
|
+
brief
|
|
7057
|
+
An optional brief description of the validation step that will be displayed in the
|
|
7058
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
7059
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
7060
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
7061
|
+
won't be a brief.
|
|
7062
|
+
actions
|
|
7063
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
7064
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
7065
|
+
define the actions.
|
|
7066
|
+
active
|
|
7067
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
7068
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
7069
|
+
for the steps unchanged).
|
|
7070
|
+
|
|
7071
|
+
Returns
|
|
7072
|
+
-------
|
|
7073
|
+
Validate
|
|
7074
|
+
The `Validate` object with the added validation step.
|
|
7075
|
+
|
|
7076
|
+
Using Reference Data
|
|
7077
|
+
--------------------
|
|
7078
|
+
The `col_sd_ge()` method supports comparing column aggregations against reference data. This
|
|
7079
|
+
is useful for validating that statistical properties remain consistent across different
|
|
7080
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
7081
|
+
|
|
7082
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
7083
|
+
|
|
7084
|
+
```python
|
|
7085
|
+
validation = (
|
|
7086
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
7087
|
+
.col_sd_ge(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
7088
|
+
.interrogate()
|
|
7089
|
+
)
|
|
7090
|
+
```
|
|
7091
|
+
|
|
7092
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
7093
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
7094
|
+
the `ref()` helper:
|
|
7095
|
+
|
|
7096
|
+
```python
|
|
7097
|
+
.col_sd_ge(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
7098
|
+
```
|
|
7099
|
+
|
|
7100
|
+
Understanding Tolerance
|
|
7101
|
+
-----------------------
|
|
7102
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
7103
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
7104
|
+
|
|
7105
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
7106
|
+
`col_sd_ge()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
7107
|
+
target value and still pass validation.
|
|
7108
|
+
|
|
7109
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
7110
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
7111
|
+
shifts the comparison boundary.
|
|
7112
|
+
|
|
7113
|
+
Thresholds
|
|
7114
|
+
----------
|
|
7115
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
7116
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
7117
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
7118
|
+
|
|
7119
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
7120
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
7121
|
+
typically set as absolute counts:
|
|
7122
|
+
|
|
7123
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
7124
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
7125
|
+
|
|
7126
|
+
Thresholds can be defined using one of these input schemes:
|
|
7127
|
+
|
|
7128
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
7129
|
+
thresholds)
|
|
7130
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
7131
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
7132
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
7133
|
+
'critical'
|
|
7134
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
7135
|
+
for the 'warning' level only
|
|
7136
|
+
|
|
7137
|
+
Examples
|
|
7138
|
+
--------
|
|
7139
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
7140
|
+
shown below:
|
|
7141
|
+
|
|
7142
|
+
```python
|
|
7143
|
+
import pointblank as pb
|
|
7144
|
+
import polars as pl
|
|
7145
|
+
|
|
7146
|
+
tbl = pl.DataFrame(
|
|
7147
|
+
{
|
|
7148
|
+
"a": [1, 2, 3, 4, 5],
|
|
7149
|
+
"b": [2, 2, 2, 2, 2],
|
|
7150
|
+
}
|
|
7151
|
+
)
|
|
7152
|
+
|
|
7153
|
+
pb.preview(tbl)
|
|
7154
|
+
```
|
|
7155
|
+
|
|
7156
|
+
Let's validate that the standard deviation of column `a` is at least `2`:
|
|
7157
|
+
|
|
7158
|
+
```python
|
|
7159
|
+
validation = (
|
|
7160
|
+
pb.Validate(data=tbl)
|
|
7161
|
+
.col_sd_ge(columns="a", value=2)
|
|
7162
|
+
.interrogate()
|
|
7163
|
+
)
|
|
7164
|
+
|
|
7165
|
+
validation
|
|
7166
|
+
```
|
|
7167
|
+
|
|
7168
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
7169
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
7170
|
+
|
|
7171
|
+
When validating multiple columns, each column gets its own validation step:
|
|
7172
|
+
|
|
7173
|
+
```python
|
|
7174
|
+
validation = (
|
|
7175
|
+
pb.Validate(data=tbl)
|
|
7176
|
+
.col_sd_ge(columns=["a", "b"], value=2)
|
|
7177
|
+
.interrogate()
|
|
7178
|
+
)
|
|
7179
|
+
|
|
7180
|
+
validation
|
|
7181
|
+
```
|
|
7182
|
+
|
|
7183
|
+
Using tolerance for flexible comparisons:
|
|
7184
|
+
|
|
7185
|
+
```python
|
|
7186
|
+
validation = (
|
|
7187
|
+
pb.Validate(data=tbl)
|
|
7188
|
+
.col_sd_ge(columns="a", value=2, tol=1.0)
|
|
7189
|
+
.interrogate()
|
|
7190
|
+
)
|
|
7191
|
+
|
|
7192
|
+
validation
|
|
7193
|
+
```
|
|
7194
|
+
|
|
7195
|
+
col_sd_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7196
|
+
Does the column standard deviation satisfy a less than or equal to comparison?
|
|
7197
|
+
|
|
7198
|
+
The `col_sd_le()` validation method checks whether the standard deviation of values in a column
|
|
7199
|
+
is at most a specified `value=`. This is an aggregation-based validation where the entire
|
|
7200
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
7201
|
+
comparison used in this function is `standard deviation(column) <= value`.
|
|
7202
|
+
|
|
7203
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
7204
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
7205
|
+
the comparison) or fails completely.
|
|
7206
|
+
|
|
7207
|
+
Parameters
|
|
7208
|
+
----------
|
|
7209
|
+
columns
|
|
7210
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
7211
|
+
there will be a separate validation step generated for each column. The columns must
|
|
7212
|
+
contain numeric data for the standard deviation to be computed.
|
|
7213
|
+
value
|
|
7214
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
7215
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
7216
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
7217
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
7218
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
7219
|
+
`ref(column_name)` when reference data is set).
|
|
7220
|
+
tol
|
|
7221
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
7222
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
7223
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For
|
|
7224
|
+
`col_sd_le()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
7225
|
+
target value and still pass validation.
|
|
7226
|
+
thresholds
|
|
7227
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
7228
|
+
failing test units exceed the set levels. Since this is an aggregation-based validation with only
|
|
7229
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
7230
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
7231
|
+
acceptable.
|
|
7232
|
+
brief
|
|
7233
|
+
An optional brief description of the validation step that will be displayed in the
|
|
7234
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
7235
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
7236
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
7237
|
+
won't be a brief.
|
|
7238
|
+
actions
|
|
7239
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
7240
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
7241
|
+
define the actions.
|
|
7242
|
+
active
|
|
7243
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
7244
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
7245
|
+
for the steps unchanged).
|
|
7246
|
+
|
|
7247
|
+
Returns
|
|
7248
|
+
-------
|
|
7249
|
+
Validate
|
|
7250
|
+
The `Validate` object with the added validation step.
|
|
7251
|
+
|
|
7252
|
+
Using Reference Data
|
|
7253
|
+
--------------------
|
|
7254
|
+
The `col_sd_le()` method supports comparing column aggregations against reference data. This
|
|
7255
|
+
is useful for validating that statistical properties remain consistent across different
|
|
7256
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
7257
|
+
|
|
7258
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
7259
|
+
|
|
7260
|
+
```python
|
|
7261
|
+
validation = (
|
|
7262
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
7263
|
+
.col_sd_le(columns="revenue")  # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
7264
|
+
.interrogate()
|
|
7265
|
+
)
|
|
7266
|
+
```
|
|
7267
|
+
|
|
7268
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
7269
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
7270
|
+
the `ref()` helper:
|
|
7271
|
+
|
|
7272
|
+
```python
|
|
7273
|
+
.col_sd_le(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
7274
|
+
```
|
|
7275
|
+
|
|
7276
|
+
Understanding Tolerance
|
|
7277
|
+
-----------------------
|
|
7278
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
7279
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
7280
|
+
|
|
7281
|
+
The `tol=` parameter expands the acceptable range for the comparison. For
|
|
7282
|
+
`col_sd_le()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the
|
|
7283
|
+
target value and still pass validation.
|
|
7284
|
+
|
|
7285
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
7286
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
7287
|
+
shifts the comparison boundary.
|
|
7288
|
+
|
|
7289
|
+
Thresholds
|
|
7290
|
+
----------
|
|
7291
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
7292
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
7293
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
7294
|
+
|
|
7295
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
7296
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
7297
|
+
typically set as absolute counts:
|
|
7298
|
+
|
|
7299
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
7300
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
7301
|
+
|
|
7302
|
+
Thresholds can be defined using one of these input schemes:
|
|
7303
|
+
|
|
7304
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
7305
|
+
thresholds)
|
|
7306
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
7307
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
7308
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
7309
|
+
'critical'
|
|
7310
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
7311
|
+
for the 'warning' level only
|
|
7312
|
+
|
|
7313
|
+
Examples
|
|
7314
|
+
--------
|
|
7315
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
7316
|
+
shown below:
|
|
7317
|
+
|
|
7318
|
+
```python
|
|
7319
|
+
import pointblank as pb
|
|
7320
|
+
import polars as pl
|
|
7321
|
+
|
|
7322
|
+
tbl = pl.DataFrame(
|
|
7323
|
+
{
|
|
7324
|
+
"a": [1, 2, 3, 4, 5],
|
|
7325
|
+
"b": [2, 2, 2, 2, 2],
|
|
7326
|
+
}
|
|
7327
|
+
)
|
|
7328
|
+
|
|
7329
|
+
pb.preview(tbl)
|
|
7330
|
+
```
|
|
7331
|
+
|
|
7332
|
+
Let's validate that the standard deviation of column `a` is at most `2`:
|
|
7333
|
+
|
|
7334
|
+
```python
|
|
7335
|
+
validation = (
|
|
7336
|
+
pb.Validate(data=tbl)
|
|
7337
|
+
.col_sd_le(columns="a", value=2)
|
|
7338
|
+
.interrogate()
|
|
7339
|
+
)
|
|
7340
|
+
|
|
7341
|
+
validation
|
|
7342
|
+
```
|
|
7343
|
+
|
|
7344
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
7345
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
7346
|
+
|
|
7347
|
+
When validating multiple columns, each column gets its own validation step:
|
|
7348
|
+
|
|
7349
|
+
```python
|
|
7350
|
+
validation = (
|
|
7351
|
+
pb.Validate(data=tbl)
|
|
7352
|
+
.col_sd_le(columns=["a", "b"], value=2)
|
|
7353
|
+
.interrogate()
|
|
7354
|
+
)
|
|
7355
|
+
|
|
7356
|
+
validation
|
|
7357
|
+
```
|
|
7358
|
+
|
|
7359
|
+
Using tolerance for flexible comparisons:
|
|
7360
|
+
|
|
7361
|
+
```python
|
|
7362
|
+
validation = (
|
|
7363
|
+
pb.Validate(data=tbl)
|
|
7364
|
+
.col_sd_le(columns="a", value=2, tol=1.0)
|
|
7365
|
+
.interrogate()
|
|
7366
|
+
)
|
|
7367
|
+
|
|
7368
|
+
validation
|
|
7369
|
+
```
|
|
7370
|
+
|
|
7371
|
+
col_sd_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7372
|
+
Does the column standard deviation satisfy an equal to comparison?
|
|
7373
|
+
|
|
7374
|
+
The `col_sd_eq()` validation method checks whether the standard deviation of values in a column
|
|
7375
|
+
equals a specified `value=`. This is an aggregation-based validation where the entire
|
|
7376
|
+
column is reduced to a single standard deviation value that is then compared against the target. The
|
|
7377
|
+
comparison used in this function is `standard deviation(column) == value`.
|
|
7378
|
+
|
|
7379
|
+
Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
|
|
7380
|
+
a single test unit. The validation either passes completely (if the aggregated value satisfies
|
|
7381
|
+
the comparison) or fails completely.
|
|
7382
|
+
|
|
7383
|
+
Parameters
|
|
7384
|
+
----------
|
|
7385
|
+
columns
|
|
7386
|
+
A single column or a list of columns to validate. If multiple columns are supplied,
|
|
7387
|
+
there will be a separate validation step generated for each column. The columns must
|
|
7388
|
+
contain numeric data for the standard deviation to be computed.
|
|
7389
|
+
value
|
|
7390
|
+
The value to compare the column standard deviation against. This can be: (1) a numeric literal
|
|
7391
|
+
(`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
|
|
7392
|
+
whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
|
|
7393
|
+
referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
|
|
7394
|
+
`None` to automatically compare against the same column in reference data (shorthand for
|
|
7395
|
+
`ref(column_name)` when reference data is set).
|
|
7396
|
+
tol
|
|
7397
|
+
A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
|
|
7398
|
+
set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
|
|
7399
|
+
a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_sd_eq()` since exact equality
|
|
7400
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
7401
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
7402
|
+
floating-point arithmetic.
|
|
7403
|
+
thresholds
|
|
7404
|
+
Failure threshold levels so that the validation step can react accordingly when
|
|
7405
|
+
failing test units exceed the set levels. Since this is an aggregation-based validation with only
|
|
7406
|
+
one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
|
|
7407
|
+
indicate pass/fail, or as proportions where any value less than `1.0` means failure is
|
|
7408
|
+
acceptable.
|
|
7409
|
+
brief
|
|
7410
|
+
An optional brief description of the validation step that will be displayed in the
|
|
7411
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
7412
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
7413
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
7414
|
+
won't be a brief.
|
|
7415
|
+
actions
|
|
7416
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
7417
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
7418
|
+
define the actions.
|
|
7419
|
+
active
|
|
7420
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
7421
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
7422
|
+
for the steps unchanged).
|
|
7423
|
+
|
|
7424
|
+
Returns
|
|
7425
|
+
-------
|
|
7426
|
+
Validate
|
|
7427
|
+
The `Validate` object with the added validation step.
|
|
7428
|
+
|
|
7429
|
+
Using Reference Data
|
|
7430
|
+
--------------------
|
|
7431
|
+
The `col_sd_eq()` method supports comparing column aggregations against reference data. This
|
|
7432
|
+
is useful for validating that statistical properties remain consistent across different
|
|
7433
|
+
versions of a dataset, or for comparing current data against historical baselines.
|
|
7434
|
+
|
|
7435
|
+
To use reference data, set the `reference=` parameter when creating the `Validate` object:
|
|
7436
|
+
|
|
7437
|
+
```python
|
|
7438
|
+
validation = (
|
|
7439
|
+
pb.Validate(data=current_data, reference=baseline_data)
|
|
7440
|
+
.col_sd_eq(columns="revenue")  # Compares sd(current.revenue) vs sd(baseline.revenue)
|
|
7441
|
+
.interrogate()
|
|
7442
|
+
)
|
|
7443
|
+
```
|
|
7444
|
+
|
|
7445
|
+
When `value=None` and reference data is set, the method automatically compares against the
|
|
7446
|
+
same column in the reference data. You can also explicitly specify reference columns using
|
|
7447
|
+
the `ref()` helper:
|
|
7448
|
+
|
|
7449
|
+
```python
|
|
7450
|
+
.col_sd_eq(columns="revenue", value=pb.ref("baseline_revenue"))
|
|
7451
|
+
```
|
|
7452
|
+
|
|
7453
|
+
Understanding Tolerance
|
|
7454
|
+
-----------------------
|
|
7455
|
+
The `tol=` parameter allows for fuzzy comparisons, which is especially important for
|
|
7456
|
+
floating-point aggregations where exact equality is often unreliable.
|
|
7457
|
+
|
|
7458
|
+
The `tol=` parameter is particularly useful with `col_sd_eq()` since exact equality
|
|
7459
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
7460
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
7461
|
+
floating-point arithmetic.
|
|
7462
|
+
|
|
7463
|
+
For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
|
|
7464
|
+
within which the aggregation is considered valid. For inequality comparisons, the tolerance
|
|
7465
|
+
shifts the comparison boundary.
|
|
7466
|
+
|
|
7467
|
+
Thresholds
|
|
7468
|
+
----------
|
|
7469
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
7470
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
7471
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
7472
|
+
|
|
7473
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
|
|
7474
|
+
validations operate on a single test unit (the aggregated value), threshold values are
|
|
7475
|
+
typically set as absolute counts:
|
|
7476
|
+
|
|
7477
|
+
- `thresholds=1` means any failure triggers a 'warning'
|
|
7478
|
+
- `thresholds=(1, 1, 1)` means any failure triggers all three levels
|
|
7479
|
+
|
|
7480
|
+
Thresholds can be defined using one of these input schemes:
|
|
7481
|
+
|
|
7482
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
7483
|
+
thresholds)
|
|
7484
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
7485
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
7486
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
7487
|
+
'critical'
|
|
7488
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
7489
|
+
for the 'warning' level only
|
|
7490
|
+
|
|
7491
|
+
Examples
|
|
7492
|
+
--------
|
|
7493
|
+
For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
|
|
7494
|
+
shown below:
|
|
7495
|
+
|
|
7496
|
+
```python
|
|
7497
|
+
import pointblank as pb
|
|
7498
|
+
import polars as pl
|
|
7499
|
+
|
|
7500
|
+
tbl = pl.DataFrame(
|
|
7501
|
+
{
|
|
7502
|
+
"a": [1, 2, 3, 4, 5],
|
|
7503
|
+
"b": [2, 2, 2, 2, 2],
|
|
7504
|
+
}
|
|
7505
|
+
)
|
|
7506
|
+
|
|
7507
|
+
pb.preview(tbl)
|
|
7508
|
+
```
|
|
7509
|
+
|
|
7510
|
+
Let's validate that the standard deviation of column `a` equals `2`:
|
|
7511
|
+
|
|
7512
|
+
```python
|
|
7513
|
+
validation = (
|
|
7514
|
+
pb.Validate(data=tbl)
|
|
7515
|
+
.col_sd_eq(columns="a", value=2)
|
|
7516
|
+
.interrogate()
|
|
7517
|
+
)
|
|
7518
|
+
|
|
7519
|
+
validation
|
|
7520
|
+
```
|
|
7521
|
+
|
|
7522
|
+
The validation result shows whether the standard deviation comparison passed or failed. Since this
|
|
7523
|
+
is an aggregation-based validation, there is exactly one test unit per column.
|
|
7524
|
+
|
|
7525
|
+
When validating multiple columns, each column gets its own validation step:
|
|
7526
|
+
|
|
7527
|
+
```python
|
|
7528
|
+
validation = (
|
|
7529
|
+
pb.Validate(data=tbl)
|
|
7530
|
+
.col_sd_eq(columns=["a", "b"], value=2)
|
|
7531
|
+
.interrogate()
|
|
7532
|
+
)
|
|
7533
|
+
|
|
7534
|
+
validation
|
|
7535
|
+
```
|
|
7536
|
+
|
|
7537
|
+
Using tolerance for flexible comparisons:
|
|
7538
|
+
|
|
7539
|
+
```python
|
|
7540
|
+
validation = (
|
|
7541
|
+
pb.Validate(data=tbl)
|
|
7542
|
+
.col_sd_eq(columns="a", value=2, tol=1.0)
|
|
7543
|
+
.interrogate()
|
|
7544
|
+
)
|
|
7545
|
+
|
|
7546
|
+
validation
|
|
7547
|
+
```
|
|
7548
|
+
|
|
7549
|
+
rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4904
7550
|
|
|
4905
7551
|
Validate whether rows in the table are distinct.
|
|
4906
7552
|
|
|
@@ -5090,7 +7736,7 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
5090
7736
|
others.
|
|
5091
7737
|
|
|
5092
7738
|
|
|
5093
|
-
rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7739
|
+
rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5094
7740
|
|
|
5095
7741
|
Validate whether row data are complete by having no missing values.
|
|
5096
7742
|
|
|
@@ -5280,7 +7926,7 @@ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
5280
7926
|
others.
|
|
5281
7927
|
|
|
5282
7928
|
|
|
5283
|
-
col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
7929
|
+
col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5284
7930
|
|
|
5285
7931
|
Validate whether one or more columns exist in the table.
|
|
5286
7932
|
|
|
@@ -5402,7 +8048,248 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
|
|
|
5402
8048
|
failing validation step (the check for column `c`, which doesn't exist).
|
|
5403
8049
|
|
|
5404
8050
|
|
|
5405
|
-
|
|
8051
|
+
col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', p: 'float', tol: 'Tolerance' = 0, thresholds: 'int | float | None | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
8052
|
+
|
|
8053
|
+
Validate whether a column has a specific percentage of Null values.
|
|
8054
|
+
|
|
8055
|
+
The `col_pct_null()` validation method checks whether the percentage of Null values in a
|
|
8056
|
+
column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
|
|
8057
|
+
validation operates at the column level, generating a single validation step per column that
|
|
8058
|
+
passes or fails based on whether the actual percentage of Null values falls within the
|
|
8059
|
+
acceptable range defined by `p ± tol`.
|
|
8060
|
+
|
|
8061
|
+
Parameters
|
|
8062
|
+
----------
|
|
8063
|
+
columns
|
|
8064
|
+
A single column or a list of columns to validate. Can also use
|
|
8065
|
+
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
8066
|
+
multiple columns are supplied or resolved, there will be a separate validation step
|
|
8067
|
+
generated for each column.
|
|
8068
|
+
p
|
|
8069
|
+
The expected percentage of Null values in the column, expressed as a decimal between
|
|
8070
|
+
`0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
|
|
8071
|
+
tol
|
|
8072
|
+
The tolerance allowed when comparing the actual percentage of Null values to the
|
|
8073
|
+
expected percentage `p=`. The validation passes if the actual percentage falls within
|
|
8074
|
+
the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
|
|
8075
|
+
the *Tolerance* section for details on all supported formats (absolute, relative,
|
|
8076
|
+
symmetric, and asymmetric bounds).
|
|
8077
|
+
thresholds
|
|
8078
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
8079
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
8080
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
8081
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
8082
|
+
section for information on how to set threshold levels.
|
|
8083
|
+
actions
|
|
8084
|
+
Optional actions to take when the validation step(s) meets or exceeds any set threshold
|
|
8085
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
8086
|
+
define the actions.
|
|
8087
|
+
brief
|
|
8088
|
+
An optional brief description of the validation step that will be displayed in the
|
|
8089
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
8090
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
8091
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
8092
|
+
won't be a brief.
|
|
8093
|
+
active
|
|
8094
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
8095
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
8096
|
+
for the steps unchanged).
|
|
8097
|
+
|
|
8098
|
+
Returns
|
|
8099
|
+
-------
|
|
8100
|
+
Validate
|
|
8101
|
+
The `Validate` object with the added validation step.
|
|
8102
|
+
|
|
8103
|
+
Tolerance
|
|
8104
|
+
---------
|
|
8105
|
+
The `tol=` parameter accepts several different formats to specify the acceptable deviation
|
|
8106
|
+
from the expected percentage `p=`. The tolerance can be expressed as:
|
|
8107
|
+
|
|
8108
|
+
1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
|
|
8109
|
+
For example, `tol=2` means the actual count can differ from the expected count by up to 2
|
|
8110
|
+
units in either direction.
|
|
8111
|
+
|
|
8112
|
+
2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
|
|
8113
|
+
count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
|
|
8114
|
+
45 to 55 (50 ± 10% of 50 = 50 ± 5).
|
|
8115
|
+
|
|
8116
|
+
3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
|
|
8117
|
+
bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
|
|
8118
|
+
1 unit below or 3 units above the expected count.
|
|
8119
|
+
|
|
8120
|
+
4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
|
|
8121
|
+
and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
|
|
8122
|
+
lower bound is 5% below and the upper bound is 15% above the expected count.
|
|
8123
|
+
|
|
8124
|
+
When using a single value (integer or float), the tolerance is applied symmetrically in both
|
|
8125
|
+
directions. When using a tuple, you can specify asymmetric tolerances where the lower and
|
|
8126
|
+
upper bounds differ.
|
|
8127
|
+
|
|
8128
|
+
Thresholds
|
|
8129
|
+
----------
|
|
8130
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
8131
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
8132
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
8133
|
+
|
|
8134
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
8135
|
+
can either be set as a proportion failing of all test units (a value between `0` to `1`),
|
|
8136
|
+
or, the absolute number of failing test units (as integer that's `1` or greater).
|
|
8137
|
+
|
|
8138
|
+
Thresholds can be defined using one of these input schemes:
|
|
8139
|
+
|
|
8140
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
8141
|
+
thresholds)
|
|
8142
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
8143
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
8144
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
8145
|
+
'critical'
|
|
8146
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
8147
|
+
for the 'warning' level only
|
|
8148
|
+
|
|
8149
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
8150
|
+
marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
|
|
8151
|
+
set, you're free to set any combination of them.
|
|
8152
|
+
|
|
8153
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
8154
|
+
take for each level of failure (using the `actions=` parameter).
|
|
8155
|
+
|
|
8156
|
+
Examples
|
|
8157
|
+
--------
|
|
8158
|
+
For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
|
|
8159
|
+
and `c`) that have different percentages of Null values. The table is shown below:
|
|
8160
|
+
|
|
8161
|
+
```python
|
|
8162
|
+
import pointblank as pb
|
|
8163
|
+
import polars as pl
|
|
8164
|
+
|
|
8165
|
+
tbl = pl.DataFrame(
|
|
8166
|
+
{
|
|
8167
|
+
"a": [1, 2, 3, 4, 5, 6, 7, 8],
|
|
8168
|
+
"b": [1, None, 3, None, 5, None, 7, None],
|
|
8169
|
+
"c": [None, None, None, None, None, None, 1, 2],
|
|
8170
|
+
}
|
|
8171
|
+
)
|
|
8172
|
+
|
|
8173
|
+
pb.preview(tbl)
|
|
8174
|
+
```
|
|
8175
|
+
|
|
8176
|
+
Let's validate that column `a` has 0% Null values (i.e., no Null values at all).
|
|
8177
|
+
|
|
8178
|
+
```python
|
|
8179
|
+
validation = (
|
|
8180
|
+
pb.Validate(data=tbl)
|
|
8181
|
+
.col_pct_null(columns="a", p=0.0)
|
|
8182
|
+
.interrogate()
|
|
8183
|
+
)
|
|
8184
|
+
|
|
8185
|
+
validation
|
|
8186
|
+
```
|
|
8187
|
+
|
|
8188
|
+
Printing the `validation` object shows the validation table in an HTML viewing environment.
|
|
8189
|
+
The validation table shows the single entry that corresponds to the validation step created
|
|
8190
|
+
by using `col_pct_null()`. The validation passed since column `a` has no Null values.
|
|
8191
|
+
|
|
8192
|
+
Now, let's check that column `b` has exactly 50% Null values.
|
|
8193
|
+
|
|
8194
|
+
```python
|
|
8195
|
+
validation = (
|
|
8196
|
+
pb.Validate(data=tbl)
|
|
8197
|
+
.col_pct_null(columns="b", p=0.5)
|
|
8198
|
+
.interrogate()
|
|
8199
|
+
)
|
|
8200
|
+
|
|
8201
|
+
validation
|
|
8202
|
+
```
|
|
8203
|
+
|
|
8204
|
+
This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
|
|
8205
|
+
|
|
8206
|
+
Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
|
|
8207
|
+
we'll check if it's approximately 70% Null with a tolerance of 10%.
|
|
8208
|
+
|
|
8209
|
+
```python
|
|
8210
|
+
validation = (
|
|
8211
|
+
pb.Validate(data=tbl)
|
|
8212
|
+
.col_pct_null(columns="c", p=0.70, tol=0.10)
|
|
8213
|
+
.interrogate()
|
|
8214
|
+
)
|
|
8215
|
+
|
|
8216
|
+
validation
|
|
8217
|
+
```
|
|
8218
|
+
|
|
8219
|
+
This validation passes because the actual percentage (75%) falls within the acceptable
|
|
8220
|
+
range of 60% to 80% (70% ± 10%).
|
|
8221
|
+
|
|
8222
|
+
The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
|
|
8223
|
+
different ways to specify tolerance using column `b`, which has exactly 50% Null values
|
|
8224
|
+
(4 out of 8 values).
|
|
8225
|
+
|
|
8226
|
+
*Using an absolute tolerance (integer)*: Specify the exact number of rows that can
|
|
8227
|
+
deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
|
|
8228
|
+
|
|
8229
|
+
```python
|
|
8230
|
+
validation = (
|
|
8231
|
+
pb.Validate(data=tbl)
|
|
8232
|
+
.col_pct_null(columns="b", p=0.375, tol=1) # Expect 3 nulls, allow ±1 (range: 2-4)
|
|
8233
|
+
.interrogate()
|
|
8234
|
+
)
|
|
8235
|
+
|
|
8236
|
+
validation
|
|
8237
|
+
```
|
|
8238
|
+
|
|
8239
|
+
This passes because column `b` has 4 Null values, which falls within the acceptable range
|
|
8240
|
+
of 2 to 4 (3 ± 1).
|
|
8241
|
+
|
|
8242
|
+
*Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
|
|
8243
|
+
expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
|
|
8244
|
+
|
|
8245
|
+
```python
|
|
8246
|
+
validation = (
|
|
8247
|
+
pb.Validate(data=tbl)
|
|
8248
|
+
.col_pct_null(columns="b", p=0.375, tol=0.25) # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
|
|
8249
|
+
.interrogate()
|
|
8250
|
+
)
|
|
8251
|
+
|
|
8252
|
+
validation
|
|
8253
|
+
```
|
|
8254
|
+
|
|
8255
|
+
This passes because 4 Null values fall within the acceptable range (3 ± 0.75 calculates
|
|
8256
|
+
to 2.25 to 3.75, which rounds down to 2 to 3 rows).
|
|
8257
|
+
|
|
8258
|
+
*Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
|
|
8259
|
+
upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
|
|
8260
|
+
to 2 rows above the expected count.
|
|
8261
|
+
|
|
8262
|
+
```python
|
|
8263
|
+
validation = (
|
|
8264
|
+
pb.Validate(data=tbl)
|
|
8265
|
+
.col_pct_null(columns="b", p=0.25, tol=(0, 2)) # Expect 2 Nulls, allow -0/+2 (range: 2-4)
|
|
8266
|
+
.interrogate()
|
|
8267
|
+
)
|
|
8268
|
+
|
|
8269
|
+
validation
|
|
8270
|
+
```
|
|
8271
|
+
|
|
8272
|
+
This passes because 4 Null values fall within the acceptable range of 2 to 4.
|
|
8273
|
+
|
|
8274
|
+
*Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
|
|
8275
|
+
bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
|
|
8276
|
+
expected count.
|
|
8277
|
+
|
|
8278
|
+
```python
|
|
8279
|
+
validation = (
|
|
8280
|
+
pb.Validate(data=tbl)
|
|
8281
|
+
.col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30%
|
|
8282
|
+
.interrogate()
|
|
8283
|
+
)
|
|
8284
|
+
|
|
8285
|
+
validation
|
|
8286
|
+
```
|
|
8287
|
+
|
|
8288
|
+
This passes because 4 Null values fall within the acceptable range (3 - 0.3 to 3 + 0.9
|
|
8289
|
+
calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
|
|
8290
|
+
|
|
8291
|
+
|
|
8292
|
+
col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5406
8293
|
|
|
5407
8294
|
Do columns in the table (and their types) match a predefined schema?
|
|
5408
8295
|
|
|
@@ -5562,7 +8449,7 @@ col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'boo
|
|
|
5562
8449
|
since the table columns and their types match the schema.
|
|
5563
8450
|
|
|
5564
8451
|
|
|
5565
|
-
row_count_match(self, count: 'int |
|
|
8452
|
+
row_count_match(self, count: 'int | Any', tol: 'Tolerance' = 0, inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5566
8453
|
|
|
5567
8454
|
Validate whether the row count of the table matches a specified count.
|
|
5568
8455
|
|
|
@@ -5716,7 +8603,7 @@ row_count_match(self, count: 'int | FrameT | Any', tol: 'Tolerance' = 0, inverse
|
|
|
5716
8603
|
|
|
5717
8604
|
|
|
5718
8605
|
|
|
5719
|
-
col_count_match(self, count: 'int |
|
|
8606
|
+
col_count_match(self, count: 'int | Any', inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5720
8607
|
|
|
5721
8608
|
Validate whether the column count of the table matches a specified count.
|
|
5722
8609
|
|
|
@@ -5831,7 +8718,7 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
|
|
|
5831
8718
|
columns in the target table. So, the single test unit passed.
|
|
5832
8719
|
|
|
5833
8720
|
|
|
5834
|
-
tbl_match(self, tbl_compare: '
|
|
8721
|
+
tbl_match(self, tbl_compare: 'Any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5835
8722
|
|
|
5836
8723
|
Validate whether the target table matches a comparison table.
|
|
5837
8724
|
|
|
@@ -6054,7 +8941,7 @@ tbl_match(self, tbl_compare: 'FrameT | Any', pre: 'Callable | None' = None, thre
|
|
|
6054
8941
|
(one value is different in column `c`).
|
|
6055
8942
|
|
|
6056
8943
|
|
|
6057
|
-
conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
8944
|
+
conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6058
8945
|
|
|
6059
8946
|
Perform multiple row-wise validations for joint validity.
|
|
6060
8947
|
|
|
@@ -6253,7 +9140,7 @@ conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds:
|
|
|
6253
9140
|
information on how to use it with different table backends.
|
|
6254
9141
|
|
|
6255
9142
|
|
|
6256
|
-
specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
9143
|
+
specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6257
9144
|
|
|
6258
9145
|
Perform a specialized validation with customized logic.
|
|
6259
9146
|
|
|
@@ -6553,7 +9440,7 @@ specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'in
|
|
|
6553
9440
|
virtually any data quality requirement in your organization.
|
|
6554
9441
|
|
|
6555
9442
|
|
|
6556
|
-
prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
9443
|
+
prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6557
9444
|
|
|
6558
9445
|
Validate rows using AI/LLM-powered analysis.
|
|
6559
9446
|
|
|
@@ -6874,7 +9761,7 @@ many steps). Furthermore, the `col()` function can be used to declare a comparis
|
|
|
6874
9761
|
for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
|
|
6875
9762
|
for comparison.
|
|
6876
9763
|
|
|
6877
|
-
col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals') -> 'Column | ColumnLiteral | ColumnSelectorNarwhals'
|
|
9764
|
+
col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals | nw.selectors.Selector') -> 'Column | ColumnLiteral | ColumnSelectorNarwhals'
|
|
6878
9765
|
|
|
6879
9766
|
Helper function for referencing a column in the input table.
|
|
6880
9767
|
|
|
@@ -8494,7 +11381,7 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
|
|
|
8494
11381
|
`get_first_n=10`.
|
|
8495
11382
|
|
|
8496
11383
|
|
|
8497
|
-
set_tbl(self, tbl: '
|
|
11384
|
+
set_tbl(self, tbl: 'Any', tbl_name: 'str | None' = None, label: 'str | None' = None) -> 'Validate'
|
|
8498
11385
|
|
|
8499
11386
|
Set or replace the table associated with the Validate object.
|
|
8500
11387
|
|
|
@@ -8596,7 +11483,7 @@ set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str |
|
|
|
8596
11483
|
```
|
|
8597
11484
|
|
|
8598
11485
|
|
|
8599
|
-
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None) -> 'GT'
|
|
11486
|
+
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool | None' = None, incl_footer: 'bool | None' = None, incl_footer_timings: 'bool | None' = None, incl_footer_notes: 'bool | None' = None) -> 'GT'
|
|
8600
11487
|
|
|
8601
11488
|
Validation report as a GT table.
|
|
8602
11489
|
|
|
@@ -8618,6 +11505,20 @@ get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool'
|
|
|
8618
11505
|
name of the table as the title for the report. If no title is wanted, then `":none:"`
|
|
8619
11506
|
can be used. Aside from keyword options, text can be provided for the title. This will
|
|
8620
11507
|
be interpreted as Markdown text and transformed internally to HTML.
|
|
11508
|
+
incl_header
|
|
11509
|
+
Controls whether the header section should be displayed. If `None`, uses the global
|
|
11510
|
+
configuration setting. The header contains the table name, label, and threshold
|
|
11511
|
+
information.
|
|
11512
|
+
incl_footer
|
|
11513
|
+
Controls whether the footer section should be displayed. If `None`, uses the global
|
|
11514
|
+
configuration setting. The footer can contain validation timing information and notes.
|
|
11515
|
+
incl_footer_timings
|
|
11516
|
+
Controls whether validation timing information (start time, duration, end time) should
|
|
11517
|
+
be displayed in the footer. If `None`, uses the global configuration setting. Only
|
|
11518
|
+
applies when `incl_footer=True`.
|
|
11519
|
+
incl_footer_notes
|
|
11520
|
+
Controls whether notes from validation steps should be displayed in the footer. If
|
|
11521
|
+
`None`, uses the global configuration setting. Only applies when `incl_footer=True`.
|
|
8621
11522
|
|
|
8622
11523
|
Returns
|
|
8623
11524
|
-------
|
|
@@ -8955,7 +11856,7 @@ get_json_report(self, use_fields: 'list[str] | None' = None, exclude_fields: 'li
|
|
|
8955
11856
|
failed validation
|
|
8956
11857
|
|
|
8957
11858
|
|
|
8958
|
-
get_sundered_data(self, type='pass') -> '
|
|
11859
|
+
get_sundered_data(self, type='pass') -> 'Any'
|
|
8959
11860
|
|
|
8960
11861
|
Get the data that passed or failed the validation steps.
|
|
8961
11862
|
|
|
@@ -8991,7 +11892,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
|
|
|
8991
11892
|
|
|
8992
11893
|
Returns
|
|
8993
11894
|
-------
|
|
8994
|
-
|
|
11895
|
+
Any
|
|
8995
11896
|
A table containing the data that passed or failed the validation steps.
|
|
8996
11897
|
|
|
8997
11898
|
Examples
|
|
@@ -9036,7 +11937,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
|
|
|
9036
11937
|
that's what we see in the returned DataFrame.
|
|
9037
11938
|
|
|
9038
11939
|
|
|
9039
|
-
get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = False) -> 'dict[int,
|
|
11940
|
+
get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = False) -> 'dict[int, Any] | Any'
|
|
9040
11941
|
|
|
9041
11942
|
Get the rows that failed for each validation step.
|
|
9042
11943
|
|
|
@@ -9059,7 +11960,7 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
|
|
|
9059
11960
|
|
|
9060
11961
|
Returns
|
|
9061
11962
|
-------
|
|
9062
|
-
dict[int,
|
|
11963
|
+
dict[int, Any] | Any
|
|
9063
11964
|
A dictionary of tables containing the rows that failed in every compatible validation
|
|
9064
11965
|
step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
|
|
9065
11966
|
|
|
@@ -10216,7 +13117,7 @@ datasets included in the package can be accessed via the `load_dataset()` functi
|
|
|
10216
13117
|
`config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
|
|
10217
13118
|
the `assistant()` function to get help with Pointblank.
|
|
10218
13119
|
|
|
10219
|
-
DataScan(data: '
|
|
13120
|
+
DataScan(data: 'Any', tbl_name: 'str | None' = None) -> 'None'
|
|
10220
13121
|
|
|
10221
13122
|
Get a summary of a dataset.
|
|
10222
13123
|
|
|
@@ -10312,7 +13213,7 @@ DataScan(data: 'IntoFrameT', tbl_name: 'str | None' = None) -> 'None'
|
|
|
10312
13213
|
A DataScan object.
|
|
10313
13214
|
|
|
10314
13215
|
|
|
10315
|
-
preview(data: '
|
|
13216
|
+
preview(data: 'Any', columns_subset: 'str | list[str] | Column | None' = None, n_head: 'int' = 5, n_tail: 'int' = 5, limit: 'int' = 50, show_row_numbers: 'bool' = True, max_col_width: 'int' = 250, min_tbl_width: 'int' = 500, incl_header: 'bool | None' = None) -> 'GT'
|
|
10316
13217
|
|
|
10317
13218
|
Display a table preview that shows some rows from the top, some from the bottom.
|
|
10318
13219
|
|
|
@@ -10511,7 +13412,7 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
|
|
|
10511
13412
|
function.
|
|
10512
13413
|
|
|
10513
13414
|
|
|
10514
|
-
col_summary_tbl(data: '
|
|
13415
|
+
col_summary_tbl(data: 'Any', tbl_name: 'str | None' = None) -> 'GT'
|
|
10515
13416
|
|
|
10516
13417
|
Generate a column-level summary table of a dataset.
|
|
10517
13418
|
|
|
@@ -10588,7 +13489,7 @@ col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
|
|
|
10588
13489
|
```
|
|
10589
13490
|
|
|
10590
13491
|
|
|
10591
|
-
missing_vals_tbl(data: '
|
|
13492
|
+
missing_vals_tbl(data: 'Any') -> 'GT'
|
|
10592
13493
|
|
|
10593
13494
|
Display a table that shows the missing values in the input table.
|
|
10594
13495
|
|
|
@@ -10662,7 +13563,7 @@ missing_vals_tbl(data: 'FrameT | Any') -> 'GT'
|
|
|
10662
13563
|
sector. Many columns have no missing values at all, and those sectors are colored light blue.
|
|
10663
13564
|
|
|
10664
13565
|
|
|
10665
|
-
assistant(model: 'str', data: '
|
|
13566
|
+
assistant(model: 'str', data: 'Any' = None, tbl_name: 'str | None' = None, api_key: 'str | None' = None, display: 'str | None' = None) -> 'None'
|
|
10666
13567
|
|
|
10667
13568
|
Chat with the PbA (Pointblank Assistant) about your data validation needs.
|
|
10668
13569
|
|
|
@@ -10806,7 +13707,7 @@ assistant(model: 'str', data: 'FrameT | Any | None' = None, tbl_name: 'str | Non
|
|
|
10806
13707
|
library. The loading preference is Polars first, then Pandas as a fallback.
|
|
10807
13708
|
|
|
10808
13709
|
|
|
10809
|
-
load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> '
|
|
13710
|
+
load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> 'Any'
|
|
10810
13711
|
|
|
10811
13712
|
Load a dataset hosted in the library as specified table type.
|
|
10812
13713
|
|
|
@@ -10827,7 +13728,7 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'glo
|
|
|
10827
13728
|
|
|
10828
13729
|
Returns
|
|
10829
13730
|
-------
|
|
10830
|
-
|
|
13731
|
+
Any
|
|
10831
13732
|
The dataset for the `Validate` object. This could be a Polars DataFrame, a Pandas DataFrame,
|
|
10832
13733
|
or a DuckDB table as an Ibis table.
|
|
10833
13734
|
|
|
@@ -11119,7 +14020,7 @@ from YAML strings or files. The `validate_yaml()` function checks if the YAML co
|
|
|
11119
14020
|
its own validity checks. The `yaml_to_python()` function converts YAML configuration to equivalent
|
|
11120
14021
|
Python code.
|
|
11121
14022
|
|
|
11122
|
-
yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: '
|
|
14023
|
+
yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Any' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate'
|
|
11123
14024
|
Execute a YAML-based validation workflow.
|
|
11124
14025
|
|
|
11125
14026
|
This is the main entry point for YAML-based validation workflows. It takes YAML configuration
|
|
@@ -11608,7 +14509,7 @@ columns or rows in a table. The `get_action_metadata()` function is useful when
|
|
|
11608
14509
|
actions since it returns metadata about the validation step that's triggering the action. Lastly,
|
|
11609
14510
|
the `config()` utility lets us set global configuration parameters.
|
|
11610
14511
|
|
|
11611
|
-
get_column_count(data: '
|
|
14512
|
+
get_column_count(data: 'Any') -> 'int'
|
|
11612
14513
|
|
|
11613
14514
|
Get the number of columns in a table.
|
|
11614
14515
|
|
|
@@ -11723,7 +14624,7 @@ get_column_count(data: 'FrameT | Any') -> 'int'
|
|
|
11723
14624
|
`8` for the `small_table` dataset.
|
|
11724
14625
|
|
|
11725
14626
|
|
|
11726
|
-
get_row_count(data: '
|
|
14627
|
+
get_row_count(data: 'Any') -> 'int'
|
|
11727
14628
|
|
|
11728
14629
|
Get the number of rows in a table.
|
|
11729
14630
|
|
|
@@ -12310,7 +15211,7 @@ read_file(filepath: 'str | Path') -> 'Validate'
|
|
|
12310
15211
|
to disk for later retrieval with this function.
|
|
12311
15212
|
|
|
12312
15213
|
|
|
12313
|
-
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
15214
|
+
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, report_incl_footer_timings: 'bool' = True, report_incl_footer_notes: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
12314
15215
|
|
|
12315
15216
|
Configuration settings for the Pointblank library.
|
|
12316
15217
|
|
|
@@ -12322,7 +15223,13 @@ config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, pre
|
|
|
12322
15223
|
threshold levels (if set).
|
|
12323
15224
|
report_incl_footer
|
|
12324
15225
|
Should the footer of the validation table report be displayed? The footer contains the
|
|
12325
|
-
starting and ending times of the interrogation.
|
|
15226
|
+
starting and ending times of the interrogation and any notes added to validation steps.
|
|
15227
|
+
report_incl_footer_timings
|
|
15228
|
+
Controls whether the validation timing information (start time, duration, and end time)
|
|
15229
|
+
should be displayed in the footer. Only applies when `report_incl_footer=True`.
|
|
15230
|
+
report_incl_footer_notes
|
|
15231
|
+
Controls whether the notes from validation steps should be displayed in the footer. Only
|
|
15232
|
+
applies when `report_incl_footer=True`.
|
|
12326
15233
|
preview_incl_header
|
|
12327
15234
|
Whether the header should be present in any preview table (generated via the
|
|
12328
15235
|
[`preview()`](`pointblank.preview`) function).
|
|
@@ -12341,7 +15248,7 @@ send a Slack notification when validation steps exceed failure threshold levels
|
|
|
12341
15248
|
summary of the validation results, including the status, number of steps, passing and failing steps,
|
|
12342
15249
|
table information, and timing details.
|
|
12343
15250
|
|
|
12344
|
-
send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None' = None, summary_msg: 'str | None' = None, debug: 'bool' = False) -> 'Callable'
|
|
15251
|
+
send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None' = None, summary_msg: 'str | None' = None, debug: 'bool' = False) -> 'Callable | None'
|
|
12345
15252
|
|
|
12346
15253
|
Create a Slack notification function using a webhook URL.
|
|
12347
15254
|
|