pointblank 0.8.7__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +26 -10
- pointblank/_constants_translations.py +162 -0
- pointblank/_interrogation.py +117 -0
- pointblank/_typing.py +19 -3
- pointblank/_utils.py +1 -0
- pointblank/data/api-docs.txt +1022 -52
- pointblank/datascan.py +4 -4
- pointblank/draft.py +1 -1
- pointblank/thresholds.py +10 -0
- pointblank/validate.py +1462 -55
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/METADATA +6 -2
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/RECORD +15 -15
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/WHEEL +1 -1
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
|
@@ -45,7 +45,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
|
|
|
45
45
|
The table to validate, which could be a DataFrame object or an Ibis table object. Read the
|
|
46
46
|
*Supported Input Table Types* section for details on the supported table types.
|
|
47
47
|
tbl_name
|
|
48
|
-
|
|
48
|
+
An optional name to assign to the input table object. If no value is provided, a name will
|
|
49
49
|
be generated based on whatever information is available. This table name will be displayed
|
|
50
50
|
in the header area of the tabular report.
|
|
51
51
|
label
|
|
@@ -628,6 +628,11 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: '
|
|
|
628
628
|
all three thresholds are exceeded in step 3, the 'warning' action of executing the function
|
|
629
629
|
occurs (resulting in a message being printed to the console). If actions were set for the other
|
|
630
630
|
two threshold levels, they would also be executed.
|
|
631
|
+
|
|
632
|
+
See Also
|
|
633
|
+
--------
|
|
634
|
+
The [`get_action_metadata()`](`pointblank.get_action_metadata`) function, which can be used to
|
|
635
|
+
retrieve metadata about the step where the action is executed.
|
|
631
636
|
|
|
632
637
|
|
|
633
638
|
FinalActions(*args)
|
|
@@ -723,6 +728,11 @@ FinalActions(*args)
|
|
|
723
728
|
.interrogate()
|
|
724
729
|
)
|
|
725
730
|
```
|
|
731
|
+
|
|
732
|
+
See Also
|
|
733
|
+
--------
|
|
734
|
+
The [`get_validation_summary()`](`pointblank.get_validation_summary`) function, which can be
|
|
735
|
+
used to retrieve the summary of the validation results.
|
|
726
736
|
|
|
727
737
|
|
|
728
738
|
Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'any | None' = None, **kwargs)
|
|
@@ -1075,7 +1085,7 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
|
|
|
1075
1085
|
data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
|
|
1076
1086
|
|
|
1077
1087
|
# Draft a validation plan for the "nycflights" table
|
|
1078
|
-
pb.DraftValidation(data=
|
|
1088
|
+
pb.DraftValidation(data=data, model="anthropic:claude-3-5-sonnet-latest")
|
|
1079
1089
|
```
|
|
1080
1090
|
|
|
1081
1091
|
The output will be a drafted validation plan for the `"nycflights"` table and this will appear
|
|
@@ -1154,7 +1164,7 @@ Validation steps can be thought of as sequential validations on the target
|
|
|
1154
1164
|
data. We call `Validate`'s validation methods to build up a validation plan: a collection of steps
|
|
1155
1165
|
that, in the aggregate, provides good validation coverage.
|
|
1156
1166
|
|
|
1157
|
-
col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1167
|
+
col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1158
1168
|
|
|
1159
1169
|
Are column data greater than a fixed value or data in another column?
|
|
1160
1170
|
|
|
@@ -1181,10 +1191,15 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1181
1191
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1182
1192
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1183
1193
|
pre
|
|
1184
|
-
|
|
1194
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1185
1195
|
interrogation. This function should take a table as input and return a modified table.
|
|
1186
1196
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1187
1197
|
argument.
|
|
1198
|
+
segments
|
|
1199
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
1200
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
1201
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
1202
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1188
1203
|
thresholds
|
|
1189
1204
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
1190
1205
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1247,6 +1262,42 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1247
1262
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1248
1263
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1249
1264
|
|
|
1265
|
+
Segmentation
|
|
1266
|
+
------------
|
|
1267
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
1268
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
1269
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
1270
|
+
column.
|
|
1271
|
+
|
|
1272
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
1273
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
1274
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
1275
|
+
region.
|
|
1276
|
+
|
|
1277
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
1278
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
1279
|
+
segment on only specific dates, you can provide a tuple like
|
|
1280
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
1281
|
+
(i.e., no validation steps will be created for them).
|
|
1282
|
+
|
|
1283
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
1284
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
1285
|
+
|
|
1286
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
1287
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
1288
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
1289
|
+
columns
|
|
1290
|
+
|
|
1291
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
1292
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
1293
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
1294
|
+
identify issues within specific segments.
|
|
1295
|
+
|
|
1296
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
1297
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
1298
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
1299
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
1300
|
+
|
|
1250
1301
|
Thresholds
|
|
1251
1302
|
----------
|
|
1252
1303
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -1334,7 +1385,7 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1334
1385
|
- Row 3: `c` is `2` and `b` is `2`.
|
|
1335
1386
|
|
|
1336
1387
|
|
|
1337
|
-
col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1388
|
+
col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1338
1389
|
|
|
1339
1390
|
Are column data less than a fixed value or data in another column?
|
|
1340
1391
|
|
|
@@ -1361,10 +1412,15 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1361
1412
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1362
1413
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1363
1414
|
pre
|
|
1364
|
-
|
|
1415
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1365
1416
|
interrogation. This function should take a table as input and return a modified table.
|
|
1366
1417
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1367
1418
|
argument.
|
|
1419
|
+
segments
|
|
1420
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
1421
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
1422
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
1423
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1368
1424
|
thresholds
|
|
1369
1425
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
1370
1426
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1427,6 +1483,42 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1427
1483
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1428
1484
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1429
1485
|
|
|
1486
|
+
Segmentation
|
|
1487
|
+
------------
|
|
1488
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
1489
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
1490
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
1491
|
+
column.
|
|
1492
|
+
|
|
1493
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
1494
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
1495
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
1496
|
+
region.
|
|
1497
|
+
|
|
1498
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
1499
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
1500
|
+
segment on only specific dates, you can provide a tuple like
|
|
1501
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
1502
|
+
(i.e., no validation steps will be created for them).
|
|
1503
|
+
|
|
1504
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
1505
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
1506
|
+
|
|
1507
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
1508
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
1509
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
1510
|
+
columns
|
|
1511
|
+
|
|
1512
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
1513
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
1514
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
1515
|
+
identify issues within specific segments.
|
|
1516
|
+
|
|
1517
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
1518
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
1519
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
1520
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
1521
|
+
|
|
1430
1522
|
Thresholds
|
|
1431
1523
|
----------
|
|
1432
1524
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -1514,7 +1606,7 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1514
1606
|
- Row 2: `b` is `1` and `c` is `1`.
|
|
1515
1607
|
|
|
1516
1608
|
|
|
1517
|
-
col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1609
|
+
col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1518
1610
|
|
|
1519
1611
|
Are column data greater than or equal to a fixed value or data in another column?
|
|
1520
1612
|
|
|
@@ -1541,10 +1633,15 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1541
1633
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1542
1634
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1543
1635
|
pre
|
|
1544
|
-
|
|
1636
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1545
1637
|
interrogation. This function should take a table as input and return a modified table.
|
|
1546
1638
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1547
1639
|
argument.
|
|
1640
|
+
segments
|
|
1641
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
1642
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
1643
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
1644
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1548
1645
|
thresholds
|
|
1549
1646
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
1550
1647
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1607,6 +1704,42 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1607
1704
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1608
1705
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1609
1706
|
|
|
1707
|
+
Segmentation
|
|
1708
|
+
------------
|
|
1709
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
1710
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
1711
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
1712
|
+
column.
|
|
1713
|
+
|
|
1714
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
1715
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
1716
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
1717
|
+
region.
|
|
1718
|
+
|
|
1719
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
1720
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
1721
|
+
segment on only specific dates, you can provide a tuple like
|
|
1722
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
1723
|
+
(i.e., no validation steps will be created for them).
|
|
1724
|
+
|
|
1725
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
1726
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
1727
|
+
|
|
1728
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
1729
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
1730
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
1731
|
+
columns
|
|
1732
|
+
|
|
1733
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
1734
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
1735
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
1736
|
+
identify issues within specific segments.
|
|
1737
|
+
|
|
1738
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
1739
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
1740
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
1741
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
1742
|
+
|
|
1610
1743
|
Thresholds
|
|
1611
1744
|
----------
|
|
1612
1745
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -1694,7 +1827,7 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1694
1827
|
- Row 4: `b` is `3` and `c` is `4`.
|
|
1695
1828
|
|
|
1696
1829
|
|
|
1697
|
-
col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1830
|
+
col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1698
1831
|
|
|
1699
1832
|
Are column data less than or equal to a fixed value or data in another column?
|
|
1700
1833
|
|
|
@@ -1721,10 +1854,15 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1721
1854
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1722
1855
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1723
1856
|
pre
|
|
1724
|
-
|
|
1857
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1725
1858
|
interrogation. This function should take a table as input and return a modified table.
|
|
1726
1859
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1727
1860
|
argument.
|
|
1861
|
+
segments
|
|
1862
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
1863
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
1864
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
1865
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1728
1866
|
thresholds
|
|
1729
1867
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
1730
1868
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1787,6 +1925,42 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1787
1925
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1788
1926
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1789
1927
|
|
|
1928
|
+
Segmentation
|
|
1929
|
+
------------
|
|
1930
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
1931
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
1932
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
1933
|
+
column.
|
|
1934
|
+
|
|
1935
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
1936
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
1937
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
1938
|
+
region.
|
|
1939
|
+
|
|
1940
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
1941
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
1942
|
+
segment on only specific dates, you can provide a tuple like
|
|
1943
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
1944
|
+
(i.e., no validation steps will be created for them).
|
|
1945
|
+
|
|
1946
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
1947
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
1948
|
+
|
|
1949
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
1950
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
1951
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
1952
|
+
columns
|
|
1953
|
+
|
|
1954
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
1955
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
1956
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
1957
|
+
identify issues within specific segments.
|
|
1958
|
+
|
|
1959
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
1960
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
1961
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
1962
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
1963
|
+
|
|
1790
1964
|
Thresholds
|
|
1791
1965
|
----------
|
|
1792
1966
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -1874,7 +2048,7 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1874
2048
|
- Row 4: `c` is `3` and `b` is `2`.
|
|
1875
2049
|
|
|
1876
2050
|
|
|
1877
|
-
col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2051
|
+
col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1878
2052
|
|
|
1879
2053
|
Are column data equal to a fixed value or data in another column?
|
|
1880
2054
|
|
|
@@ -1901,10 +2075,15 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1901
2075
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1902
2076
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1903
2077
|
pre
|
|
1904
|
-
|
|
2078
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1905
2079
|
interrogation. This function should take a table as input and return a modified table.
|
|
1906
2080
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1907
2081
|
argument.
|
|
2082
|
+
segments
|
|
2083
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2084
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2085
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2086
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1908
2087
|
thresholds
|
|
1909
2088
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
1910
2089
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1967,6 +2146,42 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1967
2146
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1968
2147
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1969
2148
|
|
|
2149
|
+
Segmentation
|
|
2150
|
+
------------
|
|
2151
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2152
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2153
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2154
|
+
column.
|
|
2155
|
+
|
|
2156
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2157
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2158
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2159
|
+
region.
|
|
2160
|
+
|
|
2161
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2162
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2163
|
+
segment on only specific dates, you can provide a tuple like
|
|
2164
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2165
|
+
(i.e., no validation steps will be created for them).
|
|
2166
|
+
|
|
2167
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2168
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2169
|
+
|
|
2170
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2171
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2172
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2173
|
+
columns
|
|
2174
|
+
|
|
2175
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2176
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2177
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2178
|
+
identify issues within specific segments.
|
|
2179
|
+
|
|
2180
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2181
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2182
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2183
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2184
|
+
|
|
1970
2185
|
Thresholds
|
|
1971
2186
|
----------
|
|
1972
2187
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2052,7 +2267,7 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2052
2267
|
- Row 5: `a` is `5` and `b` is `4`.
|
|
2053
2268
|
|
|
2054
2269
|
|
|
2055
|
-
col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2270
|
+
col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2056
2271
|
|
|
2057
2272
|
Are column data not equal to a fixed value or data in another column?
|
|
2058
2273
|
|
|
@@ -2079,10 +2294,15 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2079
2294
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2080
2295
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2081
2296
|
pre
|
|
2082
|
-
|
|
2297
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2083
2298
|
interrogation. This function should take a table as input and return a modified table.
|
|
2084
2299
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2085
2300
|
argument.
|
|
2301
|
+
segments
|
|
2302
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2303
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2304
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2305
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2086
2306
|
thresholds
|
|
2087
2307
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
2088
2308
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2145,6 +2365,42 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2145
2365
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2146
2366
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2147
2367
|
|
|
2368
|
+
Segmentation
|
|
2369
|
+
------------
|
|
2370
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2371
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2372
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2373
|
+
column.
|
|
2374
|
+
|
|
2375
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2376
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2377
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2378
|
+
region.
|
|
2379
|
+
|
|
2380
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2381
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2382
|
+
segment on only specific dates, you can provide a tuple like
|
|
2383
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2384
|
+
(i.e., no validation steps will be created for them).
|
|
2385
|
+
|
|
2386
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2387
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2388
|
+
|
|
2389
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2390
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2391
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2392
|
+
columns
|
|
2393
|
+
|
|
2394
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2395
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2396
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2397
|
+
identify issues within specific segments.
|
|
2398
|
+
|
|
2399
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2400
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2401
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2402
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2403
|
+
|
|
2148
2404
|
Thresholds
|
|
2149
2405
|
----------
|
|
2150
2406
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2228,7 +2484,7 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2228
2484
|
0 and 4, where `a` is `5` and `b` is `5` in both cases (i.e., they are equal to each other).
|
|
2229
2485
|
|
|
2230
2486
|
|
|
2231
|
-
col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2487
|
+
col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2232
2488
|
|
|
2233
2489
|
Do column data lie between two specified values or data in other columns?
|
|
2234
2490
|
|
|
@@ -2265,10 +2521,15 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2265
2521
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2266
2522
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2267
2523
|
pre
|
|
2268
|
-
|
|
2524
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2269
2525
|
interrogation. This function should take a table as input and return a modified table.
|
|
2270
2526
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2271
2527
|
argument.
|
|
2528
|
+
segments
|
|
2529
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2530
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2531
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2532
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2272
2533
|
thresholds
|
|
2273
2534
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
2274
2535
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2333,6 +2594,42 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2333
2594
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2334
2595
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2335
2596
|
|
|
2597
|
+
Segmentation
|
|
2598
|
+
------------
|
|
2599
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2600
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2601
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2602
|
+
column.
|
|
2603
|
+
|
|
2604
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2605
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2606
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2607
|
+
region.
|
|
2608
|
+
|
|
2609
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2610
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2611
|
+
segment on only specific dates, you can provide a tuple like
|
|
2612
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2613
|
+
(i.e., no validation steps will be created for them).
|
|
2614
|
+
|
|
2615
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2616
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2617
|
+
|
|
2618
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2619
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2620
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2621
|
+
columns
|
|
2622
|
+
|
|
2623
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2624
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2625
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2626
|
+
identify issues within specific segments.
|
|
2627
|
+
|
|
2628
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2629
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2630
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2631
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2632
|
+
|
|
2336
2633
|
Thresholds
|
|
2337
2634
|
----------
|
|
2338
2635
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2428,7 +2725,7 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2428
2725
|
- Row 4: `b` is `8` but the bounds are `3` (`a`) and `7` (`c`).
|
|
2429
2726
|
|
|
2430
2727
|
|
|
2431
|
-
col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2728
|
+
col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2432
2729
|
|
|
2433
2730
|
Do column data lie outside of two specified values or data in other columns?
|
|
2434
2731
|
|
|
@@ -2465,10 +2762,15 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2465
2762
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2466
2763
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2467
2764
|
pre
|
|
2468
|
-
|
|
2765
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2469
2766
|
interrogation. This function should take a table as input and return a modified table.
|
|
2470
2767
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2471
2768
|
argument.
|
|
2769
|
+
segments
|
|
2770
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2771
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2772
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2773
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2472
2774
|
thresholds
|
|
2473
2775
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
2474
2776
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2533,6 +2835,42 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2533
2835
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2534
2836
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2535
2837
|
|
|
2838
|
+
Segmentation
|
|
2839
|
+
------------
|
|
2840
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2841
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2842
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2843
|
+
column.
|
|
2844
|
+
|
|
2845
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2846
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2847
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2848
|
+
region.
|
|
2849
|
+
|
|
2850
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2851
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2852
|
+
segment on only specific dates, you can provide a tuple like
|
|
2853
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2854
|
+
(i.e., no validation steps will be created for them).
|
|
2855
|
+
|
|
2856
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2857
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2858
|
+
|
|
2859
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2860
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2861
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2862
|
+
columns
|
|
2863
|
+
|
|
2864
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2865
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2866
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2867
|
+
identify issues within specific segments.
|
|
2868
|
+
|
|
2869
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2870
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2871
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2872
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2873
|
+
|
|
2536
2874
|
Thresholds
|
|
2537
2875
|
----------
|
|
2538
2876
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2628,7 +2966,7 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2628
2966
|
- Row 5: `b` is `6` and the bounds are `5` (`a`) and `7` (`c`).
|
|
2629
2967
|
|
|
2630
2968
|
|
|
2631
|
-
col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: '
|
|
2969
|
+
col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2632
2970
|
|
|
2633
2971
|
Validate whether column values are in a set of values.
|
|
2634
2972
|
|
|
@@ -2647,10 +2985,15 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
|
|
|
2647
2985
|
set
|
|
2648
2986
|
A list of values to compare against.
|
|
2649
2987
|
pre
|
|
2650
|
-
|
|
2988
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2651
2989
|
interrogation. This function should take a table as input and return a modified table.
|
|
2652
2990
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2653
2991
|
argument.
|
|
2992
|
+
segments
|
|
2993
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2994
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2995
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2996
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2654
2997
|
thresholds
|
|
2655
2998
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
2656
2999
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2692,6 +3035,42 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
|
|
|
2692
3035
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
2693
3036
|
subsequent validation steps.
|
|
2694
3037
|
|
|
3038
|
+
Segmentation
|
|
3039
|
+
------------
|
|
3040
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3041
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3042
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3043
|
+
column.
|
|
3044
|
+
|
|
3045
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3046
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3047
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3048
|
+
region.
|
|
3049
|
+
|
|
3050
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3051
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3052
|
+
segment on only specific dates, you can provide a tuple like
|
|
3053
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3054
|
+
(i.e., no validation steps will be created for them).
|
|
3055
|
+
|
|
3056
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3057
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3058
|
+
|
|
3059
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3060
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3061
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3062
|
+
columns
|
|
3063
|
+
|
|
3064
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3065
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3066
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3067
|
+
identify issues within specific segments.
|
|
3068
|
+
|
|
3069
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3070
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3071
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3072
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3073
|
+
|
|
2695
3074
|
Thresholds
|
|
2696
3075
|
----------
|
|
2697
3076
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2773,7 +3152,7 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
|
|
|
2773
3152
|
column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
|
|
2774
3153
|
|
|
2775
3154
|
|
|
2776
|
-
col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'list[float | int]', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3155
|
+
col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'list[float | int]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2777
3156
|
|
|
2778
3157
|
Validate whether column values are not in a set of values.
|
|
2779
3158
|
|
|
@@ -2792,10 +3171,15 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
2792
3171
|
set
|
|
2793
3172
|
A list of values to compare against.
|
|
2794
3173
|
pre
|
|
2795
|
-
|
|
3174
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2796
3175
|
interrogation. This function should take a table as input and return a modified table.
|
|
2797
3176
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2798
3177
|
argument.
|
|
3178
|
+
segments
|
|
3179
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3180
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3181
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3182
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2799
3183
|
thresholds
|
|
2800
3184
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
2801
3185
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2837,6 +3221,42 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
2837
3221
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
2838
3222
|
subsequent validation steps.
|
|
2839
3223
|
|
|
3224
|
+
Segmentation
|
|
3225
|
+
------------
|
|
3226
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3227
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3228
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3229
|
+
column.
|
|
3230
|
+
|
|
3231
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3232
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3233
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3234
|
+
region.
|
|
3235
|
+
|
|
3236
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3237
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3238
|
+
segment on only specific dates, you can provide a tuple like
|
|
3239
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3240
|
+
(i.e., no validation steps will be created for them).
|
|
3241
|
+
|
|
3242
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3243
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3244
|
+
|
|
3245
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3246
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3247
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3248
|
+
columns
|
|
3249
|
+
|
|
3250
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3251
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3252
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3253
|
+
identify issues within specific segments.
|
|
3254
|
+
|
|
3255
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3256
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3257
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3258
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3259
|
+
|
|
2840
3260
|
Thresholds
|
|
2841
3261
|
----------
|
|
2842
3262
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2919,7 +3339,7 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
2919
3339
|
column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
|
|
2920
3340
|
|
|
2921
3341
|
|
|
2922
|
-
col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3342
|
+
col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2923
3343
|
|
|
2924
3344
|
Validate whether values in a column are NULL.
|
|
2925
3345
|
|
|
@@ -2935,10 +3355,15 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
|
|
|
2935
3355
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
2936
3356
|
generated for each column.
|
|
2937
3357
|
pre
|
|
2938
|
-
|
|
3358
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2939
3359
|
interrogation. This function should take a table as input and return a modified table.
|
|
2940
3360
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2941
3361
|
argument.
|
|
3362
|
+
segments
|
|
3363
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3364
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3365
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3366
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2942
3367
|
thresholds
|
|
2943
3368
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
2944
3369
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2980,6 +3405,42 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
|
|
|
2980
3405
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
2981
3406
|
subsequent validation steps.
|
|
2982
3407
|
|
|
3408
|
+
Segmentation
|
|
3409
|
+
------------
|
|
3410
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3411
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3412
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3413
|
+
column.
|
|
3414
|
+
|
|
3415
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3416
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3417
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3418
|
+
region.
|
|
3419
|
+
|
|
3420
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3421
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3422
|
+
segment on only specific dates, you can provide a tuple like
|
|
3423
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3424
|
+
(i.e., no validation steps will be created for them).
|
|
3425
|
+
|
|
3426
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3427
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3428
|
+
|
|
3429
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3430
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3431
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3432
|
+
columns
|
|
3433
|
+
|
|
3434
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3435
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3436
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3437
|
+
identify issues within specific segments.
|
|
3438
|
+
|
|
3439
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3440
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3441
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3442
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3443
|
+
|
|
2983
3444
|
Thresholds
|
|
2984
3445
|
----------
|
|
2985
3446
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3060,7 +3521,7 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
|
|
|
3060
3521
|
two non-Null values in column `b`.
|
|
3061
3522
|
|
|
3062
3523
|
|
|
3063
|
-
col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3524
|
+
col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3064
3525
|
|
|
3065
3526
|
Validate whether values in a column are not NULL.
|
|
3066
3527
|
|
|
@@ -3076,10 +3537,15 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
|
|
|
3076
3537
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
3077
3538
|
generated for each column.
|
|
3078
3539
|
pre
|
|
3079
|
-
|
|
3540
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3080
3541
|
interrogation. This function should take a table as input and return a modified table.
|
|
3081
3542
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3082
3543
|
argument.
|
|
3544
|
+
segments
|
|
3545
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3546
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3547
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3548
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3083
3549
|
thresholds
|
|
3084
3550
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
3085
3551
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3121,6 +3587,42 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
|
|
|
3121
3587
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
3122
3588
|
subsequent validation steps.
|
|
3123
3589
|
|
|
3590
|
+
Segmentation
|
|
3591
|
+
------------
|
|
3592
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3593
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3594
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3595
|
+
column.
|
|
3596
|
+
|
|
3597
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3598
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3599
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3600
|
+
region.
|
|
3601
|
+
|
|
3602
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3603
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3604
|
+
segment on only specific dates, you can provide a tuple like
|
|
3605
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3606
|
+
(i.e., no validation steps will be created for them).
|
|
3607
|
+
|
|
3608
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3609
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3610
|
+
|
|
3611
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3612
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3613
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3614
|
+
columns
|
|
3615
|
+
|
|
3616
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3617
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3618
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3619
|
+
identify issues within specific segments.
|
|
3620
|
+
|
|
3621
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3622
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3623
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3624
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3625
|
+
|
|
3124
3626
|
Thresholds
|
|
3125
3627
|
----------
|
|
3126
3628
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3201,7 +3703,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
|
|
|
3201
3703
|
two Null values in column `b`.
|
|
3202
3704
|
|
|
3203
3705
|
|
|
3204
|
-
col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3706
|
+
col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3205
3707
|
|
|
3206
3708
|
Validate whether column values match a regular expression pattern.
|
|
3207
3709
|
|
|
@@ -3223,10 +3725,15 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
|
|
|
3223
3725
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3224
3726
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3225
3727
|
pre
|
|
3226
|
-
|
|
3728
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3227
3729
|
interrogation. This function should take a table as input and return a modified table.
|
|
3228
3730
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3229
3731
|
argument.
|
|
3732
|
+
segments
|
|
3733
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3734
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3735
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3736
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3230
3737
|
thresholds
|
|
3231
3738
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
3232
3739
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3268,6 +3775,42 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
|
|
|
3268
3775
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
3269
3776
|
subsequent validation steps.
|
|
3270
3777
|
|
|
3778
|
+
Segmentation
|
|
3779
|
+
------------
|
|
3780
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3781
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3782
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3783
|
+
column.
|
|
3784
|
+
|
|
3785
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3786
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3787
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3788
|
+
region.
|
|
3789
|
+
|
|
3790
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3791
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3792
|
+
segment on only specific dates, you can provide a tuple like
|
|
3793
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3794
|
+
(i.e., no validation steps will be created for them).
|
|
3795
|
+
|
|
3796
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3797
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3798
|
+
|
|
3799
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3800
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3801
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3802
|
+
columns
|
|
3803
|
+
|
|
3804
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3805
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3806
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3807
|
+
identify issues within specific segments.
|
|
3808
|
+
|
|
3809
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3810
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3811
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3812
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3813
|
+
|
|
3271
3814
|
Thresholds
|
|
3272
3815
|
----------
|
|
3273
3816
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3349,7 +3892,7 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
|
|
|
3349
3892
|
string values of rows 1 and 2 in column `b`.
|
|
3350
3893
|
|
|
3351
3894
|
|
|
3352
|
-
col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3895
|
+
col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3353
3896
|
|
|
3354
3897
|
Validate column values using a custom expression.
|
|
3355
3898
|
|
|
@@ -3366,10 +3909,15 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, thresholds: 'int
|
|
|
3366
3909
|
be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
|
|
3367
3910
|
should either be a lambda expression or a Narwhals column expression.
|
|
3368
3911
|
pre
|
|
3369
|
-
|
|
3912
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3370
3913
|
interrogation. This function should take a table as input and return a modified table.
|
|
3371
3914
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3372
3915
|
argument.
|
|
3916
|
+
segments
|
|
3917
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3918
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3919
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3920
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3373
3921
|
thresholds
|
|
3374
3922
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
3375
3923
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3409,6 +3957,42 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, thresholds: 'int
|
|
|
3409
3957
|
transformed table, it only exists during the validation step and is not stored in the
|
|
3410
3958
|
`Validate` object or used in subsequent validation steps.
|
|
3411
3959
|
|
|
3960
|
+
Segmentation
|
|
3961
|
+
------------
|
|
3962
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3963
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3964
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3965
|
+
column.
|
|
3966
|
+
|
|
3967
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3968
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3969
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3970
|
+
region.
|
|
3971
|
+
|
|
3972
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3973
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3974
|
+
segment on only specific dates, you can provide a tuple like
|
|
3975
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3976
|
+
(i.e., no validation steps will be created for them).
|
|
3977
|
+
|
|
3978
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3979
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3980
|
+
|
|
3981
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3982
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3983
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3984
|
+
columns
|
|
3985
|
+
|
|
3986
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3987
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3988
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3989
|
+
identify issues within specific segments.
|
|
3990
|
+
|
|
3991
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3992
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3993
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3994
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3995
|
+
|
|
3412
3996
|
Thresholds
|
|
3413
3997
|
----------
|
|
3414
3998
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3597,7 +4181,7 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
|
|
|
3597
4181
|
failing validation step (the check for column `c`, which doesn't exist).
|
|
3598
4182
|
|
|
3599
4183
|
|
|
3600
|
-
rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4184
|
+
rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3601
4185
|
|
|
3602
4186
|
Validate whether rows in the table are distinct.
|
|
3603
4187
|
|
|
@@ -3613,10 +4197,15 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
3613
4197
|
columns are supplied, the distinct comparison will be made over the combination of
|
|
3614
4198
|
values in those columns.
|
|
3615
4199
|
pre
|
|
3616
|
-
|
|
4200
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3617
4201
|
interrogation. This function should take a table as input and return a modified table.
|
|
3618
4202
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3619
4203
|
argument.
|
|
4204
|
+
segments
|
|
4205
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4206
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4207
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4208
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3620
4209
|
thresholds
|
|
3621
4210
|
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
3622
4211
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3658,6 +4247,42 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
3658
4247
|
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
3659
4248
|
or used in subsequent validation steps.
|
|
3660
4249
|
|
|
4250
|
+
Segmentation
|
|
4251
|
+
------------
|
|
4252
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4253
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4254
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
4255
|
+
column.
|
|
4256
|
+
|
|
4257
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4258
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4259
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4260
|
+
region.
|
|
4261
|
+
|
|
4262
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4263
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4264
|
+
segment on only specific dates, you can provide a tuple like
|
|
4265
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4266
|
+
(i.e., no validation steps will be created for them).
|
|
4267
|
+
|
|
4268
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4269
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4270
|
+
|
|
4271
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4272
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4273
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4274
|
+
columns
|
|
4275
|
+
|
|
4276
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4277
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4278
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4279
|
+
identify issues within specific segments.
|
|
4280
|
+
|
|
4281
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4282
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4283
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4284
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4285
|
+
|
|
3661
4286
|
Thresholds
|
|
3662
4287
|
----------
|
|
3663
4288
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3742,6 +4367,192 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
3742
4367
|
others.
|
|
3743
4368
|
|
|
3744
4369
|
|
|
4370
|
+
rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4371
|
+
|
|
4372
|
+
Validate whether row data are complete by having no missing values.
|
|
4373
|
+
|
|
4374
|
+
The `rows_complete()` method checks whether rows in the table are complete. Completeness
|
|
4375
|
+
of a row means that there are no missing values within the row. This validation will operate
|
|
4376
|
+
over the number of test units that is equal to the number of rows in the table (determined
|
|
4377
|
+
after any `pre=` mutation has been applied). A subset of columns can be specified for the
|
|
4378
|
+
completeness check. If no subset is provided, all columns in the table will be used.
|
|
4379
|
+
|
|
4380
|
+
Parameters
|
|
4381
|
+
----------
|
|
4382
|
+
columns_subset
|
|
4383
|
+
A single column or a list of columns to use as a subset for the completeness check. If
|
|
4384
|
+
`None` (the default), then all columns in the table will be used.
|
|
4385
|
+
pre
|
|
4386
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4387
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
4388
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4389
|
+
argument.
|
|
4390
|
+
segments
|
|
4391
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4392
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4393
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4394
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4395
|
+
thresholds
|
|
4396
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
4397
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
4398
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
4399
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
4400
|
+
section for information on how to set threshold levels.
|
|
4401
|
+
actions
|
|
4402
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
4403
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
4404
|
+
define the actions.
|
|
4405
|
+
brief
|
|
4406
|
+
An optional brief description of the validation step that will be displayed in the
|
|
4407
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
4408
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
4409
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
4410
|
+
won't be a brief.
|
|
4411
|
+
active
|
|
4412
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
4413
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
4414
|
+
for the steps unchanged).
|
|
4415
|
+
|
|
4416
|
+
Returns
|
|
4417
|
+
-------
|
|
4418
|
+
Validate
|
|
4419
|
+
The `Validate` object with the added validation step.
|
|
4420
|
+
|
|
4421
|
+
Preprocessing
|
|
4422
|
+
-------------
|
|
4423
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
4424
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
4425
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
4426
|
+
before the validation step is applied.
|
|
4427
|
+
|
|
4428
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
4429
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
4430
|
+
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
4431
|
+
columns via `columns_subset=` that are expected to be present in the transformed table, but
|
|
4432
|
+
may not exist in the table before preprocessing. Regarding the lifetime of the transformed
|
|
4433
|
+
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
4434
|
+
or used in subsequent validation steps.
|
|
4435
|
+
|
|
4436
|
+
Segmentation
|
|
4437
|
+
------------
|
|
4438
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4439
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4440
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
4441
|
+
column.
|
|
4442
|
+
|
|
4443
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4444
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4445
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4446
|
+
region.
|
|
4447
|
+
|
|
4448
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4449
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4450
|
+
segment on only specific dates, you can provide a tuple like
|
|
4451
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4452
|
+
(i.e., no validation steps will be created for them).
|
|
4453
|
+
|
|
4454
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4455
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4456
|
+
|
|
4457
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4458
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4459
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4460
|
+
columns
|
|
4461
|
+
|
|
4462
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4463
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4464
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4465
|
+
identify issues within specific segments.
|
|
4466
|
+
|
|
4467
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4468
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4469
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4470
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4471
|
+
|
|
4472
|
+
Thresholds
|
|
4473
|
+
----------
|
|
4474
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
4475
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
4476
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
4477
|
+
|
|
4478
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
4479
|
+
can either be set as a proportion failing of all test units (a value between `0` to `1`),
|
|
4480
|
+
or, the absolute number of failing test units (as integer that's `1` or greater).
|
|
4481
|
+
|
|
4482
|
+
Thresholds can be defined using one of these input schemes:
|
|
4483
|
+
|
|
4484
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
4485
|
+
thresholds)
|
|
4486
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
4487
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
4488
|
+
3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
|
|
4489
|
+
'critical'
|
|
4490
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
4491
|
+
for the 'warning' level only
|
|
4492
|
+
|
|
4493
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
4494
|
+
marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
|
|
4495
|
+
set, you're free to set any combination of them.
|
|
4496
|
+
|
|
4497
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
4498
|
+
take for each level of failure (using the `actions=` parameter).
|
|
4499
|
+
|
|
4500
|
+
Examples
|
|
4501
|
+
--------
|
|
4502
|
+
For the examples here, we'll use a simple Polars DataFrame with three string columns
|
|
4503
|
+
(`col_1`, `col_2`, and `col_3`). The table is shown below:
|
|
4504
|
+
|
|
4505
|
+
```python
|
|
4506
|
+
import pointblank as pb
|
|
4507
|
+
import polars as pl
|
|
4508
|
+
|
|
4509
|
+
tbl = pl.DataFrame(
|
|
4510
|
+
{
|
|
4511
|
+
"col_1": ["a", None, "c", "d"],
|
|
4512
|
+
"col_2": ["a", "a", "c", None],
|
|
4513
|
+
"col_3": ["a", "a", "d", None],
|
|
4514
|
+
}
|
|
4515
|
+
)
|
|
4516
|
+
|
|
4517
|
+
pb.preview(tbl)
|
|
4518
|
+
```
|
|
4519
|
+
|
|
4520
|
+
Let's validate that the rows in the table are complete with `rows_complete()`. We'll
|
|
4521
|
+
determine if this validation had any failing test units (there are four test units, one for
|
|
4522
|
+
each row). A failing test unit means that a given row is not complete (i.e., has at least
|
|
4523
|
+
one missing value).
|
|
4524
|
+
|
|
4525
|
+
```python
|
|
4526
|
+
validation = (
|
|
4527
|
+
pb.Validate(data=tbl)
|
|
4528
|
+
.rows_complete()
|
|
4529
|
+
.interrogate()
|
|
4530
|
+
)
|
|
4531
|
+
|
|
4532
|
+
validation
|
|
4533
|
+
```
|
|
4534
|
+
|
|
4535
|
+
From this validation table we see that there are two failing test units. This is because
|
|
4536
|
+
two rows in the table have at least one missing value (the second row and the last row).
|
|
4537
|
+
|
|
4538
|
+
We can also use a subset of columns to determine completeness. Let's specify the subset
|
|
4539
|
+
using columns `col_2` and `col_3` for the next validation.
|
|
4540
|
+
|
|
4541
|
+
```python
|
|
4542
|
+
validation = (
|
|
4543
|
+
pb.Validate(data=tbl)
|
|
4544
|
+
.rows_complete(columns_subset=["col_2", "col_3"])
|
|
4545
|
+
.interrogate()
|
|
4546
|
+
)
|
|
4547
|
+
|
|
4548
|
+
validation
|
|
4549
|
+
```
|
|
4550
|
+
|
|
4551
|
+
The validation table reports a single failing test unit. The last row contains missing
|
|
4552
|
+
values in both the `col_2` and `col_3` columns.
|
|
4553
|
+
others.
|
|
4554
|
+
|
|
4555
|
+
|
|
3745
4556
|
col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3746
4557
|
|
|
3747
4558
|
Do columns in the table (and their types) match a predefined schema?
|
|
@@ -3779,7 +4590,7 @@ col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'boo
|
|
|
3779
4590
|
substring matches are allowed, so a schema data type of `Int` would match a target table
|
|
3780
4591
|
data type of `Int64`.
|
|
3781
4592
|
pre
|
|
3782
|
-
|
|
4593
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3783
4594
|
interrogation. This function should take a table as input and return a modified table.
|
|
3784
4595
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3785
4596
|
argument.
|
|
@@ -3932,7 +4743,7 @@ row_count_match(self, count: 'int | FrameT | Any', tol: 'Tolerance' = 0, inverse
|
|
|
3932
4743
|
Should the validation step be inverted? If `True`, then the expectation is that the row
|
|
3933
4744
|
count of the target table should not match the specified `count=` value.
|
|
3934
4745
|
pre
|
|
3935
|
-
|
|
4746
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3936
4747
|
interrogation. This function should take a table as input and return a modified table.
|
|
3937
4748
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3938
4749
|
argument.
|
|
@@ -4078,7 +4889,7 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
|
|
|
4078
4889
|
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
4079
4890
|
column count of the target table should not match the specified `count=` value.
|
|
4080
4891
|
pre
|
|
4081
|
-
|
|
4892
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4082
4893
|
interrogation. This function should take a table as input and return a modified table.
|
|
4083
4894
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4084
4895
|
argument.
|
|
@@ -5936,8 +6747,8 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
|
|
|
5936
6747
|
table object, which can be displayed in a notebook or exported to an HTML file.
|
|
5937
6748
|
|
|
5938
6749
|
:::{.callout-warning}
|
|
5939
|
-
The `get_step_report()` is still experimental. Please report any issues you encounter
|
|
5940
|
-
[Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
6750
|
+
The `get_step_report()` method is still experimental. Please report any issues you encounter
|
|
6751
|
+
in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
5941
6752
|
:::
|
|
5942
6753
|
|
|
5943
6754
|
Parameters
|
|
@@ -5970,6 +6781,36 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
|
|
|
5970
6781
|
GT
|
|
5971
6782
|
A GT table object that represents the detailed report for the validation step.
|
|
5972
6783
|
|
|
6784
|
+
Types of Step Reports
|
|
6785
|
+
---------------------
|
|
6786
|
+
The `get_step_report()` method produces a report based on the *type* of validation step.
|
|
6787
|
+
The following row-based validation methods will produce a report that shows the rows of the
|
|
6788
|
+
data that failed because of failing test units within one or more columns:
|
|
6789
|
+
|
|
6790
|
+
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
6791
|
+
- [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
|
|
6792
|
+
- [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
|
|
6793
|
+
- [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
|
|
6794
|
+
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
6795
|
+
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
6796
|
+
- [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
|
|
6797
|
+
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
6798
|
+
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
6799
|
+
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
6800
|
+
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
6801
|
+
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
6802
|
+
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
6803
|
+
- [`rows_complete()`](`pointblank.Validate.rows_complete`)
|
|
6804
|
+
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
6805
|
+
|
|
6806
|
+
The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
|
|
6807
|
+
report that shows duplicate rows (or duplicate values in one or a set of columns as defined
|
|
6808
|
+
in that method's `columns_subset=` parameter).
|
|
6809
|
+
|
|
6810
|
+
The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
|
|
6811
|
+
produce a report that shows the schema of the data table and the schema of the validation
|
|
6812
|
+
step. The report will indicate whether the schemas match or not.
|
|
6813
|
+
|
|
5973
6814
|
Examples
|
|
5974
6815
|
--------
|
|
5975
6816
|
Let's create a validation plan with a few validation steps and interrogate the data. With
|
|
@@ -5989,7 +6830,7 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
|
|
|
5989
6830
|
.col_vals_lt(columns="d", value=3500)
|
|
5990
6831
|
.col_vals_between(columns="c", left=1, right=8)
|
|
5991
6832
|
.col_vals_gt(columns="a", value=3)
|
|
5992
|
-
.col_vals_regex(columns="b", pattern=r"
|
|
6833
|
+
.col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
|
|
5993
6834
|
.interrogate()
|
|
5994
6835
|
)
|
|
5995
6836
|
|
|
@@ -6044,17 +6885,133 @@ get_json_report(self, use_fields: 'list[str] | None' = None, exclude_fields: 'li
|
|
|
6044
6885
|
|
|
6045
6886
|
Get a report of the validation results as a JSON-formatted string.
|
|
6046
6887
|
|
|
6888
|
+
The `get_json_report()` method provides a machine-readable report of validation results in
|
|
6889
|
+
JSON format. This is particularly useful for programmatic processing, storing validation
|
|
6890
|
+
results, or integrating with other systems. The report includes detailed information about
|
|
6891
|
+
each validation step, such as assertion type, columns validated, threshold values, test
|
|
6892
|
+
results, and more.
|
|
6893
|
+
|
|
6894
|
+
By default, all available validation information fields are included in the report. However,
|
|
6895
|
+
you can customize the fields to include or exclude using the `use_fields=` and
|
|
6896
|
+
`exclude_fields=` parameters.
|
|
6897
|
+
|
|
6047
6898
|
Parameters
|
|
6048
6899
|
----------
|
|
6049
6900
|
use_fields
|
|
6050
|
-
|
|
6901
|
+
An optional list of specific fields to include in the report. If provided, only these
|
|
6902
|
+
fields will be included in the JSON output. If `None` (the default), all standard
|
|
6903
|
+
validation report fields are included. Have a look at the *Available Report Fields*
|
|
6904
|
+
section below for a list of fields that can be included in the report.
|
|
6051
6905
|
exclude_fields
|
|
6052
|
-
|
|
6906
|
+
An optional list of fields to exclude from the report. If provided, these fields will
|
|
6907
|
+
be omitted from the JSON output. If `None` (the default), no fields are excluded.
|
|
6908
|
+
This parameter cannot be used together with `use_fields=`. The *Available Report Fields*
|
|
6909
|
+
section provides a listing of fields that can be excluded from the report.
|
|
6053
6910
|
|
|
6054
6911
|
Returns
|
|
6055
6912
|
-------
|
|
6056
6913
|
str
|
|
6057
|
-
A JSON-formatted string representing the validation report
|
|
6914
|
+
A JSON-formatted string representing the validation report, with each validation step
|
|
6915
|
+
as an object in the report array.
|
|
6916
|
+
|
|
6917
|
+
Available Report Fields
|
|
6918
|
+
-----------------------
|
|
6919
|
+
The JSON report can include any of the standard validation report fields, including:
|
|
6920
|
+
|
|
6921
|
+
- `i`: the step number (1-indexed)
|
|
6922
|
+
- `i_o`: the original step index from the validation plan (pre-expansion)
|
|
6923
|
+
- `assertion_type`: the type of validation assertion (e.g., `"col_vals_gt"`, etc.)
|
|
6924
|
+
- `column`: the column being validated (or columns used in certain validations)
|
|
6925
|
+
- `values`: the comparison values or parameters used in the validation
|
|
6926
|
+
- `inclusive`: whether the comparison is inclusive (for range-based validations)
|
|
6927
|
+
- `na_pass`: whether `NA`/`Null` values are considered passing (for certain validations)
|
|
6928
|
+
- `pre`: preprocessing function applied before validation
|
|
6929
|
+
- `segments`: data segments to which the validation was applied
|
|
6930
|
+
- `thresholds`: threshold level statement that was used for the validation step
|
|
6931
|
+
- `label`: custom label for the validation step
|
|
6932
|
+
- `brief`: a brief description of the validation step
|
|
6933
|
+
- `active`: whether the validation step is active
|
|
6934
|
+
- `all_passed`: whether all test units passed in the step
|
|
6935
|
+
- `n`: total number of test units
|
|
6936
|
+
- `n_passed`, `n_failed`: number of test units that passed and failed
|
|
6937
|
+
- `f_passed`, `f_failed`: fraction of test units that passed and failed
|
|
6938
|
+
- `warning`, `error`, `critical`: whether the namesake threshold level was exceeded (is
|
|
6939
|
+
`null` if threshold not set)
|
|
6940
|
+
- `time_processed`: when the validation step was processed (ISO 8601 format)
|
|
6941
|
+
- `proc_duration_s`: the processing duration in seconds
|
|
6942
|
+
|
|
6943
|
+
Examples
|
|
6944
|
+
--------
|
|
6945
|
+
Let's create a validation plan with a few validation steps and generate a JSON report of the
|
|
6946
|
+
results:
|
|
6947
|
+
|
|
6948
|
+
```python
|
|
6949
|
+
import pointblank as pb
|
|
6950
|
+
import polars as pl
|
|
6951
|
+
|
|
6952
|
+
# Create a sample DataFrame
|
|
6953
|
+
tbl = pl.DataFrame({
|
|
6954
|
+
"a": [5, 7, 8, 9],
|
|
6955
|
+
"b": [3, 4, 2, 1]
|
|
6956
|
+
})
|
|
6957
|
+
|
|
6958
|
+
# Create and execute a validation plan
|
|
6959
|
+
validation = (
|
|
6960
|
+
pb.Validate(data=tbl)
|
|
6961
|
+
.col_vals_gt(columns="a", value=6)
|
|
6962
|
+
.col_vals_lt(columns="b", value=4)
|
|
6963
|
+
.interrogate()
|
|
6964
|
+
)
|
|
6965
|
+
|
|
6966
|
+
# Get the full JSON report
|
|
6967
|
+
json_report = validation.get_json_report()
|
|
6968
|
+
|
|
6969
|
+
print(json_report)
|
|
6970
|
+
```
|
|
6971
|
+
|
|
6972
|
+
You can also customize which fields to include:
|
|
6973
|
+
|
|
6974
|
+
```python
|
|
6975
|
+
json_report = validation.get_json_report(
|
|
6976
|
+
use_fields=["i", "assertion_type", "column", "n_passed", "n_failed"]
|
|
6977
|
+
)
|
|
6978
|
+
|
|
6979
|
+
print(json_report)
|
|
6980
|
+
```
|
|
6981
|
+
|
|
6982
|
+
Or which fields to exclude:
|
|
6983
|
+
|
|
6984
|
+
```python
|
|
6985
|
+
json_report = validation.get_json_report(
|
|
6986
|
+
exclude_fields=[
|
|
6987
|
+
"i_o", "thresholds", "pre", "segments", "values",
|
|
6988
|
+
"na_pass", "inclusive", "label", "brief", "active",
|
|
6989
|
+
"time_processed", "proc_duration_s"
|
|
6990
|
+
]
|
|
6991
|
+
)
|
|
6992
|
+
|
|
6993
|
+
print(json_report)
|
|
6994
|
+
```
|
|
6995
|
+
|
|
6996
|
+
The JSON output can be further processed or analyzed programmatically:
|
|
6997
|
+
|
|
6998
|
+
```python
|
|
6999
|
+
import json
|
|
7000
|
+
|
|
7001
|
+
# Parse the JSON report
|
|
7002
|
+
report_data = json.loads(validation.get_json_report())
|
|
7003
|
+
|
|
7004
|
+
# Extract and analyze validation results
|
|
7005
|
+
failing_steps = [step for step in report_data if step["n_failed"] > 0]
|
|
7006
|
+
print(f"Number of failing validation steps: {len(failing_steps)}")
|
|
7007
|
+
```
|
|
7008
|
+
|
|
7009
|
+
See Also
|
|
7010
|
+
--------
|
|
7011
|
+
- [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`): Get a formatted HTML
|
|
7012
|
+
report as a GT table
|
|
7013
|
+
- [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`): Get rows that
|
|
7014
|
+
failed validation
|
|
6058
7015
|
|
|
6059
7016
|
|
|
6060
7017
|
get_sundered_data(self, type='pass') -> 'FrameT'
|
|
@@ -7378,9 +8335,9 @@ col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
|
|
|
7378
8335
|
Here's an example using a DuckDB table handled by Ibis:
|
|
7379
8336
|
|
|
7380
8337
|
```python
|
|
7381
|
-
|
|
8338
|
+
nycflights = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
|
|
7382
8339
|
|
|
7383
|
-
pb.col_summary_tbl(data=
|
|
8340
|
+
pb.col_summary_tbl(data=nycflights, tbl_name="nycflights")
|
|
7384
8341
|
```
|
|
7385
8342
|
|
|
7386
8343
|
|
|
@@ -7782,16 +8739,18 @@ get_row_count(data: 'FrameT | Any') -> 'int'
|
|
|
7782
8739
|
for the `game_revenue` dataset.
|
|
7783
8740
|
|
|
7784
8741
|
|
|
7785
|
-
get_action_metadata()
|
|
8742
|
+
get_action_metadata() -> 'dict | None'
|
|
7786
8743
|
Access step-level metadata when authoring custom actions.
|
|
7787
8744
|
|
|
7788
8745
|
Get the metadata for the validation step where an action was triggered. This can be called by
|
|
7789
|
-
user functions to get the metadata for the current action.
|
|
8746
|
+
user functions to get the metadata for the current action. This function can only be used within
|
|
8747
|
+
callables crafted for the [`Actions`](`pointblank.Actions`) class.
|
|
7790
8748
|
|
|
7791
8749
|
Returns
|
|
7792
8750
|
-------
|
|
7793
|
-
dict
|
|
7794
|
-
A dictionary containing the metadata for the current step.
|
|
8751
|
+
dict | None
|
|
8752
|
+
A dictionary containing the metadata for the current step. If called outside of an action
|
|
8753
|
+
(i.e., when no action is being executed), this function will return `None`.
|
|
7795
8754
|
|
|
7796
8755
|
Description of the Metadata Fields
|
|
7797
8756
|
----------------------------------
|
|
@@ -7826,7 +8785,7 @@ Access step-level metadata when authoring custom actions.
|
|
|
7826
8785
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
7827
8786
|
actions=pb.Actions(warning=log_issue),
|
|
7828
8787
|
)
|
|
7829
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
8788
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
7830
8789
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
7831
8790
|
.col_vals_gt(
|
|
7832
8791
|
columns="session_duration",
|
|
@@ -7844,19 +8803,26 @@ Access step-level metadata when authoring custom actions.
|
|
|
7844
8803
|
- the `metadata` is a dictionary that is used to craft the log message
|
|
7845
8804
|
- the action is passed as a bare function to the `Actions` object within the `Validate` object
|
|
7846
8805
|
(placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
|
|
8806
|
+
|
|
8807
|
+
See Also
|
|
8808
|
+
--------
|
|
8809
|
+
Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
|
|
8810
|
+
actions for validation steps that exceed a set threshold value.
|
|
7847
8811
|
|
|
7848
8812
|
|
|
7849
|
-
get_validation_summary()
|
|
8813
|
+
get_validation_summary() -> 'dict | None'
|
|
7850
8814
|
Access validation summary information when authoring final actions.
|
|
7851
8815
|
|
|
7852
8816
|
This function provides a convenient way to access summary information about the validation
|
|
7853
8817
|
process within a final action. It returns a dictionary with key metrics from the validation
|
|
7854
|
-
process.
|
|
8818
|
+
process. This function can only be used within callables crafted for the
|
|
8819
|
+
[`FinalActions`](`pointblank.FinalActions`) class.
|
|
7855
8820
|
|
|
7856
8821
|
Returns
|
|
7857
8822
|
-------
|
|
7858
8823
|
dict | None
|
|
7859
|
-
A dictionary containing validation metrics
|
|
8824
|
+
A dictionary containing validation metrics. If called outside of a final action context,
|
|
8825
|
+
this function will return `None`.
|
|
7860
8826
|
|
|
7861
8827
|
Description of the Summary Fields
|
|
7862
8828
|
--------------------------------
|
|
@@ -7946,6 +8912,11 @@ Access validation summary information when authoring final actions.
|
|
|
7946
8912
|
|
|
7947
8913
|
Final actions work well with both simple logging and more complex notification systems, allowing
|
|
7948
8914
|
you to integrate validation results into your broader data quality workflows.
|
|
8915
|
+
|
|
8916
|
+
See Also
|
|
8917
|
+
--------
|
|
8918
|
+
Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
|
|
8919
|
+
custom actions that are executed after all validation steps have been completed.
|
|
7949
8920
|
|
|
7950
8921
|
|
|
7951
8922
|
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
@@ -8329,7 +9300,6 @@ A validation with a comprehensive set of rules
|
|
|
8329
9300
|
```python
|
|
8330
9301
|
import pointblank as pb
|
|
8331
9302
|
import polars as pl
|
|
8332
|
-
import narwhals as nw
|
|
8333
9303
|
|
|
8334
9304
|
validation = (
|
|
8335
9305
|
pb.Validate(
|