pointblank 0.8.7__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +11 -10
- pointblank/_typing.py +19 -3
- pointblank/data/api-docs.txt +716 -49
- pointblank/datascan.py +4 -4
- pointblank/draft.py +1 -1
- pointblank/thresholds.py +10 -0
- pointblank/validate.py +1061 -48
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/METADATA +6 -2
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/RECORD +12 -12
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/WHEEL +1 -1
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
|
@@ -45,7 +45,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
|
|
|
45
45
|
The table to validate, which could be a DataFrame object or an Ibis table object. Read the
|
|
46
46
|
*Supported Input Table Types* section for details on the supported table types.
|
|
47
47
|
tbl_name
|
|
48
|
-
|
|
48
|
+
An optional name to assign to the input table object. If no value is provided, a name will
|
|
49
49
|
be generated based on whatever information is available. This table name will be displayed
|
|
50
50
|
in the header area of the tabular report.
|
|
51
51
|
label
|
|
@@ -628,6 +628,11 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: '
|
|
|
628
628
|
all three thresholds are exceeded in step 3, the 'warning' action of executing the function
|
|
629
629
|
occurs (resulting in a message being printed to the console). If actions were set for the other
|
|
630
630
|
two threshold levels, they would also be executed.
|
|
631
|
+
|
|
632
|
+
See Also
|
|
633
|
+
--------
|
|
634
|
+
The [`get_action_metadata()`](`pointblank.get_action_metadata`) function, which can be used to
|
|
635
|
+
retrieve metadata about the step where the action is executed.
|
|
631
636
|
|
|
632
637
|
|
|
633
638
|
FinalActions(*args)
|
|
@@ -723,6 +728,11 @@ FinalActions(*args)
|
|
|
723
728
|
.interrogate()
|
|
724
729
|
)
|
|
725
730
|
```
|
|
731
|
+
|
|
732
|
+
See Also
|
|
733
|
+
--------
|
|
734
|
+
The [`get_validation_summary()`](`pointblank.get_validation_summary`) function, which can be
|
|
735
|
+
used to retrieve the summary of the validation results.
|
|
726
736
|
|
|
727
737
|
|
|
728
738
|
Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'any | None' = None, **kwargs)
|
|
@@ -1075,7 +1085,7 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
|
|
|
1075
1085
|
data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
|
|
1076
1086
|
|
|
1077
1087
|
# Draft a validation plan for the "nycflights" table
|
|
1078
|
-
pb.DraftValidation(data=
|
|
1088
|
+
pb.DraftValidation(data=data, model="anthropic:claude-3-5-sonnet-latest")
|
|
1079
1089
|
```
|
|
1080
1090
|
|
|
1081
1091
|
The output will be a drafted validation plan for the `"nycflights"` table and this will appear
|
|
@@ -1154,7 +1164,7 @@ Validation steps can be thought of as sequential validations on the target
|
|
|
1154
1164
|
data. We call `Validate`'s validation methods to build up a validation plan: a collection of steps
|
|
1155
1165
|
that, in the aggregate, provides good validation coverage.
|
|
1156
1166
|
|
|
1157
|
-
col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1167
|
+
col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1158
1168
|
|
|
1159
1169
|
Are column data greater than a fixed value or data in another column?
|
|
1160
1170
|
|
|
@@ -1181,10 +1191,15 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1181
1191
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1182
1192
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1183
1193
|
pre
|
|
1184
|
-
|
|
1194
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1185
1195
|
interrogation. This function should take a table as input and return a modified table.
|
|
1186
1196
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1187
1197
|
argument.
|
|
1198
|
+
segments
|
|
1199
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
1200
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
1201
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
1202
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1188
1203
|
thresholds
|
|
1189
1204
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
1190
1205
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1247,6 +1262,42 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1247
1262
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1248
1263
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1249
1264
|
|
|
1265
|
+
Segmentation
|
|
1266
|
+
------------
|
|
1267
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
1268
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
1269
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
1270
|
+
column.
|
|
1271
|
+
|
|
1272
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
1273
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
1274
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
1275
|
+
region.
|
|
1276
|
+
|
|
1277
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
1278
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
1279
|
+
segment on only specific dates, you can provide a tuple like
|
|
1280
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
1281
|
+
(i.e., no validation steps will be created for them).
|
|
1282
|
+
|
|
1283
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
1284
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
1285
|
+
|
|
1286
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
1287
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
1288
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
1289
|
+
columns
|
|
1290
|
+
|
|
1291
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
1292
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
1293
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
1294
|
+
identify issues within specific segments.
|
|
1295
|
+
|
|
1296
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
1297
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
1298
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
1299
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
1300
|
+
|
|
1250
1301
|
Thresholds
|
|
1251
1302
|
----------
|
|
1252
1303
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -1334,7 +1385,7 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1334
1385
|
- Row 3: `c` is `2` and `b` is `2`.
|
|
1335
1386
|
|
|
1336
1387
|
|
|
1337
|
-
col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1388
|
+
col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1338
1389
|
|
|
1339
1390
|
Are column data less than a fixed value or data in another column?
|
|
1340
1391
|
|
|
@@ -1361,10 +1412,15 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1361
1412
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1362
1413
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1363
1414
|
pre
|
|
1364
|
-
|
|
1415
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1365
1416
|
interrogation. This function should take a table as input and return a modified table.
|
|
1366
1417
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1367
1418
|
argument.
|
|
1419
|
+
segments
|
|
1420
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
1421
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
1422
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
1423
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1368
1424
|
thresholds
|
|
1369
1425
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
1370
1426
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1427,6 +1483,42 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1427
1483
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1428
1484
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1429
1485
|
|
|
1486
|
+
Segmentation
|
|
1487
|
+
------------
|
|
1488
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
1489
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
1490
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
1491
|
+
column.
|
|
1492
|
+
|
|
1493
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
1494
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
1495
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
1496
|
+
region.
|
|
1497
|
+
|
|
1498
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
1499
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
1500
|
+
segment on only specific dates, you can provide a tuple like
|
|
1501
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
1502
|
+
(i.e., no validation steps will be created for them).
|
|
1503
|
+
|
|
1504
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
1505
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
1506
|
+
|
|
1507
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
1508
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
1509
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
1510
|
+
columns
|
|
1511
|
+
|
|
1512
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
1513
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
1514
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
1515
|
+
identify issues within specific segments.
|
|
1516
|
+
|
|
1517
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
1518
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
1519
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
1520
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
1521
|
+
|
|
1430
1522
|
Thresholds
|
|
1431
1523
|
----------
|
|
1432
1524
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -1514,7 +1606,7 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1514
1606
|
- Row 2: `b` is `1` and `c` is `1`.
|
|
1515
1607
|
|
|
1516
1608
|
|
|
1517
|
-
col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1609
|
+
col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1518
1610
|
|
|
1519
1611
|
Are column data greater than or equal to a fixed value or data in another column?
|
|
1520
1612
|
|
|
@@ -1541,10 +1633,15 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1541
1633
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1542
1634
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1543
1635
|
pre
|
|
1544
|
-
|
|
1636
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1545
1637
|
interrogation. This function should take a table as input and return a modified table.
|
|
1546
1638
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1547
1639
|
argument.
|
|
1640
|
+
segments
|
|
1641
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
1642
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
1643
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
1644
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1548
1645
|
thresholds
|
|
1549
1646
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
1550
1647
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1607,6 +1704,42 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1607
1704
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1608
1705
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1609
1706
|
|
|
1707
|
+
Segmentation
|
|
1708
|
+
------------
|
|
1709
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
1710
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
1711
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
1712
|
+
column.
|
|
1713
|
+
|
|
1714
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
1715
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
1716
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
1717
|
+
region.
|
|
1718
|
+
|
|
1719
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
1720
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
1721
|
+
segment on only specific dates, you can provide a tuple like
|
|
1722
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
1723
|
+
(i.e., no validation steps will be created for them).
|
|
1724
|
+
|
|
1725
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
1726
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
1727
|
+
|
|
1728
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
1729
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
1730
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
1731
|
+
columns
|
|
1732
|
+
|
|
1733
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
1734
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
1735
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
1736
|
+
identify issues within specific segments.
|
|
1737
|
+
|
|
1738
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
1739
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
1740
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
1741
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
1742
|
+
|
|
1610
1743
|
Thresholds
|
|
1611
1744
|
----------
|
|
1612
1745
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -1694,7 +1827,7 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1694
1827
|
- Row 4: `b` is `3` and `c` is `4`.
|
|
1695
1828
|
|
|
1696
1829
|
|
|
1697
|
-
col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1830
|
+
col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1698
1831
|
|
|
1699
1832
|
Are column data less than or equal to a fixed value or data in another column?
|
|
1700
1833
|
|
|
@@ -1721,10 +1854,15 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1721
1854
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1722
1855
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1723
1856
|
pre
|
|
1724
|
-
|
|
1857
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1725
1858
|
interrogation. This function should take a table as input and return a modified table.
|
|
1726
1859
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1727
1860
|
argument.
|
|
1861
|
+
segments
|
|
1862
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
1863
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
1864
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
1865
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1728
1866
|
thresholds
|
|
1729
1867
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
1730
1868
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1787,6 +1925,42 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1787
1925
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1788
1926
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1789
1927
|
|
|
1928
|
+
Segmentation
|
|
1929
|
+
------------
|
|
1930
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
1931
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
1932
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
1933
|
+
column.
|
|
1934
|
+
|
|
1935
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
1936
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
1937
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
1938
|
+
region.
|
|
1939
|
+
|
|
1940
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
1941
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
1942
|
+
segment on only specific dates, you can provide a tuple like
|
|
1943
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
1944
|
+
(i.e., no validation steps will be created for them).
|
|
1945
|
+
|
|
1946
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
1947
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
1948
|
+
|
|
1949
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
1950
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
1951
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
1952
|
+
columns
|
|
1953
|
+
|
|
1954
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
1955
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
1956
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
1957
|
+
identify issues within specific segments.
|
|
1958
|
+
|
|
1959
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
1960
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
1961
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
1962
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
1963
|
+
|
|
1790
1964
|
Thresholds
|
|
1791
1965
|
----------
|
|
1792
1966
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -1874,7 +2048,7 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1874
2048
|
- Row 4: `c` is `3` and `b` is `2`.
|
|
1875
2049
|
|
|
1876
2050
|
|
|
1877
|
-
col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2051
|
+
col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
1878
2052
|
|
|
1879
2053
|
Are column data equal to a fixed value or data in another column?
|
|
1880
2054
|
|
|
@@ -1901,10 +2075,15 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1901
2075
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
1902
2076
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
1903
2077
|
pre
|
|
1904
|
-
|
|
2078
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
1905
2079
|
interrogation. This function should take a table as input and return a modified table.
|
|
1906
2080
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
1907
2081
|
argument.
|
|
2082
|
+
segments
|
|
2083
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2084
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2085
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2086
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
1908
2087
|
thresholds
|
|
1909
2088
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
1910
2089
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -1967,6 +2146,42 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
1967
2146
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
1968
2147
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
1969
2148
|
|
|
2149
|
+
Segmentation
|
|
2150
|
+
------------
|
|
2151
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2152
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2153
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2154
|
+
column.
|
|
2155
|
+
|
|
2156
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2157
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2158
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2159
|
+
region.
|
|
2160
|
+
|
|
2161
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2162
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2163
|
+
segment on only specific dates, you can provide a tuple like
|
|
2164
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2165
|
+
(i.e., no validation steps will be created for them).
|
|
2166
|
+
|
|
2167
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2168
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2169
|
+
|
|
2170
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2171
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2172
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2173
|
+
columns
|
|
2174
|
+
|
|
2175
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2176
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2177
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2178
|
+
identify issues within specific segments.
|
|
2179
|
+
|
|
2180
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2181
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2182
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2183
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2184
|
+
|
|
1970
2185
|
Thresholds
|
|
1971
2186
|
----------
|
|
1972
2187
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2052,7 +2267,7 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2052
2267
|
- Row 5: `a` is `5` and `b` is `4`.
|
|
2053
2268
|
|
|
2054
2269
|
|
|
2055
|
-
col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2270
|
+
col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2056
2271
|
|
|
2057
2272
|
Are column data not equal to a fixed value or data in another column?
|
|
2058
2273
|
|
|
@@ -2079,10 +2294,15 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2079
2294
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2080
2295
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2081
2296
|
pre
|
|
2082
|
-
|
|
2297
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2083
2298
|
interrogation. This function should take a table as input and return a modified table.
|
|
2084
2299
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2085
2300
|
argument.
|
|
2301
|
+
segments
|
|
2302
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2303
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2304
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2305
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2086
2306
|
thresholds
|
|
2087
2307
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2088
2308
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2145,6 +2365,42 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2145
2365
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2146
2366
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2147
2367
|
|
|
2368
|
+
Segmentation
|
|
2369
|
+
------------
|
|
2370
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2371
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2372
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
2373
|
+
column.
|
|
2374
|
+
|
|
2375
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2376
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2377
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2378
|
+
region.
|
|
2379
|
+
|
|
2380
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2381
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2382
|
+
segment on only specific dates, you can provide a tuple like
|
|
2383
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2384
|
+
(i.e., no validation steps will be created for them).
|
|
2385
|
+
|
|
2386
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2387
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2388
|
+
|
|
2389
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2390
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2391
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2392
|
+
columns
|
|
2393
|
+
|
|
2394
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2395
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2396
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2397
|
+
identify issues within specific segments.
|
|
2398
|
+
|
|
2399
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2400
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2401
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2402
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2403
|
+
|
|
2148
2404
|
Thresholds
|
|
2149
2405
|
----------
|
|
2150
2406
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2228,7 +2484,7 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
|
|
|
2228
2484
|
0 and 4, where `a` is `5` and `b` is `5` in both cases (i.e., they are equal to each other).
|
|
2229
2485
|
|
|
2230
2486
|
|
|
2231
|
-
col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2487
|
+
col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2232
2488
|
|
|
2233
2489
|
Do column data lie between two specified values or data in other columns?
|
|
2234
2490
|
|
|
@@ -2265,10 +2521,15 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2265
2521
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2266
2522
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2267
2523
|
pre
|
|
2268
|
-
|
|
2524
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2269
2525
|
interrogation. This function should take a table as input and return a modified table.
|
|
2270
2526
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2271
2527
|
argument.
|
|
2528
|
+
segments
|
|
2529
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2530
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2531
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2532
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2272
2533
|
thresholds
|
|
2273
2534
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2274
2535
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2333,6 +2594,42 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2333
2594
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2334
2595
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2335
2596
|
|
|
2597
|
+
Segmentation
|
|
2598
|
+
------------
|
|
2599
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2600
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2601
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
2602
|
+
column.
|
|
2603
|
+
|
|
2604
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2605
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2606
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2607
|
+
region.
|
|
2608
|
+
|
|
2609
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2610
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2611
|
+
segment on only specific dates, you can provide a tuple like
|
|
2612
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2613
|
+
(i.e., no validation steps will be created for them).
|
|
2614
|
+
|
|
2615
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2616
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2617
|
+
|
|
2618
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2619
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2620
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2621
|
+
columns
|
|
2622
|
+
|
|
2623
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2624
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2625
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2626
|
+
identify issues within specific segments.
|
|
2627
|
+
|
|
2628
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2629
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2630
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2631
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2632
|
+
|
|
2336
2633
|
Thresholds
|
|
2337
2634
|
----------
|
|
2338
2635
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2428,7 +2725,7 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2428
2725
|
- Row 4: `b` is `8` but the bounds are `3` (`a`) and `7` (`c`).
|
|
2429
2726
|
|
|
2430
2727
|
|
|
2431
|
-
col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2728
|
+
col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2432
2729
|
|
|
2433
2730
|
Do column data lie outside of two specified values or data in other columns?
|
|
2434
2731
|
|
|
@@ -2465,10 +2762,15 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2465
2762
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2466
2763
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2467
2764
|
pre
|
|
2468
|
-
|
|
2765
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2469
2766
|
interrogation. This function should take a table as input and return a modified table.
|
|
2470
2767
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2471
2768
|
argument.
|
|
2769
|
+
segments
|
|
2770
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2771
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2772
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2773
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2472
2774
|
thresholds
|
|
2473
2775
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2474
2776
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2533,6 +2835,42 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2533
2835
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2534
2836
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2535
2837
|
|
|
2838
|
+
Segmentation
|
|
2839
|
+
------------
|
|
2840
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2841
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2842
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
2843
|
+
column.
|
|
2844
|
+
|
|
2845
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2846
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2847
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2848
|
+
region.
|
|
2849
|
+
|
|
2850
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2851
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2852
|
+
segment on only specific dates, you can provide a tuple like
|
|
2853
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2854
|
+
(i.e., no validation steps will be created for them).
|
|
2855
|
+
|
|
2856
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2857
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2858
|
+
|
|
2859
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2860
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2861
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2862
|
+
columns
|
|
2863
|
+
|
|
2864
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2865
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2866
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2867
|
+
identify issues within specific segments.
|
|
2868
|
+
|
|
2869
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2870
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2871
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2872
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2873
|
+
|
|
2536
2874
|
Thresholds
|
|
2537
2875
|
----------
|
|
2538
2876
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2628,7 +2966,7 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
|
|
|
2628
2966
|
- Row 5: `b` is `6` and the bounds are `5` (`a`) and `7` (`c`).
|
|
2629
2967
|
|
|
2630
2968
|
|
|
2631
|
-
col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: '
|
|
2969
|
+
col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2632
2970
|
|
|
2633
2971
|
Validate whether column values are in a set of values.
|
|
2634
2972
|
|
|
@@ -2647,10 +2985,15 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
|
|
|
2647
2985
|
set
|
|
2648
2986
|
A collection of values to compare against.
|
|
2649
2987
|
pre
|
|
2650
|
-
|
|
2988
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2651
2989
|
interrogation. This function should take a table as input and return a modified table.
|
|
2652
2990
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2653
2991
|
argument.
|
|
2992
|
+
segments
|
|
2993
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2994
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2995
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2996
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2654
2997
|
thresholds
|
|
2655
2998
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2656
2999
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2692,6 +3035,42 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
|
|
|
2692
3035
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
2693
3036
|
subsequent validation steps.
|
|
2694
3037
|
|
|
3038
|
+
Segmentation
|
|
3039
|
+
------------
|
|
3040
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3041
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3042
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3043
|
+
column.
|
|
3044
|
+
|
|
3045
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3046
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3047
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3048
|
+
region.
|
|
3049
|
+
|
|
3050
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3051
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3052
|
+
segment on only specific dates, you can provide a tuple like
|
|
3053
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3054
|
+
(i.e., no validation steps will be created for them).
|
|
3055
|
+
|
|
3056
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3057
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3058
|
+
|
|
3059
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3060
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3061
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3062
|
+
columns
|
|
3063
|
+
|
|
3064
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3065
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3066
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3067
|
+
identify issues within specific segments.
|
|
3068
|
+
|
|
3069
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3070
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3071
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3072
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3073
|
+
|
|
2695
3074
|
Thresholds
|
|
2696
3075
|
----------
|
|
2697
3076
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2773,7 +3152,7 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
|
|
|
2773
3152
|
column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
|
|
2774
3153
|
|
|
2775
3154
|
|
|
2776
|
-
col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'list[float | int]', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3155
|
+
col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'list[float | int]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2777
3156
|
|
|
2778
3157
|
Validate whether column values are not in a set of values.
|
|
2779
3158
|
|
|
@@ -2792,10 +3171,15 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
2792
3171
|
set
|
|
2793
3172
|
A list of values to compare against.
|
|
2794
3173
|
pre
|
|
2795
|
-
|
|
3174
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2796
3175
|
interrogation. This function should take a table as input and return a modified table.
|
|
2797
3176
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2798
3177
|
argument.
|
|
3178
|
+
segments
|
|
3179
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3180
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3181
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3182
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2799
3183
|
thresholds
|
|
2800
3184
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2801
3185
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2837,6 +3221,42 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
2837
3221
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
2838
3222
|
subsequent validation steps.
|
|
2839
3223
|
|
|
3224
|
+
Segmentation
|
|
3225
|
+
------------
|
|
3226
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3227
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3228
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3229
|
+
column.
|
|
3230
|
+
|
|
3231
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3232
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3233
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3234
|
+
region.
|
|
3235
|
+
|
|
3236
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3237
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3238
|
+
segment on only specific dates, you can provide a tuple like
|
|
3239
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3240
|
+
(i.e., no validation steps will be created for them).
|
|
3241
|
+
|
|
3242
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3243
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3244
|
+
|
|
3245
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3246
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3247
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3248
|
+
columns
|
|
3249
|
+
|
|
3250
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3251
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3252
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3253
|
+
identify issues within specific segments.
|
|
3254
|
+
|
|
3255
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3256
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3257
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3258
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3259
|
+
|
|
2840
3260
|
Thresholds
|
|
2841
3261
|
----------
|
|
2842
3262
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2919,7 +3339,7 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
|
|
|
2919
3339
|
column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
|
|
2920
3340
|
|
|
2921
3341
|
|
|
2922
|
-
col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3342
|
+
col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
2923
3343
|
|
|
2924
3344
|
Validate whether values in a column are NULL.
|
|
2925
3345
|
|
|
@@ -2935,10 +3355,15 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
|
|
|
2935
3355
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
2936
3356
|
generated for each column.
|
|
2937
3357
|
pre
|
|
2938
|
-
|
|
3358
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2939
3359
|
interrogation. This function should take a table as input and return a modified table.
|
|
2940
3360
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2941
3361
|
argument.
|
|
3362
|
+
segments
|
|
3363
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3364
|
+
multiple steps (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3365
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3366
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2942
3367
|
thresholds
|
|
2943
3368
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2944
3369
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2980,6 +3405,42 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
|
|
|
2980
3405
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
2981
3406
|
subsequent validation steps.
|
|
2982
3407
|
|
|
3408
|
+
Segmentation
|
|
3409
|
+
------------
|
|
3410
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3411
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3412
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3413
|
+
column.
|
|
3414
|
+
|
|
3415
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3416
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3417
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3418
|
+
region.
|
|
3419
|
+
|
|
3420
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3421
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3422
|
+
segment on only specific dates, you can provide a tuple like
|
|
3423
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3424
|
+
(i.e., no validation steps will be created for them).
|
|
3425
|
+
|
|
3426
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3427
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3428
|
+
|
|
3429
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3430
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3431
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3432
|
+
columns
|
|
3433
|
+
|
|
3434
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3435
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3436
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3437
|
+
identify issues within specific segments.
|
|
3438
|
+
|
|
3439
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3440
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3441
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3442
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3443
|
+
|
|
2983
3444
|
Thresholds
|
|
2984
3445
|
----------
|
|
2985
3446
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3060,7 +3521,7 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
|
|
|
3060
3521
|
two non-Null values in column `b`.
|
|
3061
3522
|
|
|
3062
3523
|
|
|
3063
|
-
col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3524
|
+
col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3064
3525
|
|
|
3065
3526
|
Validate whether values in a column are not NULL.
|
|
3066
3527
|
|
|
@@ -3076,10 +3537,15 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
|
|
|
3076
3537
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
3077
3538
|
generated for each column.
|
|
3078
3539
|
pre
|
|
3079
|
-
|
|
3540
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3080
3541
|
interrogation. This function should take a table as input and return a modified table.
|
|
3081
3542
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3082
3543
|
argument.
|
|
3544
|
+
segments
|
|
3545
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3546
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3547
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3548
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3083
3549
|
thresholds
|
|
3084
3550
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3085
3551
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3121,6 +3587,42 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
|
|
|
3121
3587
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
3122
3588
|
subsequent validation steps.
|
|
3123
3589
|
|
|
3590
|
+
Segmentation
|
|
3591
|
+
------------
|
|
3592
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3593
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3594
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3595
|
+
column.
|
|
3596
|
+
|
|
3597
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3598
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3599
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3600
|
+
region.
|
|
3601
|
+
|
|
3602
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3603
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3604
|
+
segment on only specific dates, you can provide a tuple like
|
|
3605
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3606
|
+
(i.e., no validation steps will be created for them).
|
|
3607
|
+
|
|
3608
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3609
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3610
|
+
|
|
3611
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3612
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3613
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3614
|
+
columns
|
|
3615
|
+
|
|
3616
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3617
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3618
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3619
|
+
identify issues within specific segments.
|
|
3620
|
+
|
|
3621
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3622
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3623
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3624
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3625
|
+
|
|
3124
3626
|
Thresholds
|
|
3125
3627
|
----------
|
|
3126
3628
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3201,7 +3703,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
|
|
|
3201
3703
|
two Null values in column `b`.
|
|
3202
3704
|
|
|
3203
3705
|
|
|
3204
|
-
col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3706
|
+
col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3205
3707
|
|
|
3206
3708
|
Validate whether column values match a regular expression pattern.
|
|
3207
3709
|
|
|
@@ -3223,10 +3725,15 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
|
|
|
3223
3725
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3224
3726
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3225
3727
|
pre
|
|
3226
|
-
|
|
3728
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3227
3729
|
interrogation. This function should take a table as input and return a modified table.
|
|
3228
3730
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3229
3731
|
argument.
|
|
3732
|
+
segments
|
|
3733
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3734
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3735
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3736
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3230
3737
|
thresholds
|
|
3231
3738
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3232
3739
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3268,6 +3775,42 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
|
|
|
3268
3775
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
3269
3776
|
subsequent validation steps.
|
|
3270
3777
|
|
|
3778
|
+
Segmentation
|
|
3779
|
+
------------
|
|
3780
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3781
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3782
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3783
|
+
column.
|
|
3784
|
+
|
|
3785
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3786
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3787
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3788
|
+
region.
|
|
3789
|
+
|
|
3790
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3791
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3792
|
+
segment on only specific dates, you can provide a tuple like
|
|
3793
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3794
|
+
(i.e., no validation steps will be created for them).
|
|
3795
|
+
|
|
3796
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3797
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3798
|
+
|
|
3799
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3800
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3801
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3802
|
+
columns
|
|
3803
|
+
|
|
3804
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3805
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3806
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3807
|
+
identify issues within specific segments.
|
|
3808
|
+
|
|
3809
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3810
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3811
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3812
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3813
|
+
|
|
3271
3814
|
Thresholds
|
|
3272
3815
|
----------
|
|
3273
3816
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3349,7 +3892,7 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
|
|
|
3349
3892
|
string values of rows 1 and 2 in column `b`.
|
|
3350
3893
|
|
|
3351
3894
|
|
|
3352
|
-
col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3895
|
+
col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3353
3896
|
|
|
3354
3897
|
Validate column values using a custom expression.
|
|
3355
3898
|
|
|
@@ -3366,10 +3909,15 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, thresholds: 'int
|
|
|
3366
3909
|
be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
|
|
3367
3910
|
should either be a lambda expression or a Narwhals column expression.
|
|
3368
3911
|
pre
|
|
3369
|
-
|
|
3912
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3370
3913
|
interrogation. This function should take a table as input and return a modified table.
|
|
3371
3914
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3372
3915
|
argument.
|
|
3916
|
+
segments
|
|
3917
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3918
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3919
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3920
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3373
3921
|
thresholds
|
|
3374
3922
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3375
3923
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3409,6 +3957,42 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, thresholds: 'int
|
|
|
3409
3957
|
transformed table, it only exists during the validation step and is not stored in the
|
|
3410
3958
|
`Validate` object or used in subsequent validation steps.
|
|
3411
3959
|
|
|
3960
|
+
Segmentation
|
|
3961
|
+
------------
|
|
3962
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3963
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3964
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3965
|
+
column.
|
|
3966
|
+
|
|
3967
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3968
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3969
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3970
|
+
region.
|
|
3971
|
+
|
|
3972
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3973
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3974
|
+
segment on only specific dates, you can provide a tuple like
|
|
3975
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3976
|
+
(i.e., no validation steps will be created for them).
|
|
3977
|
+
|
|
3978
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3979
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3980
|
+
|
|
3981
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3982
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3983
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3984
|
+
columns
|
|
3985
|
+
|
|
3986
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3987
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3988
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3989
|
+
identify issues within specific segments.
|
|
3990
|
+
|
|
3991
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3992
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3993
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3994
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3995
|
+
|
|
3412
3996
|
Thresholds
|
|
3413
3997
|
----------
|
|
3414
3998
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3597,7 +4181,7 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
|
|
|
3597
4181
|
failing validation step (the check for column `c`, which doesn't exist).
|
|
3598
4182
|
|
|
3599
4183
|
|
|
3600
|
-
rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4184
|
+
rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
3601
4185
|
|
|
3602
4186
|
Validate whether rows in the table are distinct.
|
|
3603
4187
|
|
|
@@ -3613,10 +4197,15 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
3613
4197
|
columns are supplied, the distinct comparison will be made over the combination of
|
|
3614
4198
|
values in those columns.
|
|
3615
4199
|
pre
|
|
3616
|
-
|
|
4200
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3617
4201
|
interrogation. This function should take a table as input and return a modified table.
|
|
3618
4202
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3619
4203
|
argument.
|
|
4204
|
+
segments
|
|
4205
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4206
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4207
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4208
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3620
4209
|
thresholds
|
|
3621
4210
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3622
4211
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3658,6 +4247,42 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
|
|
|
3658
4247
|
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
3659
4248
|
or used in subsequent validation steps.
|
|
3660
4249
|
|
|
4250
|
+
Segmentation
|
|
4251
|
+
------------
|
|
4252
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4253
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4254
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
4255
|
+
column.
|
|
4256
|
+
|
|
4257
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4258
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4259
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4260
|
+
region.
|
|
4261
|
+
|
|
4262
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4263
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4264
|
+
segment on only specific dates, you can provide a tuple like
|
|
4265
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4266
|
+
(i.e., no validation steps will be created for them).
|
|
4267
|
+
|
|
4268
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4269
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4270
|
+
|
|
4271
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4272
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4273
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4274
|
+
columns
|
|
4275
|
+
|
|
4276
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4277
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4278
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4279
|
+
identify issues within specific segments.
|
|
4280
|
+
|
|
4281
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4282
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4283
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4284
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4285
|
+
|
|
3661
4286
|
Thresholds
|
|
3662
4287
|
----------
|
|
3663
4288
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3779,7 +4404,7 @@ col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'boo
|
|
|
3779
4404
|
substring matches are allowed, so a schema data type of `Int` would match a target table
|
|
3780
4405
|
data type of `Int64`.
|
|
3781
4406
|
pre
|
|
3782
|
-
|
|
4407
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3783
4408
|
interrogation. This function should take a table as input and return a modified table.
|
|
3784
4409
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3785
4410
|
argument.
|
|
@@ -3932,7 +4557,7 @@ row_count_match(self, count: 'int | FrameT | Any', tol: 'Tolerance' = 0, inverse
|
|
|
3932
4557
|
Should the validation step be inverted? If `True`, then the expectation is that the row
|
|
3933
4558
|
count of the target table should not match the specified `count=` value.
|
|
3934
4559
|
pre
|
|
3935
|
-
|
|
4560
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3936
4561
|
interrogation. This function should take a table as input and return a modified table.
|
|
3937
4562
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3938
4563
|
argument.
|
|
@@ -4078,7 +4703,7 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
|
|
|
4078
4703
|
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
4079
4704
|
column count of the target table should not match the specified `count=` value.
|
|
4080
4705
|
pre
|
|
4081
|
-
|
|
4706
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4082
4707
|
interrogation. This function should take a table as input and return a modified table.
|
|
4083
4708
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4084
4709
|
argument.
|
|
@@ -5936,8 +6561,8 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
|
|
|
5936
6561
|
table object, which can be displayed in a notebook or exported to an HTML file.
|
|
5937
6562
|
|
|
5938
6563
|
:::{.callout-warning}
|
|
5939
|
-
The `get_step_report()` is still experimental. Please report any issues you encounter
|
|
5940
|
-
[Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
6564
|
+
The `get_step_report()` method is still experimental. Please report any issues you encounter
|
|
6565
|
+
in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
5941
6566
|
:::
|
|
5942
6567
|
|
|
5943
6568
|
Parameters
|
|
@@ -5970,6 +6595,35 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
|
|
|
5970
6595
|
GT
|
|
5971
6596
|
A GT table object that represents the detailed report for the validation step.
|
|
5972
6597
|
|
|
6598
|
+
Types of Step Reports
|
|
6599
|
+
---------------------
|
|
6600
|
+
The `get_step_report()` method produces a report based on the *type* of validation step.
|
|
6601
|
+
The following row-based validation methods will produce a report that shows the rows of the
|
|
6602
|
+
data that failed because of failing test units within one or more columns:
|
|
6603
|
+
|
|
6604
|
+
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
6605
|
+
- [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
|
|
6606
|
+
- [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
|
|
6607
|
+
- [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
|
|
6608
|
+
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
6609
|
+
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
6610
|
+
- [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
|
|
6611
|
+
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
6612
|
+
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
6613
|
+
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
6614
|
+
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
6615
|
+
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
6616
|
+
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
6617
|
+
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
6618
|
+
|
|
6619
|
+
The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
|
|
6620
|
+
report that shows duplicate rows (or duplicate values in one or a set of columns as defined
|
|
6621
|
+
in that method's `columns_subset=` parameter).
|
|
6622
|
+
|
|
6623
|
+
The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
|
|
6624
|
+
produce a report that shows the schema of the data table and the schema of the validation
|
|
6625
|
+
step. The report will indicate whether the schemas match or not.
|
|
6626
|
+
|
|
5973
6627
|
Examples
|
|
5974
6628
|
--------
|
|
5975
6629
|
Let's create a validation plan with a few validation steps and interrogate the data. With
|
|
@@ -5989,7 +6643,7 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
|
|
|
5989
6643
|
.col_vals_lt(columns="d", value=3500)
|
|
5990
6644
|
.col_vals_between(columns="c", left=1, right=8)
|
|
5991
6645
|
.col_vals_gt(columns="a", value=3)
|
|
5992
|
-
.col_vals_regex(columns="b", pattern=r"
|
|
6646
|
+
.col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
|
|
5993
6647
|
.interrogate()
|
|
5994
6648
|
)
|
|
5995
6649
|
|
|
@@ -7378,9 +8032,9 @@ col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
|
|
|
7378
8032
|
Here's an example using a DuckDB table handled by Ibis:
|
|
7379
8033
|
|
|
7380
8034
|
```python
|
|
7381
|
-
|
|
8035
|
+
nycflights = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
|
|
7382
8036
|
|
|
7383
|
-
pb.col_summary_tbl(data=
|
|
8037
|
+
pb.col_summary_tbl(data=nycflights, tbl_name="nycflights")
|
|
7384
8038
|
```
|
|
7385
8039
|
|
|
7386
8040
|
|
|
@@ -7782,16 +8436,18 @@ get_row_count(data: 'FrameT | Any') -> 'int'
|
|
|
7782
8436
|
for the `game_revenue` dataset.
|
|
7783
8437
|
|
|
7784
8438
|
|
|
7785
|
-
get_action_metadata()
|
|
8439
|
+
get_action_metadata() -> 'dict | None'
|
|
7786
8440
|
Access step-level metadata when authoring custom actions.
|
|
7787
8441
|
|
|
7788
8442
|
Get the metadata for the validation step where an action was triggered. This can be called by
|
|
7789
|
-
user functions to get the metadata for the current action.
|
|
8443
|
+
user functions to get the metadata for the current action. This function can only be used within
|
|
8444
|
+
callables crafted for the [`Actions`](`pointblank.Actions`) class.
|
|
7790
8445
|
|
|
7791
8446
|
Returns
|
|
7792
8447
|
-------
|
|
7793
|
-
dict
|
|
7794
|
-
A dictionary containing the metadata for the current step.
|
|
8448
|
+
dict | None
|
|
8449
|
+
A dictionary containing the metadata for the current step. If called outside of an action
|
|
8450
|
+
(i.e., when no action is being executed), this function will return `None`.
|
|
7795
8451
|
|
|
7796
8452
|
Description of the Metadata Fields
|
|
7797
8453
|
----------------------------------
|
|
@@ -7826,7 +8482,7 @@ Access step-level metadata when authoring custom actions.
|
|
|
7826
8482
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
7827
8483
|
actions=pb.Actions(warning=log_issue),
|
|
7828
8484
|
)
|
|
7829
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
8485
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
7830
8486
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
7831
8487
|
.col_vals_gt(
|
|
7832
8488
|
columns="session_duration",
|
|
@@ -7844,19 +8500,26 @@ Access step-level metadata when authoring custom actions.
|
|
|
7844
8500
|
- the `metadata` is a dictionary that is used to craft the log message
|
|
7845
8501
|
- the action is passed as a bare function to the `Actions` object within the `Validate` object
|
|
7846
8502
|
(placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
|
|
8503
|
+
|
|
8504
|
+
See Also
|
|
8505
|
+
--------
|
|
8506
|
+
Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
|
|
8507
|
+
actions for validation steps that exceed a set threshold value.
|
|
7847
8508
|
|
|
7848
8509
|
|
|
7849
|
-
get_validation_summary()
|
|
8510
|
+
get_validation_summary() -> 'dict | None'
|
|
7850
8511
|
Access validation summary information when authoring final actions.
|
|
7851
8512
|
|
|
7852
8513
|
This function provides a convenient way to access summary information about the validation
|
|
7853
8514
|
process within a final action. It returns a dictionary with key metrics from the validation
|
|
7854
|
-
process.
|
|
8515
|
+
process. This function can only be used within callables crafted for the
|
|
8516
|
+
[`FinalActions`](`pointblank.FinalActions`) class.
|
|
7855
8517
|
|
|
7856
8518
|
Returns
|
|
7857
8519
|
-------
|
|
7858
8520
|
dict | None
|
|
7859
|
-
A dictionary containing validation metrics
|
|
8521
|
+
A dictionary containing validation metrics. If called outside of a final action context,
|
|
8522
|
+
this function will return `None`.
|
|
7860
8523
|
|
|
7861
8524
|
Description of the Summary Fields
|
|
7862
8525
|
--------------------------------
|
|
@@ -7946,6 +8609,11 @@ Access validation summary information when authoring final actions.
|
|
|
7946
8609
|
|
|
7947
8610
|
Final actions work well with both simple logging and more complex notification systems, allowing
|
|
7948
8611
|
you to integrate validation results into your broader data quality workflows.
|
|
8612
|
+
|
|
8613
|
+
See Also
|
|
8614
|
+
--------
|
|
8615
|
+
Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
|
|
8616
|
+
custom actions that are executed after all validation steps have been completed.
|
|
7949
8617
|
|
|
7950
8618
|
|
|
7951
8619
|
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
@@ -8329,7 +8997,6 @@ A validation with a comprehensive set of rules
|
|
|
8329
8997
|
```python
|
|
8330
8998
|
import pointblank as pb
|
|
8331
8999
|
import polars as pl
|
|
8332
|
-
import narwhals as nw
|
|
8333
9000
|
|
|
8334
9001
|
validation = (
|
|
8335
9002
|
pb.Validate(
|