pointblank 0.15.0__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +25 -1
- pointblank/_constants_translations.py +2361 -2
- pointblank/_interrogation.py +24 -0
- pointblank/_typing.py +37 -9
- pointblank/_utils.py +0 -355
- pointblank/_utils_llms_txt.py +661 -0
- pointblank/column.py +24 -0
- pointblank/data/api-docs.txt +336 -3
- pointblank/validate.py +2551 -926
- pointblank/yaml.py +10 -2
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/METADATA +9 -4
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/RECORD +17 -16
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/WHEEL +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/top_level.txt +0 -0
pointblank/column.py
CHANGED
|
@@ -268,9 +268,12 @@ def col(
|
|
|
268
268
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
269
269
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
270
270
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
271
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
272
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
271
273
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
272
274
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
273
275
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
276
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
274
277
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
275
278
|
|
|
276
279
|
If specifying a single column with certainty (you have the exact name), `col()` is not necessary
|
|
@@ -568,9 +571,12 @@ def starts_with(text: str, case_sensitive: bool = False) -> StartsWith:
|
|
|
568
571
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
569
572
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
570
573
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
574
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
575
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
571
576
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
572
577
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
573
578
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
579
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
574
580
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
575
581
|
|
|
576
582
|
The `starts_with()` selector function doesn't need to be used in isolation. Read the next
|
|
@@ -727,9 +733,12 @@ def ends_with(text: str, case_sensitive: bool = False) -> EndsWith:
|
|
|
727
733
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
728
734
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
729
735
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
736
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
737
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
730
738
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
731
739
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
732
740
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
741
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
733
742
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
734
743
|
|
|
735
744
|
The `ends_with()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -887,9 +896,12 @@ def contains(text: str, case_sensitive: bool = False) -> Contains:
|
|
|
887
896
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
888
897
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
889
898
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
899
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
900
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
890
901
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
891
902
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
892
903
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
904
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
893
905
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
894
906
|
|
|
895
907
|
The `contains()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -1047,9 +1059,12 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1047
1059
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
1048
1060
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
1049
1061
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
1062
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
1063
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
1050
1064
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
1051
1065
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
1052
1066
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
1067
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
1053
1068
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
1054
1069
|
|
|
1055
1070
|
The `matches()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -1189,9 +1204,12 @@ def everything() -> Everything:
|
|
|
1189
1204
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
1190
1205
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
1191
1206
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
1207
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
1208
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
1192
1209
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
1193
1210
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
1194
1211
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
1212
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
1195
1213
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
1196
1214
|
|
|
1197
1215
|
The `everything()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -1341,9 +1359,12 @@ def first_n(n: int, offset: int = 0) -> FirstN:
|
|
|
1341
1359
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
1342
1360
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
1343
1361
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
1362
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
1363
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
1344
1364
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
1345
1365
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
1346
1366
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
1367
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
1347
1368
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
1348
1369
|
|
|
1349
1370
|
The `first_n()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -1497,9 +1518,12 @@ def last_n(n: int, offset: int = 0) -> LastN:
|
|
|
1497
1518
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
1498
1519
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
1499
1520
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
1521
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
1522
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
1500
1523
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
1501
1524
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
1502
1525
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
1526
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
1503
1527
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
1504
1528
|
|
|
1505
1529
|
The `last_n()` selector function doesn't need to be used in isolation. Read the next section for
|
pointblank/data/api-docs.txt
CHANGED
|
@@ -289,6 +289,16 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
|
|
|
289
289
|
- Vietnamese (`"vi"`)
|
|
290
290
|
- Indonesian (`"id"`)
|
|
291
291
|
- Ukrainian (`"uk"`)
|
|
292
|
+
- Bulgarian (`"bg"`)
|
|
293
|
+
- Croatian (`"hr"`)
|
|
294
|
+
- Estonian (`"et"`)
|
|
295
|
+
- Hungarian (`"hu"`)
|
|
296
|
+
- Irish (`"ga"`)
|
|
297
|
+
- Latvian (`"lv"`)
|
|
298
|
+
- Lithuanian (`"lt"`)
|
|
299
|
+
- Maltese (`"mt"`)
|
|
300
|
+
- Slovak (`"sk"`)
|
|
301
|
+
- Slovenian (`"sl"`)
|
|
292
302
|
- Hebrew (`"he"`)
|
|
293
303
|
- Thai (`"th"`)
|
|
294
304
|
- Persian (`"fa"`)
|
|
@@ -5392,6 +5402,247 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
|
|
|
5392
5402
|
failing validation step (the check for column `c`, which doesn't exist).
|
|
5393
5403
|
|
|
5394
5404
|
|
|
5405
|
+
col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', p: 'float', tol: 'Tolerance' = 0, thresholds: 'int | float | None | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5406
|
+
|
|
5407
|
+
Validate whether a column has a specific percentage of Null values.
|
|
5408
|
+
|
|
5409
|
+
The `col_pct_null()` validation method checks whether the percentage of Null values in a
|
|
5410
|
+
column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
|
|
5411
|
+
validation operates at the column level, generating a single validation step per column that
|
|
5412
|
+
passes or fails based on whether the actual percentage of Null values falls within the
|
|
5413
|
+
acceptable range defined by `p ± tol`.
|
|
5414
|
+
|
|
5415
|
+
Parameters
|
|
5416
|
+
----------
|
|
5417
|
+
columns
|
|
5418
|
+
A single column or a list of columns to validate. Can also use
|
|
5419
|
+
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
5420
|
+
multiple columns are supplied or resolved, there will be a separate validation step
|
|
5421
|
+
generated for each column.
|
|
5422
|
+
p
|
|
5423
|
+
The expected percentage of Null values in the column, expressed as a decimal between
|
|
5424
|
+
`0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
|
|
5425
|
+
tol
|
|
5426
|
+
The tolerance allowed when comparing the actual percentage of Null values to the
|
|
5427
|
+
expected percentage `p=`. The validation passes if the actual percentage falls within
|
|
5428
|
+
the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
|
|
5429
|
+
the *Tolerance* section for details on all supported formats (absolute, relative,
|
|
5430
|
+
symmetric, and asymmetric bounds).
|
|
5431
|
+
thresholds
|
|
5432
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5433
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
5434
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
5435
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
5436
|
+
section for information on how to set threshold levels.
|
|
5437
|
+
actions
|
|
5438
|
+
Optional actions to take when the validation step(s) meets or exceeds any set threshold
|
|
5439
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5440
|
+
define the actions.
|
|
5441
|
+
brief
|
|
5442
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5443
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5444
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5445
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5446
|
+
won't be a brief.
|
|
5447
|
+
active
|
|
5448
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5449
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5450
|
+
for the steps unchanged).
|
|
5451
|
+
|
|
5452
|
+
Returns
|
|
5453
|
+
-------
|
|
5454
|
+
Validate
|
|
5455
|
+
The `Validate` object with the added validation step.
|
|
5456
|
+
|
|
5457
|
+
Tolerance
|
|
5458
|
+
---------
|
|
5459
|
+
The `tol=` parameter accepts several different formats to specify the acceptable deviation
|
|
5460
|
+
from the expected percentage `p=`. The tolerance can be expressed as:
|
|
5461
|
+
|
|
5462
|
+
1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
|
|
5463
|
+
For example, `tol=2` means the actual count can differ from the expected count by up to 2
|
|
5464
|
+
units in either direction.
|
|
5465
|
+
|
|
5466
|
+
2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
|
|
5467
|
+
count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
|
|
5468
|
+
45 to 55 (50 ± 10% of 50 = 50 ± 5).
|
|
5469
|
+
|
|
5470
|
+
3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
|
|
5471
|
+
bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
|
|
5472
|
+
1 unit below or 3 units above the expected count.
|
|
5473
|
+
|
|
5474
|
+
4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
|
|
5475
|
+
and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
|
|
5476
|
+
lower bound is 5% below and the upper bound is 15% above the expected count.
|
|
5477
|
+
|
|
5478
|
+
When using a single value (integer or float), the tolerance is applied symmetrically in both
|
|
5479
|
+
directions. When using a tuple, you can specify asymmetric tolerances where the lower and
|
|
5480
|
+
upper bounds differ.
|
|
5481
|
+
|
|
5482
|
+
Thresholds
|
|
5483
|
+
----------
|
|
5484
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5485
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5486
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5487
|
+
|
|
5488
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
5489
|
+
can either be set as a proportion failing of all test units (a value between `0` and `1`),
|
|
5490
|
+
or, the absolute number of failing test units (as an integer that's `1` or greater).
|
|
5491
|
+
|
|
5492
|
+
Thresholds can be defined using one of these input schemes:
|
|
5493
|
+
|
|
5494
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5495
|
+
thresholds)
|
|
5496
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5497
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5498
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
5499
|
+
'critical'
|
|
5500
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5501
|
+
for the 'warning' level only
|
|
5502
|
+
|
|
5503
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
5504
|
+
marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
|
|
5505
|
+
set, you're free to set any combination of them.
|
|
5506
|
+
|
|
5507
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
5508
|
+
take for each level of failure (using the `actions=` parameter).
|
|
5509
|
+
|
|
5510
|
+
Examples
|
|
5511
|
+
--------
|
|
5512
|
+
For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
|
|
5513
|
+
and `c`) that have different percentages of Null values. The table is shown below:
|
|
5514
|
+
|
|
5515
|
+
```python
|
|
5516
|
+
import pointblank as pb
|
|
5517
|
+
import polars as pl
|
|
5518
|
+
|
|
5519
|
+
tbl = pl.DataFrame(
|
|
5520
|
+
{
|
|
5521
|
+
"a": [1, 2, 3, 4, 5, 6, 7, 8],
|
|
5522
|
+
"b": [1, None, 3, None, 5, None, 7, None],
|
|
5523
|
+
"c": [None, None, None, None, None, None, 1, 2],
|
|
5524
|
+
}
|
|
5525
|
+
)
|
|
5526
|
+
|
|
5527
|
+
pb.preview(tbl)
|
|
5528
|
+
```
|
|
5529
|
+
|
|
5530
|
+
Let's validate that column `a` has 0% Null values (i.e., no Null values at all).
|
|
5531
|
+
|
|
5532
|
+
```python
|
|
5533
|
+
validation = (
|
|
5534
|
+
pb.Validate(data=tbl)
|
|
5535
|
+
.col_pct_null(columns="a", p=0.0)
|
|
5536
|
+
.interrogate()
|
|
5537
|
+
)
|
|
5538
|
+
|
|
5539
|
+
validation
|
|
5540
|
+
```
|
|
5541
|
+
|
|
5542
|
+
Printing the `validation` object shows the validation table in an HTML viewing environment.
|
|
5543
|
+
The validation table shows the single entry that corresponds to the validation step created
|
|
5544
|
+
by using `col_pct_null()`. The validation passed since column `a` has no Null values.
|
|
5545
|
+
|
|
5546
|
+
Now, let's check that column `b` has exactly 50% Null values.
|
|
5547
|
+
|
|
5548
|
+
```python
|
|
5549
|
+
validation = (
|
|
5550
|
+
pb.Validate(data=tbl)
|
|
5551
|
+
.col_pct_null(columns="b", p=0.5)
|
|
5552
|
+
.interrogate()
|
|
5553
|
+
)
|
|
5554
|
+
|
|
5555
|
+
validation
|
|
5556
|
+
```
|
|
5557
|
+
|
|
5558
|
+
This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
|
|
5559
|
+
|
|
5560
|
+
Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
|
|
5561
|
+
we'll check if it's approximately 70% Null with a tolerance of 10%.
|
|
5562
|
+
|
|
5563
|
+
```python
|
|
5564
|
+
validation = (
|
|
5565
|
+
pb.Validate(data=tbl)
|
|
5566
|
+
.col_pct_null(columns="c", p=0.70, tol=0.10)
|
|
5567
|
+
.interrogate()
|
|
5568
|
+
)
|
|
5569
|
+
|
|
5570
|
+
validation
|
|
5571
|
+
```
|
|
5572
|
+
|
|
5573
|
+
This validation passes because the actual percentage (75%) falls within the acceptable
|
|
5574
|
+
range of 60% to 80% (70% ± 10%).
|
|
5575
|
+
|
|
5576
|
+
The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
|
|
5577
|
+
different ways to specify tolerance using column `b`, which has exactly 50% Null values
|
|
5578
|
+
(4 out of 8 values).
|
|
5579
|
+
|
|
5580
|
+
*Using an absolute tolerance (integer)*: Specify the exact number of rows that can
|
|
5581
|
+
deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
|
|
5582
|
+
|
|
5583
|
+
```python
|
|
5584
|
+
validation = (
|
|
5585
|
+
pb.Validate(data=tbl)
|
|
5586
|
+
.col_pct_null(columns="b", p=0.375, tol=1) # Expect 3 nulls, allow ±1 (range: 2-4)
|
|
5587
|
+
.interrogate()
|
|
5588
|
+
)
|
|
5589
|
+
|
|
5590
|
+
validation
|
|
5591
|
+
```
|
|
5592
|
+
|
|
5593
|
+
This passes because column `b` has 4 Null values, which falls within the acceptable range
|
|
5594
|
+
of 2 to 4 (3 ± 1).
|
|
5595
|
+
|
|
5596
|
+
*Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
|
|
5597
|
+
expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
|
|
5598
|
+
|
|
5599
|
+
```python
|
|
5600
|
+
validation = (
|
|
5601
|
+
pb.Validate(data=tbl)
|
|
5602
|
+
.col_pct_null(columns="b", p=0.375, tol=0.25) # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
|
|
5603
|
+
.interrogate()
|
|
5604
|
+
)
|
|
5605
|
+
|
|
5606
|
+
validation
|
|
5607
|
+
```
|
|
5608
|
+
|
|
5609
|
+
This passes because 4 Null values fall within the acceptable range (3 ± 0.75 calculates
|
|
5610
|
+
to 2.25 to 3.75, which rounds down to 2 to 3 rows).
|
|
5611
|
+
|
|
5612
|
+
*Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
|
|
5613
|
+
upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
|
|
5614
|
+
to 2 rows above the expected count.
|
|
5615
|
+
|
|
5616
|
+
```python
|
|
5617
|
+
validation = (
|
|
5618
|
+
pb.Validate(data=tbl)
|
|
5619
|
+
.col_pct_null(columns="b", p=0.25, tol=(0, 2)) # Expect 2 Nulls, allow -0/+2 (range: 2-4)
|
|
5620
|
+
.interrogate()
|
|
5621
|
+
)
|
|
5622
|
+
|
|
5623
|
+
validation
|
|
5624
|
+
```
|
|
5625
|
+
|
|
5626
|
+
This passes because 4 Null values fall within the acceptable range of 2 to 4.
|
|
5627
|
+
|
|
5628
|
+
*Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
|
|
5629
|
+
bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
|
|
5630
|
+
expected count.
|
|
5631
|
+
|
|
5632
|
+
```python
|
|
5633
|
+
validation = (
|
|
5634
|
+
pb.Validate(data=tbl)
|
|
5635
|
+
.col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30%
|
|
5636
|
+
.interrogate()
|
|
5637
|
+
)
|
|
5638
|
+
|
|
5639
|
+
validation
|
|
5640
|
+
```
|
|
5641
|
+
|
|
5642
|
+
This passes because 4 Null values fall within the acceptable range (3 - 0.3 to 3 + 0.9
|
|
5643
|
+
calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
|
|
5644
|
+
|
|
5645
|
+
|
|
5395
5646
|
col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5396
5647
|
|
|
5397
5648
|
Do columns in the table (and their types) match a predefined schema?
|
|
@@ -6907,9 +7158,12 @@ col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals') -> 'Column | ColumnL
|
|
|
6907
7158
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
6908
7159
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
6909
7160
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
7161
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
7162
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
6910
7163
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
6911
7164
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
6912
7165
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
7166
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
6913
7167
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
6914
7168
|
|
|
6915
7169
|
If specifying a single column with certainty (you have the exact name), `col()` is not necessary
|
|
@@ -7191,9 +7445,12 @@ starts_with(text: 'str', case_sensitive: 'bool' = False) -> 'StartsWith'
|
|
|
7191
7445
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
7192
7446
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
7193
7447
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
7448
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
7449
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
7194
7450
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
7195
7451
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
7196
7452
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
7453
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
7197
7454
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
7198
7455
|
|
|
7199
7456
|
The `starts_with()` selector function doesn't need to be used in isolation. Read the next
|
|
@@ -7341,9 +7598,12 @@ ends_with(text: 'str', case_sensitive: 'bool' = False) -> 'EndsWith'
|
|
|
7341
7598
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
7342
7599
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
7343
7600
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
7601
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
7602
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
7344
7603
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
7345
7604
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
7346
7605
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
7606
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
7347
7607
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
7348
7608
|
|
|
7349
7609
|
The `ends_with()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -7492,9 +7752,12 @@ contains(text: 'str', case_sensitive: 'bool' = False) -> 'Contains'
|
|
|
7492
7752
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
7493
7753
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
7494
7754
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
7755
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
7756
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
7495
7757
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
7496
7758
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
7497
7759
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
7760
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
7498
7761
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
7499
7762
|
|
|
7500
7763
|
The `contains()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -7643,9 +7906,12 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
|
|
|
7643
7906
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
7644
7907
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
7645
7908
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
7909
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
7910
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
7646
7911
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
7647
7912
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
7648
7913
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
7914
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
7649
7915
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
7650
7916
|
|
|
7651
7917
|
The `matches()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -7776,9 +8042,12 @@ everything() -> 'Everything'
|
|
|
7776
8042
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
7777
8043
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
7778
8044
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
8045
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
8046
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
7779
8047
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
7780
8048
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
7781
8049
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
8050
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
7782
8051
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
7783
8052
|
|
|
7784
8053
|
The `everything()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -7919,9 +8188,12 @@ first_n(n: 'int', offset: 'int' = 0) -> 'FirstN'
|
|
|
7919
8188
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
7920
8189
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
7921
8190
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
8191
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
8192
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
7922
8193
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
7923
8194
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
7924
8195
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
8196
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
7925
8197
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
7926
8198
|
|
|
7927
8199
|
The `first_n()` selector function doesn't need to be used in isolation. Read the next section
|
|
@@ -8066,9 +8338,12 @@ last_n(n: 'int', offset: 'int' = 0) -> 'LastN'
|
|
|
8066
8338
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
8067
8339
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
8068
8340
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
8341
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
8342
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
8069
8343
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
8070
8344
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
8071
8345
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
8346
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
8072
8347
|
- [`col_exists()`](`pointblank.Validate.col_exists`)
|
|
8073
8348
|
|
|
8074
8349
|
The `last_n()` selector function doesn't need to be used in isolation. Read the next section for
|
|
@@ -8562,7 +8837,7 @@ set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str |
|
|
|
8562
8837
|
```
|
|
8563
8838
|
|
|
8564
8839
|
|
|
8565
|
-
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None) -> 'GT'
|
|
8840
|
+
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None, incl_footer_timings: 'bool' = None, incl_footer_notes: 'bool' = None) -> 'GT'
|
|
8566
8841
|
|
|
8567
8842
|
Validation report as a GT table.
|
|
8568
8843
|
|
|
@@ -8584,6 +8859,20 @@ get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool'
|
|
|
8584
8859
|
name of the table as the title for the report. If no title is wanted, then `":none:"`
|
|
8585
8860
|
can be used. Aside from keyword options, text can be provided for the title. This will
|
|
8586
8861
|
be interpreted as Markdown text and transformed internally to HTML.
|
|
8862
|
+
incl_header
|
|
8863
|
+
Controls whether the header section should be displayed. If `None`, uses the global
|
|
8864
|
+
configuration setting. The header contains the table name, label, and threshold
|
|
8865
|
+
information.
|
|
8866
|
+
incl_footer
|
|
8867
|
+
Controls whether the footer section should be displayed. If `None`, uses the global
|
|
8868
|
+
configuration setting. The footer can contain validation timing information and notes.
|
|
8869
|
+
incl_footer_timings
|
|
8870
|
+
Controls whether validation timing information (start time, duration, end time) should
|
|
8871
|
+
be displayed in the footer. If `None`, uses the global configuration setting. Only
|
|
8872
|
+
applies when `incl_footer=True`.
|
|
8873
|
+
incl_footer_notes
|
|
8874
|
+
Controls whether notes from validation steps should be displayed in the footer. If
|
|
8875
|
+
`None`, uses the global configuration setting. Only applies when `incl_footer=True`.
|
|
8587
8876
|
|
|
8588
8877
|
Returns
|
|
8589
8878
|
-------
|
|
@@ -8699,11 +8988,15 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
|
|
|
8699
8988
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
8700
8989
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
8701
8990
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
8991
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
8992
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
8702
8993
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
8703
8994
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
8704
8995
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
8996
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
8705
8997
|
- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
|
|
8706
8998
|
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
8999
|
+
- [`prompt()`](`pointblank.Validate.prompt`)
|
|
8707
9000
|
- [`rows_complete()`](`pointblank.Validate.rows_complete`)
|
|
8708
9001
|
|
|
8709
9002
|
The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
|
|
@@ -9040,11 +9333,15 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
|
|
|
9040
9333
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
9041
9334
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
9042
9335
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
9336
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
9337
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
9043
9338
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
9044
9339
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
9045
9340
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
9341
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
9046
9342
|
- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
|
|
9047
9343
|
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
9344
|
+
- [`prompt()`](`pointblank.Validate.prompt`)
|
|
9048
9345
|
|
|
9049
9346
|
An extracted row for these validation methods means that a test unit failed for that row in
|
|
9050
9347
|
the validation step.
|
|
@@ -11036,6 +11333,36 @@ connect_to_table(connection_string: 'str') -> 'Any'
|
|
|
11036
11333
|
pip install 'ibis-framework[duckdb]' # for DuckDB
|
|
11037
11334
|
pip install 'ibis-framework[postgres]' # for PostgreSQL
|
|
11038
11335
|
```
|
|
11336
|
+
See Also
|
|
11337
|
+
--------
|
|
11338
|
+
print_database_tables : List all available tables in a database for discovery
|
|
11339
|
+
|
|
11340
|
+
|
|
11341
|
+
print_database_tables(connection_string: 'str') -> 'list[str]'
|
|
11342
|
+
|
|
11343
|
+
List all tables in a database from a connection string.
|
|
11344
|
+
|
|
11345
|
+
The `print_database_tables()` function connects to a database and returns a list of all
|
|
11346
|
+
available tables. This is particularly useful for discovering what tables exist in a database
|
|
11347
|
+
before connecting to a specific table with `connect_to_table()`. The function automatically
|
|
11348
|
+
filters out temporary Ibis tables (memtables) to show only user tables. It supports all database
|
|
11349
|
+
backends available through Ibis, including DuckDB, SQLite, PostgreSQL, MySQL, BigQuery, and
|
|
11350
|
+
Snowflake.
|
|
11351
|
+
|
|
11352
|
+
Parameters
|
|
11353
|
+
----------
|
|
11354
|
+
connection_string
|
|
11355
|
+
A database connection string *without* the `::table_name` suffix. Example:
|
|
11356
|
+
`"duckdb:///path/to/database.ddb"`.
|
|
11357
|
+
|
|
11358
|
+
Returns
|
|
11359
|
+
-------
|
|
11360
|
+
list[str]
|
|
11361
|
+
List of table names, excluding temporary Ibis tables.
|
|
11362
|
+
|
|
11363
|
+
See Also
|
|
11364
|
+
--------
|
|
11365
|
+
connect_to_table : Connect to a database table with full connection string documentation
|
|
11039
11366
|
|
|
11040
11367
|
|
|
11041
11368
|
|
|
@@ -12238,7 +12565,7 @@ read_file(filepath: 'str | Path') -> 'Validate'
|
|
|
12238
12565
|
to disk for later retrieval with this function.
|
|
12239
12566
|
|
|
12240
12567
|
|
|
12241
|
-
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
12568
|
+
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, report_incl_footer_timings: 'bool' = True, report_incl_footer_notes: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
12242
12569
|
|
|
12243
12570
|
Configuration settings for the Pointblank library.
|
|
12244
12571
|
|
|
@@ -12250,7 +12577,13 @@ config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, pre
|
|
|
12250
12577
|
threshold levels (if set).
|
|
12251
12578
|
report_incl_footer
|
|
12252
12579
|
Should the footer of the validation table report be displayed? The footer contains the
|
|
12253
|
-
starting and ending times of the interrogation.
|
|
12580
|
+
starting and ending times of the interrogation and any notes added to validation steps.
|
|
12581
|
+
report_incl_footer_timings
|
|
12582
|
+
Controls whether the validation timing information (start time, duration, and end time)
|
|
12583
|
+
should be displayed in the footer. Only applies when `report_incl_footer=True`.
|
|
12584
|
+
report_incl_footer_notes
|
|
12585
|
+
Controls whether the notes from validation steps should be displayed in the footer. Only
|
|
12586
|
+
applies when `report_incl_footer=True`.
|
|
12254
12587
|
preview_incl_header
|
|
12255
12588
|
Whether the header should be present in any preview table (generated via the
|
|
12256
12589
|
[`preview()`](`pointblank.preview`) function).
|