pointblank 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +15 -1
- pointblank/_constants_translations.py +1302 -0
- pointblank/_interrogation.py +24 -0
- pointblank/_utils_llms_txt.py +1 -0
- pointblank/data/api-docs.txt +264 -3
- pointblank/validate.py +1350 -29
- pointblank/yaml.py +5 -2
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/METADATA +6 -1
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/RECORD +13 -13
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/WHEEL +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import functools
|
|
4
|
+
from collections.abc import Callable
|
|
4
5
|
from dataclasses import dataclass
|
|
5
6
|
from typing import Any
|
|
6
7
|
|
|
@@ -16,6 +17,7 @@ from pointblank._spec_utils import (
|
|
|
16
17
|
check_postal_code,
|
|
17
18
|
check_vin,
|
|
18
19
|
)
|
|
20
|
+
from pointblank._typing import AbsoluteBounds
|
|
19
21
|
from pointblank._utils import (
|
|
20
22
|
_column_test_prep,
|
|
21
23
|
_convert_to_narwhals,
|
|
@@ -745,6 +747,28 @@ def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> b
|
|
|
745
747
|
return row_count >= min_val and row_count <= max_val
|
|
746
748
|
|
|
747
749
|
|
|
750
|
+
def col_pct_null(
|
|
751
|
+
data_tbl: FrameT, column: str, p: float, bound_finder: Callable[[int], AbsoluteBounds]
|
|
752
|
+
) -> bool:
|
|
753
|
+
"""Check if the percentage of null vales are within p given the absolute bounds."""
|
|
754
|
+
# Convert to narwhals for consistent API across backends
|
|
755
|
+
nw_tbl = nw.from_native(data_tbl)
|
|
756
|
+
|
|
757
|
+
# Handle LazyFrames by collecting them first
|
|
758
|
+
if hasattr(nw_tbl, "collect"):
|
|
759
|
+
nw_tbl = nw_tbl.collect()
|
|
760
|
+
|
|
761
|
+
# Get total rows using narwhals
|
|
762
|
+
total_rows: int = nw_tbl.select(nw.len()).item()
|
|
763
|
+
abs_target: float = round(total_rows * p)
|
|
764
|
+
lower_bound, upper_bound = bound_finder(abs_target)
|
|
765
|
+
|
|
766
|
+
# Count null values
|
|
767
|
+
n_null: int = nw_tbl.select(nw.col(column).is_null().sum()).item()
|
|
768
|
+
|
|
769
|
+
return n_null >= (abs_target - lower_bound) and n_null <= (abs_target + upper_bound)
|
|
770
|
+
|
|
771
|
+
|
|
748
772
|
def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
|
|
749
773
|
"""
|
|
750
774
|
Check if DataFrame column count matches expected count.
|
pointblank/_utils_llms_txt.py
CHANGED
pointblank/data/api-docs.txt
CHANGED
|
@@ -5402,6 +5402,247 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
|
|
|
5402
5402
|
failing validation step (the check for column `c`, which doesn't exist).
|
|
5403
5403
|
|
|
5404
5404
|
|
|
5405
|
+
col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', p: 'float', tol: 'Tolerance' = 0, thresholds: 'int | float | None | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5406
|
+
|
|
5407
|
+
Validate whether a column has a specific percentage of Null values.
|
|
5408
|
+
|
|
5409
|
+
The `col_pct_null()` validation method checks whether the percentage of Null values in a
|
|
5410
|
+
column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
|
|
5411
|
+
validation operates at the column level, generating a single validation step per column that
|
|
5412
|
+
passes or fails based on whether the actual percentage of Null values falls within the
|
|
5413
|
+
acceptable range defined by `p ± tol`.
|
|
5414
|
+
|
|
5415
|
+
Parameters
|
|
5416
|
+
----------
|
|
5417
|
+
columns
|
|
5418
|
+
A single column or a list of columns to validate. Can also use
|
|
5419
|
+
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
5420
|
+
multiple columns are supplied or resolved, there will be a separate validation step
|
|
5421
|
+
generated for each column.
|
|
5422
|
+
p
|
|
5423
|
+
The expected percentage of Null values in the column, expressed as a decimal between
|
|
5424
|
+
`0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
|
|
5425
|
+
tol
|
|
5426
|
+
The tolerance allowed when comparing the actual percentage of Null values to the
|
|
5427
|
+
expected percentage `p=`. The validation passes if the actual percentage falls within
|
|
5428
|
+
the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
|
|
5429
|
+
the *Tolerance* section for details on all supported formats (absolute, relative,
|
|
5430
|
+
symmetric, and asymmetric bounds).
|
|
5431
|
+
thresholds
|
|
5432
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5433
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
5434
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
5435
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
5436
|
+
section for information on how to set threshold levels.
|
|
5437
|
+
actions
|
|
5438
|
+
Optional actions to take when the validation step(s) meets or exceeds any set threshold
|
|
5439
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5440
|
+
define the actions.
|
|
5441
|
+
brief
|
|
5442
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5443
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
5444
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5445
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5446
|
+
won't be a brief.
|
|
5447
|
+
active
|
|
5448
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5449
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5450
|
+
for the steps unchanged).
|
|
5451
|
+
|
|
5452
|
+
Returns
|
|
5453
|
+
-------
|
|
5454
|
+
Validate
|
|
5455
|
+
The `Validate` object with the added validation step.
|
|
5456
|
+
|
|
5457
|
+
Tolerance
|
|
5458
|
+
---------
|
|
5459
|
+
The `tol=` parameter accepts several different formats to specify the acceptable deviation
|
|
5460
|
+
from the expected percentage `p=`. The tolerance can be expressed as:
|
|
5461
|
+
|
|
5462
|
+
1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
|
|
5463
|
+
For example, `tol=2` means the actual count can differ from the expected count by up to 2
|
|
5464
|
+
units in either direction.
|
|
5465
|
+
|
|
5466
|
+
2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
|
|
5467
|
+
count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
|
|
5468
|
+
45 to 55 (50 ± 10% of 50 = 50 ± 5).
|
|
5469
|
+
|
|
5470
|
+
3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
|
|
5471
|
+
bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
|
|
5472
|
+
1 unit below or 3 units above the expected count.
|
|
5473
|
+
|
|
5474
|
+
4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
|
|
5475
|
+
and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
|
|
5476
|
+
lower bound is 5% below and the upper bound is 15% above the expected count.
|
|
5477
|
+
|
|
5478
|
+
When using a single value (integer or float), the tolerance is applied symmetrically in both
|
|
5479
|
+
directions. When using a tuple, you can specify asymmetric tolerances where the lower and
|
|
5480
|
+
upper bounds differ.
|
|
5481
|
+
|
|
5482
|
+
Thresholds
|
|
5483
|
+
----------
|
|
5484
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5485
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5486
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5487
|
+
|
|
5488
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
5489
|
+
can either be set as a proportion failing of all test units (a value between `0` and `1`),
|
|
5490
|
+
or, the absolute number of failing test units (as an integer that's `1` or greater).
|
|
5491
|
+
|
|
5492
|
+
Thresholds can be defined using one of these input schemes:
|
|
5493
|
+
|
|
5494
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5495
|
+
thresholds)
|
|
5496
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5497
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5498
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
5499
|
+
'critical'
|
|
5500
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
5501
|
+
for the 'warning' level only
|
|
5502
|
+
|
|
5503
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
5504
|
+
marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
|
|
5505
|
+
set; you're free to set any combination of them.
|
|
5506
|
+
|
|
5507
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
5508
|
+
take for each level of failure (using the `actions=` parameter).
|
|
5509
|
+
|
|
5510
|
+
Examples
|
|
5511
|
+
--------
|
|
5512
|
+
For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
|
|
5513
|
+
and `c`) that have different percentages of Null values. The table is shown below:
|
|
5514
|
+
|
|
5515
|
+
```python
|
|
5516
|
+
import pointblank as pb
|
|
5517
|
+
import polars as pl
|
|
5518
|
+
|
|
5519
|
+
tbl = pl.DataFrame(
|
|
5520
|
+
{
|
|
5521
|
+
"a": [1, 2, 3, 4, 5, 6, 7, 8],
|
|
5522
|
+
"b": [1, None, 3, None, 5, None, 7, None],
|
|
5523
|
+
"c": [None, None, None, None, None, None, 1, 2],
|
|
5524
|
+
}
|
|
5525
|
+
)
|
|
5526
|
+
|
|
5527
|
+
pb.preview(tbl)
|
|
5528
|
+
```
|
|
5529
|
+
|
|
5530
|
+
Let's validate that column `a` has 0% Null values (i.e., no Null values at all).
|
|
5531
|
+
|
|
5532
|
+
```python
|
|
5533
|
+
validation = (
|
|
5534
|
+
pb.Validate(data=tbl)
|
|
5535
|
+
.col_pct_null(columns="a", p=0.0)
|
|
5536
|
+
.interrogate()
|
|
5537
|
+
)
|
|
5538
|
+
|
|
5539
|
+
validation
|
|
5540
|
+
```
|
|
5541
|
+
|
|
5542
|
+
Printing the `validation` object shows the validation table in an HTML viewing environment.
|
|
5543
|
+
The validation table shows the single entry that corresponds to the validation step created
|
|
5544
|
+
by using `col_pct_null()`. The validation passed since column `a` has no Null values.
|
|
5545
|
+
|
|
5546
|
+
Now, let's check that column `b` has exactly 50% Null values.
|
|
5547
|
+
|
|
5548
|
+
```python
|
|
5549
|
+
validation = (
|
|
5550
|
+
pb.Validate(data=tbl)
|
|
5551
|
+
.col_pct_null(columns="b", p=0.5)
|
|
5552
|
+
.interrogate()
|
|
5553
|
+
)
|
|
5554
|
+
|
|
5555
|
+
validation
|
|
5556
|
+
```
|
|
5557
|
+
|
|
5558
|
+
This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
|
|
5559
|
+
|
|
5560
|
+
Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
|
|
5561
|
+
we'll check if it's approximately 70% Null with a tolerance of 10%.
|
|
5562
|
+
|
|
5563
|
+
```python
|
|
5564
|
+
validation = (
|
|
5565
|
+
pb.Validate(data=tbl)
|
|
5566
|
+
.col_pct_null(columns="c", p=0.70, tol=0.10)
|
|
5567
|
+
.interrogate()
|
|
5568
|
+
)
|
|
5569
|
+
|
|
5570
|
+
validation
|
|
5571
|
+
```
|
|
5572
|
+
|
|
5573
|
+
This validation passes because the actual percentage (75%) falls within the acceptable
|
|
5574
|
+
range of 60% to 80% (70% ± 10%).
|
|
5575
|
+
|
|
5576
|
+
The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
|
|
5577
|
+
different ways to specify tolerance using column `b`, which has exactly 50% Null values
|
|
5578
|
+
(4 out of 8 values).
|
|
5579
|
+
|
|
5580
|
+
*Using an absolute tolerance (integer)*: Specify the exact number of rows that can
|
|
5581
|
+
deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
|
|
5582
|
+
|
|
5583
|
+
```python
|
|
5584
|
+
validation = (
|
|
5585
|
+
pb.Validate(data=tbl)
|
|
5586
|
+
.col_pct_null(columns="b", p=0.375, tol=1) # Expect 3 nulls, allow ±1 (range: 2-4)
|
|
5587
|
+
.interrogate()
|
|
5588
|
+
)
|
|
5589
|
+
|
|
5590
|
+
validation
|
|
5591
|
+
```
|
|
5592
|
+
|
|
5593
|
+
This passes because column `b` has 4 Null values, which falls within the acceptable range
|
|
5594
|
+
of 2 to 4 (3 ± 1).
|
|
5595
|
+
|
|
5596
|
+
*Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
|
|
5597
|
+
expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
|
|
5598
|
+
|
|
5599
|
+
```python
|
|
5600
|
+
validation = (
|
|
5601
|
+
pb.Validate(data=tbl)
|
|
5602
|
+
.col_pct_null(columns="b", p=0.375, tol=0.25) # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
|
|
5603
|
+
.interrogate()
|
|
5604
|
+
)
|
|
5605
|
+
|
|
5606
|
+
validation
|
|
5607
|
+
```
|
|
5608
|
+
|
|
5609
|
+
This passes because 4 Null values fall within the acceptable range (3 ± 0.75 calculates
|
|
5610
|
+
to 2.25 to 3.75, which rounds down to 2 to 3 rows).
|
|
5611
|
+
|
|
5612
|
+
*Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
|
|
5613
|
+
upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
|
|
5614
|
+
to 2 rows above the expected count.
|
|
5615
|
+
|
|
5616
|
+
```python
|
|
5617
|
+
validation = (
|
|
5618
|
+
pb.Validate(data=tbl)
|
|
5619
|
+
.col_pct_null(columns="b", p=0.25, tol=(0, 2)) # Expect 2 Nulls, allow -0/+2 (range: 2-4)
|
|
5620
|
+
.interrogate()
|
|
5621
|
+
)
|
|
5622
|
+
|
|
5623
|
+
validation
|
|
5624
|
+
```
|
|
5625
|
+
|
|
5626
|
+
This passes because 4 Null values fall within the acceptable range of 2 to 4.
|
|
5627
|
+
|
|
5628
|
+
*Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
|
|
5629
|
+
bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
|
|
5630
|
+
expected count.
|
|
5631
|
+
|
|
5632
|
+
```python
|
|
5633
|
+
validation = (
|
|
5634
|
+
pb.Validate(data=tbl)
|
|
5635
|
+
.col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30%
|
|
5636
|
+
.interrogate()
|
|
5637
|
+
)
|
|
5638
|
+
|
|
5639
|
+
validation
|
|
5640
|
+
```
|
|
5641
|
+
|
|
5642
|
+
This passes because 4 Null values fall within the acceptable range (3 - 0.3 to 3 + 0.9
|
|
5643
|
+
calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
|
|
5644
|
+
|
|
5645
|
+
|
|
5405
5646
|
col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5406
5647
|
|
|
5407
5648
|
Do columns in the table (and their types) match a predefined schema?
|
|
@@ -8596,7 +8837,7 @@ set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str |
|
|
|
8596
8837
|
```
|
|
8597
8838
|
|
|
8598
8839
|
|
|
8599
|
-
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None) -> 'GT'
|
|
8840
|
+
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None, incl_footer_timings: 'bool' = None, incl_footer_notes: 'bool' = None) -> 'GT'
|
|
8600
8841
|
|
|
8601
8842
|
Validation report as a GT table.
|
|
8602
8843
|
|
|
@@ -8618,6 +8859,20 @@ get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool'
|
|
|
8618
8859
|
name of the table as the title for the report. If no title is wanted, then `":none:"`
|
|
8619
8860
|
can be used. Aside from keyword options, text can be provided for the title. This will
|
|
8620
8861
|
be interpreted as Markdown text and transformed internally to HTML.
|
|
8862
|
+
incl_header
|
|
8863
|
+
Controls whether the header section should be displayed. If `None`, uses the global
|
|
8864
|
+
configuration setting. The header contains the table name, label, and threshold
|
|
8865
|
+
information.
|
|
8866
|
+
incl_footer
|
|
8867
|
+
Controls whether the footer section should be displayed. If `None`, uses the global
|
|
8868
|
+
configuration setting. The footer can contain validation timing information and notes.
|
|
8869
|
+
incl_footer_timings
|
|
8870
|
+
Controls whether validation timing information (start time, duration, end time) should
|
|
8871
|
+
be displayed in the footer. If `None`, uses the global configuration setting. Only
|
|
8872
|
+
applies when `incl_footer=True`.
|
|
8873
|
+
incl_footer_notes
|
|
8874
|
+
Controls whether notes from validation steps should be displayed in the footer. If
|
|
8875
|
+
`None`, uses the global configuration setting. Only applies when `incl_footer=True`.
|
|
8621
8876
|
|
|
8622
8877
|
Returns
|
|
8623
8878
|
-------
|
|
@@ -12310,7 +12565,7 @@ read_file(filepath: 'str | Path') -> 'Validate'
|
|
|
12310
12565
|
to disk for later retrieval with this function.
|
|
12311
12566
|
|
|
12312
12567
|
|
|
12313
|
-
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
12568
|
+
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, report_incl_footer_timings: 'bool' = True, report_incl_footer_notes: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
12314
12569
|
|
|
12315
12570
|
Configuration settings for the Pointblank library.
|
|
12316
12571
|
|
|
@@ -12322,7 +12577,13 @@ config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, pre
|
|
|
12322
12577
|
threshold levels (if set).
|
|
12323
12578
|
report_incl_footer
|
|
12324
12579
|
Should the footer of the validation table report be displayed? The footer contains the
|
|
12325
|
-
starting and ending times of the interrogation.
|
|
12580
|
+
starting and ending times of the interrogation and any notes added to validation steps.
|
|
12581
|
+
report_incl_footer_timings
|
|
12582
|
+
Controls whether the validation timing information (start time, duration, and end time)
|
|
12583
|
+
should be displayed in the footer. Only applies when `report_incl_footer=True`.
|
|
12584
|
+
report_incl_footer_notes
|
|
12585
|
+
Controls whether the notes from validation steps should be displayed in the footer. Only
|
|
12586
|
+
applies when `report_incl_footer=True`.
|
|
12326
12587
|
preview_incl_header
|
|
12327
12588
|
Whether the header should be present in any preview table (generated via the
|
|
12328
12589
|
[`preview()`](`pointblank.preview`) function).
|