pointblank 0.9.4__py3-none-any.whl → 0.9.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +2 -0
- pointblank/_utils.py +2 -0
- pointblank/actions.py +3 -3
- pointblank/column.py +4 -4
- pointblank/data/api-docs.txt +271 -18
- pointblank/schema.py +8 -1
- pointblank/thresholds.py +2 -2
- pointblank/validate.py +338 -4
- {pointblank-0.9.4.dist-info → pointblank-0.9.6.dist-info}/METADATA +7 -3
- {pointblank-0.9.4.dist-info → pointblank-0.9.6.dist-info}/RECORD +13 -13
- {pointblank-0.9.4.dist-info → pointblank-0.9.6.dist-info}/WHEEL +1 -1
- {pointblank-0.9.4.dist-info → pointblank-0.9.6.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.4.dist-info → pointblank-0.9.6.dist-info}/top_level.txt +0 -0
pointblank/_constants.py
CHANGED
|
@@ -109,6 +109,7 @@ ROW_BASED_VALIDATION_TYPES = [
|
|
|
109
109
|
]
|
|
110
110
|
|
|
111
111
|
IBIS_BACKENDS = [
|
|
112
|
+
"bigquery",
|
|
112
113
|
"databricks",
|
|
113
114
|
"duckdb",
|
|
114
115
|
"memtable",
|
|
@@ -165,6 +166,7 @@ TABLE_TYPE_STYLES = {
|
|
|
165
166
|
"parquet": {"background": "#3F9FF9", "text": "#FFFFFF", "label": "Parquet"},
|
|
166
167
|
"memtable": {"background": "#2C3E50", "text": "#FFFFFF", "label": "Ibis memtable"},
|
|
167
168
|
"mssql": {"background": "#E2E2E2", "text": "#222222", "label": "MSSQL"},
|
|
169
|
+
"bigquery": {"background": "#4285F4", "text": "#FFFFFF", "label": "BigQuery"},
|
|
168
170
|
"pyspark": {"background": "#E66F21", "text": "#FFFFFF", "label": "Spark DataFrame"},
|
|
169
171
|
"databricks": {"background": "#FF3621", "text": "#FFFFFF", "label": "Databricks"},
|
|
170
172
|
}
|
pointblank/_utils.py
CHANGED
|
@@ -514,6 +514,8 @@ def _get_api_text() -> str:
|
|
|
514
514
|
"Validate.get_data_extracts",
|
|
515
515
|
"Validate.all_passed",
|
|
516
516
|
"Validate.assert_passing",
|
|
517
|
+
"Validate.assert_below_threshold",
|
|
518
|
+
"Validate.above_threshold",
|
|
517
519
|
"Validate.n",
|
|
518
520
|
"Validate.n_passed",
|
|
519
521
|
"Validate.n_failed",
|
pointblank/actions.py
CHANGED
|
@@ -216,7 +216,7 @@ def send_slack_notification(
|
|
|
216
216
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
217
217
|
actions=pb.Actions(critical=notify_slack),
|
|
218
218
|
)
|
|
219
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
219
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
220
220
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
221
221
|
.col_vals_gt(columns="session_duration", value=15)
|
|
222
222
|
.interrogate()
|
|
@@ -248,7 +248,7 @@ def send_slack_notification(
|
|
|
248
248
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
249
249
|
final_actions=pb.FinalActions(notify_slack),
|
|
250
250
|
)
|
|
251
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
251
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
252
252
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
253
253
|
.col_vals_gt(columns="session_duration", value=15)
|
|
254
254
|
.interrogate()
|
|
@@ -316,7 +316,7 @@ def send_slack_notification(
|
|
|
316
316
|
actions=pb.Actions(default=notify_slack),
|
|
317
317
|
final_actions=pb.FinalActions(notify_slack),
|
|
318
318
|
)
|
|
319
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
319
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
320
320
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
321
321
|
.col_vals_gt(columns="session_duration", value=15)
|
|
322
322
|
.interrogate()
|
pointblank/column.py
CHANGED
|
@@ -1007,7 +1007,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1007
1007
|
`[rev_01, rev_02, profit_01, profit_02, age]`
|
|
1008
1008
|
|
|
1009
1009
|
and you want to validate columns that have two digits at the end of the name, you can use
|
|
1010
|
-
`columns=matches(r"
|
|
1010
|
+
`columns=matches(r"[0-9]{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
|
|
1011
1011
|
`profit_02` columns.
|
|
1012
1012
|
|
|
1013
1013
|
There will be a validation step created for every resolved column. Note that if there aren't any
|
|
@@ -1061,7 +1061,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1061
1061
|
[`col()`](`pointblank.col`) function, like this:
|
|
1062
1062
|
|
|
1063
1063
|
```python
|
|
1064
|
-
col(matches(r"
|
|
1064
|
+
col(matches(r"^[0-9]{5}") & ends_with("_id"))
|
|
1065
1065
|
```
|
|
1066
1066
|
|
|
1067
1067
|
There are four operators that can be used to compose column selectors:
|
|
@@ -1107,7 +1107,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1107
1107
|
|
|
1108
1108
|
validation = (
|
|
1109
1109
|
pb.Validate(data=tbl)
|
|
1110
|
-
.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID
|
|
1110
|
+
.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID[0-9]{4}")
|
|
1111
1111
|
.interrogate()
|
|
1112
1112
|
)
|
|
1113
1113
|
|
|
@@ -1115,7 +1115,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1115
1115
|
```
|
|
1116
1116
|
|
|
1117
1117
|
From the results of the validation table we get two validation steps, one for `id_old` and one
|
|
1118
|
-
for `new_identifier`. The values in both columns all match the pattern `"ID
|
|
1118
|
+
for `new_identifier`. The values in both columns all match the pattern `"ID[0-9]{4}"`.
|
|
1119
1119
|
|
|
1120
1120
|
We can also use the `matches()` function in combination with other column selectors (within
|
|
1121
1121
|
[`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select
|
pointblank/data/api-docs.txt
CHANGED
|
@@ -107,6 +107,11 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
|
|
|
107
107
|
- MySQL table (`"mysql"`)*
|
|
108
108
|
- PostgreSQL table (`"postgresql"`)*
|
|
109
109
|
- SQLite table (`"sqlite"`)*
|
|
110
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
111
|
+
- Snowflake table (`"snowflake"`)*
|
|
112
|
+
- Databricks table (`"databricks"`)*
|
|
113
|
+
- PySpark table (`"pyspark"`)*
|
|
114
|
+
- BigQuery table (`"bigquery"`)*
|
|
110
115
|
- Parquet table (`"parquet"`)*
|
|
111
116
|
|
|
112
117
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -580,7 +585,7 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: '
|
|
|
580
585
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
581
586
|
actions=pb.Actions(critical="Major data quality issue found in step {step}."),
|
|
582
587
|
)
|
|
583
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
588
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
584
589
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
585
590
|
.col_vals_gt(columns="session_duration", value=15)
|
|
586
591
|
.interrogate()
|
|
@@ -610,7 +615,7 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: '
|
|
|
610
615
|
data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"),
|
|
611
616
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
612
617
|
)
|
|
613
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
618
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
614
619
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
615
620
|
.col_vals_gt(
|
|
616
621
|
columns="session_duration",
|
|
@@ -6231,7 +6236,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
|
|
|
6231
6236
|
`[rev_01, rev_02, profit_01, profit_02, age]`
|
|
6232
6237
|
|
|
6233
6238
|
and you want to validate columns that have two digits at the end of the name, you can use
|
|
6234
|
-
`columns=matches(r"
|
|
6239
|
+
`columns=matches(r"[0-9]{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
|
|
6235
6240
|
`profit_02` columns.
|
|
6236
6241
|
|
|
6237
6242
|
There will be a validation step created for every resolved column. Note that if there aren't any
|
|
@@ -6285,7 +6290,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
|
|
|
6285
6290
|
[`col()`](`pointblank.col`) function, like this:
|
|
6286
6291
|
|
|
6287
6292
|
```python
|
|
6288
|
-
col(matches(r"
|
|
6293
|
+
col(matches(r"^[0-9]{5}") & ends_with("_id"))
|
|
6289
6294
|
```
|
|
6290
6295
|
|
|
6291
6296
|
There are four operators that can be used to compose column selectors:
|
|
@@ -6324,7 +6329,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
|
|
|
6324
6329
|
|
|
6325
6330
|
validation = (
|
|
6326
6331
|
pb.Validate(data=tbl)
|
|
6327
|
-
.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID
|
|
6332
|
+
.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID[0-9]{4}")
|
|
6328
6333
|
.interrogate()
|
|
6329
6334
|
)
|
|
6330
6335
|
|
|
@@ -6332,7 +6337,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
|
|
|
6332
6337
|
```
|
|
6333
6338
|
|
|
6334
6339
|
From the results of the validation table we get two validation steps, one for `id_old` and one
|
|
6335
|
-
for `new_identifier`. The values in both columns all match the pattern `"ID
|
|
6340
|
+
for `new_identifier`. The values in both columns all match the pattern `"ID[0-9]{4}"`.
|
|
6336
6341
|
|
|
6337
6342
|
We can also use the `matches()` function in combination with other column selectors (within
|
|
6338
6343
|
[`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select
|
|
@@ -6875,7 +6880,7 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
|
|
|
6875
6880
|
|
|
6876
6881
|
After interrogation is complete, the `Validate` object will have gathered information, and
|
|
6877
6882
|
we can use methods like [`n_passed()`](`pointblank.Validate.n_passed`),
|
|
6878
|
-
[`f_failed()`](`pointblank.Validate.f_failed`)
|
|
6883
|
+
[`f_failed()`](`pointblank.Validate.f_failed`), etc., to understand how the table performed
|
|
6879
6884
|
against the validation plan. A visual representation of the validation results can be viewed
|
|
6880
6885
|
by printing the `Validate` object; this will display the validation table in an HTML viewing
|
|
6881
6886
|
environment.
|
|
@@ -7578,6 +7583,10 @@ assert_passing(self) -> 'None'
|
|
|
7578
7583
|
assertion made is printed in the `AssertionError` message if a failure occurs, ensuring
|
|
7579
7584
|
some details are preserved.
|
|
7580
7585
|
|
|
7586
|
+
If the validation has not yet been interrogated, this method will automatically call
|
|
7587
|
+
[`interrogate()`](`pointblank.Validate.interrogate`) with default parameters before checking
|
|
7588
|
+
for passing tests.
|
|
7589
|
+
|
|
7581
7590
|
Raises
|
|
7582
7591
|
-------
|
|
7583
7592
|
AssertionError
|
|
@@ -7587,8 +7596,9 @@ assert_passing(self) -> 'None'
|
|
|
7587
7596
|
--------
|
|
7588
7597
|
In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
|
|
7589
7598
|
`c`). There will be three validation steps, and the second step will have a failing test
|
|
7590
|
-
unit (the value `10` isn't less than `9`).
|
|
7591
|
-
|
|
7599
|
+
unit (the value `10` isn't less than `9`). The `assert_passing()` method is used to assert
|
|
7600
|
+
that all validation steps passed perfectly, automatically performing the interrogation if
|
|
7601
|
+
needed.
|
|
7592
7602
|
|
|
7593
7603
|
```python
|
|
7594
7604
|
#| error: True
|
|
@@ -7609,13 +7619,221 @@ assert_passing(self) -> 'None'
|
|
|
7609
7619
|
.col_vals_gt(columns="a", value=0)
|
|
7610
7620
|
.col_vals_lt(columns="b", value=9) # this assertion is false
|
|
7611
7621
|
.col_vals_in_set(columns="c", set=["a", "b"])
|
|
7612
|
-
.interrogate()
|
|
7613
7622
|
)
|
|
7614
7623
|
|
|
7624
|
+
# No need to call `interrogate()` explicitly
|
|
7615
7625
|
validation.assert_passing()
|
|
7616
7626
|
```
|
|
7617
7627
|
|
|
7618
7628
|
|
|
7629
|
+
assert_below_threshold(self, level: 'str' = 'warning', i: 'int | None' = None, message: 'str | None' = None) -> 'None'
|
|
7630
|
+
|
|
7631
|
+
Raise an `AssertionError` if validation steps exceed a specified threshold level.
|
|
7632
|
+
|
|
7633
|
+
The `assert_below_threshold()` method checks whether validation steps' failure rates are
|
|
7634
|
+
below a given threshold level (`"warning"`, `"error"`, or `"critical"`). This is
|
|
7635
|
+
particularly useful in automated testing environments where you want to ensure your data
|
|
7636
|
+
quality meets minimum standards before proceeding.
|
|
7637
|
+
|
|
7638
|
+
If any validation step exceeds the specified threshold level, an `AssertionError` will be
|
|
7639
|
+
raised with details about which steps failed. If the validation has not yet been
|
|
7640
|
+
interrogated, this method will automatically call
|
|
7641
|
+
[`interrogate()`](`pointblank.Validate.interrogate`) with default parameters.
|
|
7642
|
+
|
|
7643
|
+
Parameters
|
|
7644
|
+
----------
|
|
7645
|
+
level
|
|
7646
|
+
The threshold level to check against, which could be any of `"warning"` (the default),
|
|
7647
|
+
`"error"`, or `"critical"`. An `AssertionError` will be raised if any validation step
|
|
7648
|
+
exceeds this level.
|
|
7649
|
+
i
|
|
7650
|
+
Specific validation step number(s) to check. Can be provided as a single integer or a
|
|
7651
|
+
list of integers. If `None` (the default), all steps are checked.
|
|
7652
|
+
message
|
|
7653
|
+
Custom error message to use if assertion fails. If `None`, a default message will be
|
|
7654
|
+
generated that lists the specific steps that exceeded the threshold.
|
|
7655
|
+
|
|
7656
|
+
Returns
|
|
7657
|
+
-------
|
|
7658
|
+
None
|
|
7659
|
+
|
|
7660
|
+
Raises
|
|
7661
|
+
------
|
|
7662
|
+
AssertionError
|
|
7663
|
+
If any specified validation step exceeds the given threshold level.
|
|
7664
|
+
ValueError
|
|
7665
|
+
If an invalid threshold level is provided.
|
|
7666
|
+
|
|
7667
|
+
Examples
|
|
7668
|
+
--------
|
|
7669
|
+
Below are some examples of how to use the `assert_below_threshold()` method. First, we'll
|
|
7670
|
+
create a simple Polars DataFrame with two columns (`a` and `b`).
|
|
7671
|
+
|
|
7672
|
+
```python
|
|
7673
|
+
import polars as pl
|
|
7674
|
+
|
|
7675
|
+
tbl = pl.DataFrame({
|
|
7676
|
+
"a": [7, 4, 9, 7, 12],
|
|
7677
|
+
"b": [9, 8, 10, 5, 10]
|
|
7678
|
+
})
|
|
7679
|
+
```
|
|
7680
|
+
|
|
7681
|
+
Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
|
|
7682
|
+
`critical=0.3`). After interrogating, we display the validation report table:
|
|
7683
|
+
|
|
7684
|
+
```python
|
|
7685
|
+
import pointblank as pb
|
|
7686
|
+
|
|
7687
|
+
validation = (
|
|
7688
|
+
pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
|
|
7689
|
+
.col_vals_gt(columns="a", value=5) # 1 failing test unit
|
|
7690
|
+
.col_vals_lt(columns="b", value=10) # 2 failing test units
|
|
7691
|
+
.interrogate()
|
|
7692
|
+
)
|
|
7693
|
+
|
|
7694
|
+
validation
|
|
7695
|
+
```
|
|
7696
|
+
|
|
7697
|
+
Using `assert_below_threshold(level="warning")` will raise an `AssertionError` if any step
|
|
7698
|
+
exceeds the 'warning' threshold.
|
|
7699
|
+
|
|
7700
|
+
Check a specific step against the 'critical' threshold using the `i=` parameter:
|
|
7701
|
+
|
|
7702
|
+
```python
|
|
7703
|
+
validation.assert_below_threshold(level="critical", i=1) # Won't raise an error
|
|
7704
|
+
```
|
|
7705
|
+
|
|
7706
|
+
As the first step is below the 'critical' threshold (it exceeds the 'warning' and 'error'
|
|
7707
|
+
thresholds), no error is raised and nothing is printed.
|
|
7708
|
+
|
|
7709
|
+
We can also provide a custom error message with the `message=` parameter. Let's try that
|
|
7710
|
+
here:
|
|
7711
|
+
|
|
7712
|
+
```python
|
|
7713
|
+
try:
|
|
7714
|
+
validation.assert_below_threshold(
|
|
7715
|
+
level="error",
|
|
7716
|
+
message="Data quality too low for processing!"
|
|
7717
|
+
)
|
|
7718
|
+
except AssertionError as e:
|
|
7719
|
+
print(f"Custom error: {e}")
|
|
7720
|
+
```
|
|
7721
|
+
|
|
7722
|
+
See Also
|
|
7723
|
+
--------
|
|
7724
|
+
- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
|
|
7725
|
+
step
|
|
7726
|
+
- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
|
|
7727
|
+
- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
|
|
7728
|
+
validation step
|
|
7729
|
+
- [`assert_passing()`](`pointblank.Validate.assert_passing`): assert all validations pass
|
|
7730
|
+
completely
|
|
7731
|
+
|
|
7732
|
+
|
|
7733
|
+
above_threshold(self, level: 'str' = 'warning', i: 'int | None' = None) -> 'bool'
|
|
7734
|
+
|
|
7735
|
+
Check if any validation steps exceed a specified threshold level.
|
|
7736
|
+
|
|
7737
|
+
The `above_threshold()` method checks whether validation steps exceed a given threshold
|
|
7738
|
+
level. This provides a non-exception-based alternative to
|
|
7739
|
+
[`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`) for conditional
|
|
7740
|
+
workflow control based on validation results.
|
|
7741
|
+
|
|
7742
|
+
This method is useful in scenarios where you want to check if any validation steps failed
|
|
7743
|
+
beyond a certain threshold without raising an exception, allowing for more flexible
|
|
7744
|
+
programmatic responses to validation issues.
|
|
7745
|
+
|
|
7746
|
+
Parameters
|
|
7747
|
+
----------
|
|
7748
|
+
level
|
|
7749
|
+
The threshold level to check against. Valid options are: `"warning"` (the least severe
|
|
7750
|
+
threshold level), `"error"` (the middle severity threshold level), and `"critical"` (the
|
|
7751
|
+
most severe threshold level). The default is `"warning"`.
|
|
7752
|
+
i
|
|
7753
|
+
Specific validation step number(s) to check. If a single integer, checks only that step.
|
|
7754
|
+
If a list of integers, checks all specified steps. If `None` (the default), checks all
|
|
7755
|
+
validation steps. Step numbers are 1-based (first step is `1`, not `0`).
|
|
7756
|
+
|
|
7757
|
+
Returns
|
|
7758
|
+
-------
|
|
7759
|
+
bool
|
|
7760
|
+
`True` if any of the specified validation steps exceed the given threshold level,
|
|
7761
|
+
`False` otherwise.
|
|
7762
|
+
|
|
7763
|
+
Raises
|
|
7764
|
+
------
|
|
7765
|
+
ValueError
|
|
7766
|
+
If an invalid threshold level is provided.
|
|
7767
|
+
|
|
7768
|
+
Examples
|
|
7769
|
+
--------
|
|
7770
|
+
Below are some examples of how to use the `above_threshold()` method. First, we'll create a
|
|
7771
|
+
simple Polars DataFrame with a single column (`values`).
|
|
7772
|
+
|
|
7773
|
+
Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
|
|
7774
|
+
`critical=0.3`). After interrogating, we display the validation report table:
|
|
7775
|
+
|
|
7776
|
+
```python
|
|
7777
|
+
import pointblank as pb
|
|
7778
|
+
|
|
7779
|
+
validation = (
|
|
7780
|
+
pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
|
|
7781
|
+
.col_vals_gt(columns="values", value=0)
|
|
7782
|
+
.col_vals_lt(columns="values", value=10)
|
|
7783
|
+
.col_vals_between(columns="values", left=0, right=5)
|
|
7784
|
+
.interrogate()
|
|
7785
|
+
)
|
|
7786
|
+
|
|
7787
|
+
validation
|
|
7788
|
+
```
|
|
7789
|
+
|
|
7790
|
+
Let's check if any steps exceed the 'warning' threshold with the `above_threshold()` method.
|
|
7791
|
+
A message will be printed if that's the case:
|
|
7792
|
+
|
|
7793
|
+
```python
|
|
7794
|
+
if validation.above_threshold(level="warning"):
|
|
7795
|
+
print("Some steps have exceeded the warning threshold")
|
|
7796
|
+
```
|
|
7797
|
+
|
|
7798
|
+
Check only steps 2 and 3 against the 'error' threshold through use of the `i=` argument:
|
|
7799
|
+
|
|
7800
|
+
```python
|
|
7801
|
+
if validation.above_threshold(level="error", i=[2, 3]):
|
|
7802
|
+
print("Steps 2 and/or 3 have exceeded the error threshold")
|
|
7803
|
+
```
|
|
7804
|
+
|
|
7805
|
+
You can use this in a workflow to conditionally trigger processes. Here's a snippet of how
|
|
7806
|
+
you might use this in a function:
|
|
7807
|
+
|
|
7808
|
+
```python
|
|
7809
|
+
def process_data(validation_obj):
|
|
7810
|
+
# Only continue processing if validation passes critical thresholds
|
|
7811
|
+
if not validation_obj.above_threshold(level="critical"):
|
|
7812
|
+
# Continue with processing
|
|
7813
|
+
print("Data meets critical quality thresholds, proceeding...")
|
|
7814
|
+
return True
|
|
7815
|
+
else:
|
|
7816
|
+
# Log failure and stop processing
|
|
7817
|
+
print("Data fails critical quality checks, aborting...")
|
|
7818
|
+
return False
|
|
7819
|
+
```
|
|
7820
|
+
|
|
7821
|
+
Note that this is just a suggestion for how to implement conditional workflow processes. You
|
|
7822
|
+
should adapt this pattern to your specific requirements, which might include different
|
|
7823
|
+
threshold levels, custom logging mechanisms, or integration with your organization's data
|
|
7824
|
+
pipelines and notification systems.
|
|
7825
|
+
|
|
7826
|
+
See Also
|
|
7827
|
+
--------
|
|
7828
|
+
- [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`): a similar
|
|
7829
|
+
method that raises an exception if thresholds are exceeded
|
|
7830
|
+
- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
|
|
7831
|
+
step
|
|
7832
|
+
- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
|
|
7833
|
+
- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
|
|
7834
|
+
validation step
|
|
7835
|
+
|
|
7836
|
+
|
|
7619
7837
|
n(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, int] | int'
|
|
7620
7838
|
|
|
7621
7839
|
Provides a dictionary of the number of test units for each validation step.
|
|
@@ -8504,6 +8722,11 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
|
|
|
8504
8722
|
- MySQL table (`"mysql"`)*
|
|
8505
8723
|
- PostgreSQL table (`"postgresql"`)*
|
|
8506
8724
|
- SQLite table (`"sqlite"`)*
|
|
8725
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
8726
|
+
- Snowflake table (`"snowflake"`)*
|
|
8727
|
+
- Databricks table (`"databricks"`)*
|
|
8728
|
+
- PySpark table (`"pyspark"`)*
|
|
8729
|
+
- BigQuery table (`"bigquery"`)*
|
|
8507
8730
|
- Parquet table (`"parquet"`)*
|
|
8508
8731
|
|
|
8509
8732
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -8672,6 +8895,11 @@ missing_vals_tbl(data: 'FrameT | Any') -> 'GT'
|
|
|
8672
8895
|
- MySQL table (`"mysql"`)*
|
|
8673
8896
|
- PostgreSQL table (`"postgresql"`)*
|
|
8674
8897
|
- SQLite table (`"sqlite"`)*
|
|
8898
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
8899
|
+
- Snowflake table (`"snowflake"`)*
|
|
8900
|
+
- Databricks table (`"databricks"`)*
|
|
8901
|
+
- PySpark table (`"pyspark"`)*
|
|
8902
|
+
- BigQuery table (`"bigquery"`)*
|
|
8675
8903
|
- Parquet table (`"parquet"`)*
|
|
8676
8904
|
|
|
8677
8905
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -8837,7 +9065,7 @@ assistant(model: 'str', data: 'FrameT | Any | None' = None, tbl_name: 'str | Non
|
|
|
8837
9065
|
Pandas DataFrame, the availability of Ibis is not needed.
|
|
8838
9066
|
|
|
8839
9067
|
|
|
8840
|
-
load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> 'FrameT | Any'
|
|
9068
|
+
load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> 'FrameT | Any'
|
|
8841
9069
|
|
|
8842
9070
|
Load a dataset hosted in the library as specified table type.
|
|
8843
9071
|
|
|
@@ -8851,7 +9079,7 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights']" = '
|
|
|
8851
9079
|
----------
|
|
8852
9080
|
dataset
|
|
8853
9081
|
The name of the dataset to load. Current options are `"small_table"`, `"game_revenue"`,
|
|
8854
|
-
and `"
|
|
9082
|
+
`"nycflights"`, and `"global_sales"`.
|
|
8855
9083
|
tbl_type
|
|
8856
9084
|
The type of table to generate from the dataset. The named options are `"polars"`,
|
|
8857
9085
|
`"pandas"`, and `"duckdb"`.
|
|
@@ -8873,6 +9101,8 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights']" = '
|
|
|
8873
9101
|
they purchased, ads viewed, and the revenue generated.
|
|
8874
9102
|
- `"nycflights"`: A dataset with 336,776 rows and 18 columns. This dataset provides information
|
|
8875
9103
|
about flights departing from New York City airports (JFK, LGA, or EWR) in 2013.
|
|
9104
|
+
- `"global_sales"`: A dataset with 50,000 rows and 20 columns. Provides information about
|
|
9105
|
+
global sales of products across different regions and countries.
|
|
8876
9106
|
|
|
8877
9107
|
Supported DataFrame Types
|
|
8878
9108
|
-------------------------
|
|
@@ -8884,10 +9114,10 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights']" = '
|
|
|
8884
9114
|
|
|
8885
9115
|
Examples
|
|
8886
9116
|
--------
|
|
8887
|
-
Load the `"small_table"` dataset as a Polars DataFrame by calling `load_dataset()` with
|
|
8888
|
-
|
|
9117
|
+
Load the `"small_table"` dataset as a Polars DataFrame by calling `load_dataset()` with
|
|
9118
|
+
`dataset="small_table"` and `tbl_type="polars"`:
|
|
8889
9119
|
|
|
8890
|
-
Note that the `"small_table"` dataset is a
|
|
9120
|
+
Note that the `"small_table"` dataset is a Polars DataFrame and using the
|
|
8891
9121
|
[`preview()`](`pointblank.preview`) function will display the table in an HTML viewing
|
|
8892
9122
|
environment.
|
|
8893
9123
|
|
|
@@ -8915,6 +9145,19 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights']" = '
|
|
|
8915
9145
|
The `"nycflights"` dataset is a large dataset with 336,776 rows and 18 columns. This dataset is
|
|
8916
9146
|
truly a real-world dataset and provides information about flights originating from New York City
|
|
8917
9147
|
airports in 2013.
|
|
9148
|
+
|
|
9149
|
+
Finally, the `"global_sales"` dataset can be loaded as a Polars table by specifying the dataset
|
|
9150
|
+
name. Since `tbl_type=` is set to `"polars"` by default, we don't need to specify it:
|
|
9151
|
+
|
|
9152
|
+
```python
|
|
9153
|
+
global_sales = pb.load_dataset(dataset="global_sales")
|
|
9154
|
+
|
|
9155
|
+
pb.preview(global_sales)
|
|
9156
|
+
```
|
|
9157
|
+
|
|
9158
|
+
The `"global_sales"` dataset is a large dataset with 50,000 rows and 20 columns. Each record
|
|
9159
|
+
describes the sales of a particular product to a customer located in one of three global
|
|
9160
|
+
regions: North America, Europe, or Asia.
|
|
8918
9161
|
|
|
8919
9162
|
|
|
8920
9163
|
|
|
@@ -8956,6 +9199,11 @@ get_column_count(data: 'FrameT | Any') -> 'int'
|
|
|
8956
9199
|
- MySQL table (`"mysql"`)*
|
|
8957
9200
|
- PostgreSQL table (`"postgresql"`)*
|
|
8958
9201
|
- SQLite table (`"sqlite"`)*
|
|
9202
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
9203
|
+
- Snowflake table (`"snowflake"`)*
|
|
9204
|
+
- Databricks table (`"databricks"`)*
|
|
9205
|
+
- PySpark table (`"pyspark"`)*
|
|
9206
|
+
- BigQuery table (`"bigquery"`)*
|
|
8959
9207
|
- Parquet table (`"parquet"`)*
|
|
8960
9208
|
|
|
8961
9209
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -9013,6 +9261,11 @@ get_row_count(data: 'FrameT | Any') -> 'int'
|
|
|
9013
9261
|
- MySQL table (`"mysql"`)*
|
|
9014
9262
|
- PostgreSQL table (`"postgresql"`)*
|
|
9015
9263
|
- SQLite table (`"sqlite"`)*
|
|
9264
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
9265
|
+
- Snowflake table (`"snowflake"`)*
|
|
9266
|
+
- Databricks table (`"databricks"`)*
|
|
9267
|
+
- PySpark table (`"pyspark"`)*
|
|
9268
|
+
- BigQuery table (`"bigquery"`)*
|
|
9016
9269
|
- Parquet table (`"parquet"`)*
|
|
9017
9270
|
|
|
9018
9271
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -9452,7 +9705,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
|
|
|
9452
9705
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
9453
9706
|
actions=pb.Actions(critical=notify_slack),
|
|
9454
9707
|
)
|
|
9455
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
9708
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
9456
9709
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
9457
9710
|
.col_vals_gt(columns="session_duration", value=15)
|
|
9458
9711
|
.interrogate()
|
|
@@ -9484,7 +9737,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
|
|
|
9484
9737
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
9485
9738
|
final_actions=pb.FinalActions(notify_slack),
|
|
9486
9739
|
)
|
|
9487
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
9740
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
9488
9741
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
9489
9742
|
.col_vals_gt(columns="session_duration", value=15)
|
|
9490
9743
|
.interrogate()
|
|
@@ -9552,7 +9805,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
|
|
|
9552
9805
|
actions=pb.Actions(default=notify_slack),
|
|
9553
9806
|
final_actions=pb.FinalActions(notify_slack),
|
|
9554
9807
|
)
|
|
9555
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
9808
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
9556
9809
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
9557
9810
|
.col_vals_gt(columns="session_duration", value=15)
|
|
9558
9811
|
.interrogate()
|
pointblank/schema.py
CHANGED
|
@@ -728,7 +728,14 @@ class Schema:
|
|
|
728
728
|
return new_schema
|
|
729
729
|
|
|
730
730
|
def __str__(self):
|
|
731
|
-
|
|
731
|
+
formatted_columns = []
|
|
732
|
+
for col in self.columns:
|
|
733
|
+
if len(col) == 1: # Only column name provided (no data type)
|
|
734
|
+
formatted_columns.append(f" {col[0]}: <ANY>")
|
|
735
|
+
else: # Both column name and data type provided
|
|
736
|
+
formatted_columns.append(f" {col[0]}: {col[1]}")
|
|
737
|
+
|
|
738
|
+
return "Pointblank Schema\n" + "\n".join(formatted_columns)
|
|
732
739
|
|
|
733
740
|
def __repr__(self):
|
|
734
741
|
return f"Schema(columns={self.columns})"
|
pointblank/thresholds.py
CHANGED
|
@@ -404,7 +404,7 @@ class Actions:
|
|
|
404
404
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
405
405
|
actions=pb.Actions(critical="Major data quality issue found in step {step}."),
|
|
406
406
|
)
|
|
407
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
407
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
408
408
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
409
409
|
.col_vals_gt(columns="session_duration", value=15)
|
|
410
410
|
.interrogate()
|
|
@@ -434,7 +434,7 @@ class Actions:
|
|
|
434
434
|
data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"),
|
|
435
435
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
436
436
|
)
|
|
437
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
437
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
438
438
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
439
439
|
.col_vals_gt(
|
|
440
440
|
columns="session_duration",
|
pointblank/validate.py
CHANGED
|
@@ -636,6 +636,11 @@ def preview(
|
|
|
636
636
|
- MySQL table (`"mysql"`)*
|
|
637
637
|
- PostgreSQL table (`"postgresql"`)*
|
|
638
638
|
- SQLite table (`"sqlite"`)*
|
|
639
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
640
|
+
- Snowflake table (`"snowflake"`)*
|
|
641
|
+
- Databricks table (`"databricks"`)*
|
|
642
|
+
- PySpark table (`"pyspark"`)*
|
|
643
|
+
- BigQuery table (`"bigquery"`)*
|
|
639
644
|
- Parquet table (`"parquet"`)*
|
|
640
645
|
|
|
641
646
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1134,6 +1139,11 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
|
|
|
1134
1139
|
- MySQL table (`"mysql"`)*
|
|
1135
1140
|
- PostgreSQL table (`"postgresql"`)*
|
|
1136
1141
|
- SQLite table (`"sqlite"`)*
|
|
1142
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
1143
|
+
- Snowflake table (`"snowflake"`)*
|
|
1144
|
+
- Databricks table (`"databricks"`)*
|
|
1145
|
+
- PySpark table (`"pyspark"`)*
|
|
1146
|
+
- BigQuery table (`"bigquery"`)*
|
|
1137
1147
|
- Parquet table (`"parquet"`)*
|
|
1138
1148
|
|
|
1139
1149
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1663,6 +1673,11 @@ def get_column_count(data: FrameT | Any) -> int:
|
|
|
1663
1673
|
- MySQL table (`"mysql"`)*
|
|
1664
1674
|
- PostgreSQL table (`"postgresql"`)*
|
|
1665
1675
|
- SQLite table (`"sqlite"`)*
|
|
1676
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
1677
|
+
- Snowflake table (`"snowflake"`)*
|
|
1678
|
+
- Databricks table (`"databricks"`)*
|
|
1679
|
+
- PySpark table (`"pyspark"`)*
|
|
1680
|
+
- BigQuery table (`"bigquery"`)*
|
|
1666
1681
|
- Parquet table (`"parquet"`)*
|
|
1667
1682
|
|
|
1668
1683
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1741,6 +1756,11 @@ def get_row_count(data: FrameT | Any) -> int:
|
|
|
1741
1756
|
- MySQL table (`"mysql"`)*
|
|
1742
1757
|
- PostgreSQL table (`"postgresql"`)*
|
|
1743
1758
|
- SQLite table (`"sqlite"`)*
|
|
1759
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
1760
|
+
- Snowflake table (`"snowflake"`)*
|
|
1761
|
+
- Databricks table (`"databricks"`)*
|
|
1762
|
+
- PySpark table (`"pyspark"`)*
|
|
1763
|
+
- BigQuery table (`"bigquery"`)*
|
|
1744
1764
|
- Parquet table (`"parquet"`)*
|
|
1745
1765
|
|
|
1746
1766
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -2007,6 +2027,11 @@ class Validate:
|
|
|
2007
2027
|
- MySQL table (`"mysql"`)*
|
|
2008
2028
|
- PostgreSQL table (`"postgresql"`)*
|
|
2009
2029
|
- SQLite table (`"sqlite"`)*
|
|
2030
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
2031
|
+
- Snowflake table (`"snowflake"`)*
|
|
2032
|
+
- Databricks table (`"databricks"`)*
|
|
2033
|
+
- PySpark table (`"pyspark"`)*
|
|
2034
|
+
- BigQuery table (`"bigquery"`)*
|
|
2010
2035
|
- Parquet table (`"parquet"`)*
|
|
2011
2036
|
|
|
2012
2037
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -8031,7 +8056,7 @@ class Validate:
|
|
|
8031
8056
|
|
|
8032
8057
|
After interrogation is complete, the `Validate` object will have gathered information, and
|
|
8033
8058
|
we can use methods like [`n_passed()`](`pointblank.Validate.n_passed`),
|
|
8034
|
-
[`f_failed()`](`pointblank.Validate.f_failed`)
|
|
8059
|
+
[`f_failed()`](`pointblank.Validate.f_failed`), etc., to understand how the table performed
|
|
8035
8060
|
against the validation plan. A visual representation of the validation results can be viewed
|
|
8036
8061
|
by printing the `Validate` object; this will display the validation table in an HTML viewing
|
|
8037
8062
|
environment.
|
|
@@ -8772,6 +8797,10 @@ class Validate:
|
|
|
8772
8797
|
assertion made is printed in the `AssertionError` message if a failure occurs, ensuring
|
|
8773
8798
|
some details are preserved.
|
|
8774
8799
|
|
|
8800
|
+
If the validation has not yet been interrogated, this method will automatically call
|
|
8801
|
+
[`interrogate()`](`pointblank.Validate.interrogate`) with default parameters before checking
|
|
8802
|
+
for passing tests.
|
|
8803
|
+
|
|
8775
8804
|
Raises
|
|
8776
8805
|
-------
|
|
8777
8806
|
AssertionError
|
|
@@ -8781,8 +8810,9 @@ class Validate:
|
|
|
8781
8810
|
--------
|
|
8782
8811
|
In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
|
|
8783
8812
|
`c`). There will be three validation steps, and the second step will have a failing test
|
|
8784
|
-
unit (the value `10` isn't less than `9`).
|
|
8785
|
-
|
|
8813
|
+
unit (the value `10` isn't less than `9`). The `assert_passing()` method is used to assert
|
|
8814
|
+
that all validation steps passed perfectly, automatically performing the interrogation if
|
|
8815
|
+
needed.
|
|
8786
8816
|
|
|
8787
8817
|
```{python}
|
|
8788
8818
|
#| error: True
|
|
@@ -8803,12 +8833,16 @@ class Validate:
|
|
|
8803
8833
|
.col_vals_gt(columns="a", value=0)
|
|
8804
8834
|
.col_vals_lt(columns="b", value=9) # this assertion is false
|
|
8805
8835
|
.col_vals_in_set(columns="c", set=["a", "b"])
|
|
8806
|
-
.interrogate()
|
|
8807
8836
|
)
|
|
8808
8837
|
|
|
8838
|
+
# No need to call `interrogate()` explicitly
|
|
8809
8839
|
validation.assert_passing()
|
|
8810
8840
|
```
|
|
8811
8841
|
"""
|
|
8842
|
+
# Check if validation has been interrogated
|
|
8843
|
+
if not hasattr(self, "time_start") or self.time_start is None:
|
|
8844
|
+
# Auto-interrogate with default parameters
|
|
8845
|
+
self.interrogate()
|
|
8812
8846
|
|
|
8813
8847
|
if not self.all_passed():
|
|
8814
8848
|
failed_steps = [
|
|
@@ -8821,6 +8855,306 @@ class Validate:
|
|
|
8821
8855
|
)
|
|
8822
8856
|
raise AssertionError(msg)
|
|
8823
8857
|
|
|
8858
|
+
def assert_below_threshold(
|
|
8859
|
+
self, level: str = "warning", i: int | None = None, message: str | None = None
|
|
8860
|
+
) -> None:
|
|
8861
|
+
"""
|
|
8862
|
+
Raise an `AssertionError` if validation steps exceed a specified threshold level.
|
|
8863
|
+
|
|
8864
|
+
The `assert_below_threshold()` method checks whether validation steps' failure rates are
|
|
8865
|
+
below a given threshold level (`"warning"`, `"error"`, or `"critical"`). This is
|
|
8866
|
+
particularly useful in automated testing environments where you want to ensure your data
|
|
8867
|
+
quality meets minimum standards before proceeding.
|
|
8868
|
+
|
|
8869
|
+
If any validation step exceeds the specified threshold level, an `AssertionError` will be
|
|
8870
|
+
raised with details about which steps failed. If the validation has not yet been
|
|
8871
|
+
interrogated, this method will automatically call
|
|
8872
|
+
[`interrogate()`](`pointblank.Validate.interrogate`) with default parameters.
|
|
8873
|
+
|
|
8874
|
+
Parameters
|
|
8875
|
+
----------
|
|
8876
|
+
level
|
|
8877
|
+
The threshold level to check against, which could be any of `"warning"` (the default),
|
|
8878
|
+
`"error"`, or `"critical"`. An `AssertionError` will be raised if any validation step
|
|
8879
|
+
exceeds this level.
|
|
8880
|
+
i
|
|
8881
|
+
Specific validation step number(s) to check. Can be provided as a single integer or a
|
|
8882
|
+
list of integers. If `None` (the default), all steps are checked.
|
|
8883
|
+
message
|
|
8884
|
+
Custom error message to use if assertion fails. If `None`, a default message will be
|
|
8885
|
+
generated that lists the specific steps that exceeded the threshold.
|
|
8886
|
+
|
|
8887
|
+
Returns
|
|
8888
|
+
-------
|
|
8889
|
+
None
|
|
8890
|
+
|
|
8891
|
+
Raises
|
|
8892
|
+
------
|
|
8893
|
+
AssertionError
|
|
8894
|
+
If any specified validation step exceeds the given threshold level.
|
|
8895
|
+
ValueError
|
|
8896
|
+
If an invalid threshold level is provided.
|
|
8897
|
+
|
|
8898
|
+
Examples
|
|
8899
|
+
--------
|
|
8900
|
+
```{python}
|
|
8901
|
+
#| echo: false
|
|
8902
|
+
#| output: false
|
|
8903
|
+
import pointblank as pb
|
|
8904
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
8905
|
+
```
|
|
8906
|
+
Below are some examples of how to use the `assert_below_threshold()` method. First, we'll
|
|
8907
|
+
create a simple Polars DataFrame with two columns (`a` and `b`).
|
|
8908
|
+
|
|
8909
|
+
```{python}
|
|
8910
|
+
import polars as pl
|
|
8911
|
+
|
|
8912
|
+
tbl = pl.DataFrame({
|
|
8913
|
+
"a": [7, 4, 9, 7, 12],
|
|
8914
|
+
"b": [9, 8, 10, 5, 10]
|
|
8915
|
+
})
|
|
8916
|
+
```
|
|
8917
|
+
|
|
8918
|
+
Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
|
|
8919
|
+
`critical=0.3`). After interrogating, we display the validation report table:
|
|
8920
|
+
|
|
8921
|
+
```{python}
|
|
8922
|
+
import pointblank as pb
|
|
8923
|
+
|
|
8924
|
+
validation = (
|
|
8925
|
+
pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
|
|
8926
|
+
.col_vals_gt(columns="a", value=5) # 1 failing test unit
|
|
8927
|
+
.col_vals_lt(columns="b", value=10) # 2 failing test units
|
|
8928
|
+
.interrogate()
|
|
8929
|
+
)
|
|
8930
|
+
|
|
8931
|
+
validation
|
|
8932
|
+
```
|
|
8933
|
+
|
|
8934
|
+
Using `assert_below_threshold(level="warning")` will raise an `AssertionError` if any step
|
|
8935
|
+
exceeds the 'warning' threshold:
|
|
8936
|
+
|
|
8937
|
+
```{python}
|
|
8938
|
+
try:
|
|
8939
|
+
validation.assert_below_threshold(level="warning")
|
|
8940
|
+
except AssertionError as e:
|
|
8941
|
+
print(f"Assertion failed: {e}")
|
|
8942
|
+
```
|
|
8943
|
+
|
|
8944
|
+
Check a specific step against the 'critical' threshold using the `i=` parameter:
|
|
8945
|
+
|
|
8946
|
+
```{python}
|
|
8947
|
+
validation.assert_below_threshold(level="critical", i=1) # Won't raise an error
|
|
8948
|
+
```
|
|
8949
|
+
|
|
8950
|
+
As the first step is below the 'critical' threshold (it exceeds the 'warning' and 'error'
|
|
8951
|
+
thresholds), no error is raised and nothing is printed.
|
|
8952
|
+
|
|
8953
|
+
We can also provide a custom error message with the `message=` parameter. Let's try that
|
|
8954
|
+
here:
|
|
8955
|
+
|
|
8956
|
+
```{python}
|
|
8957
|
+
try:
|
|
8958
|
+
validation.assert_below_threshold(
|
|
8959
|
+
level="error",
|
|
8960
|
+
message="Data quality too low for processing!"
|
|
8961
|
+
)
|
|
8962
|
+
except AssertionError as e:
|
|
8963
|
+
print(f"Custom error: {e}")
|
|
8964
|
+
```
|
|
8965
|
+
|
|
8966
|
+
See Also
|
|
8967
|
+
--------
|
|
8968
|
+
- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
|
|
8969
|
+
step
|
|
8970
|
+
- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
|
|
8971
|
+
- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
|
|
8972
|
+
validation step
|
|
8973
|
+
- [`assert_passing()`](`pointblank.Validate.assert_passing`): assert all validations pass
|
|
8974
|
+
completely
|
|
8975
|
+
"""
|
|
8976
|
+
# Check if validation has been interrogated
|
|
8977
|
+
if not hasattr(self, "time_start") or self.time_start is None:
|
|
8978
|
+
# Auto-interrogate with default parameters
|
|
8979
|
+
self.interrogate()
|
|
8980
|
+
|
|
8981
|
+
# Validate the level parameter
|
|
8982
|
+
level = level.lower()
|
|
8983
|
+
if level not in ["warning", "error", "critical"]:
|
|
8984
|
+
raise ValueError(
|
|
8985
|
+
f"Invalid threshold level: {level}. Must be one of 'warning', 'error', or 'critical'."
|
|
8986
|
+
)
|
|
8987
|
+
|
|
8988
|
+
# Get the threshold status using the appropriate method
|
|
8989
|
+
if level == "warning":
|
|
8990
|
+
status = self.warning(i=i)
|
|
8991
|
+
elif level == "error":
|
|
8992
|
+
status = self.error(i=i)
|
|
8993
|
+
elif level == "critical":
|
|
8994
|
+
status = self.critical(i=i)
|
|
8995
|
+
|
|
8996
|
+
# Find any steps that exceeded the threshold
|
|
8997
|
+
failures = []
|
|
8998
|
+
for step_num, exceeded in status.items():
|
|
8999
|
+
if exceeded:
|
|
9000
|
+
# Get the step's description
|
|
9001
|
+
validation_step = self.validation_info[step_num - 1]
|
|
9002
|
+
step_descriptor = (
|
|
9003
|
+
validation_step.autobrief
|
|
9004
|
+
if hasattr(validation_step, "autobrief") and validation_step.autobrief
|
|
9005
|
+
else f"Validation step {step_num}"
|
|
9006
|
+
)
|
|
9007
|
+
failures.append(f"Step {step_num}: {step_descriptor}")
|
|
9008
|
+
|
|
9009
|
+
# If any failures were found, raise an AssertionError
|
|
9010
|
+
if failures:
|
|
9011
|
+
if message:
|
|
9012
|
+
msg = message
|
|
9013
|
+
else:
|
|
9014
|
+
msg = f"The following steps exceeded the {level} threshold level:\n" + "\n".join(
|
|
9015
|
+
failures
|
|
9016
|
+
)
|
|
9017
|
+
raise AssertionError(msg)
|
|
9018
|
+
|
|
9019
|
+
def above_threshold(self, level: str = "warning", i: int | None = None) -> bool:
|
|
9020
|
+
"""
|
|
9021
|
+
Check if any validation steps exceed a specified threshold level.
|
|
9022
|
+
|
|
9023
|
+
The `above_threshold()` method checks whether validation steps exceed a given threshold
|
|
9024
|
+
level. This provides a non-exception-based alternative to
|
|
9025
|
+
[`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`) for conditional
|
|
9026
|
+
workflow control based on validation results.
|
|
9027
|
+
|
|
9028
|
+
This method is useful in scenarios where you want to check if any validation steps failed
|
|
9029
|
+
beyond a certain threshold without raising an exception, allowing for more flexible
|
|
9030
|
+
programmatic responses to validation issues.
|
|
9031
|
+
|
|
9032
|
+
Parameters
|
|
9033
|
+
----------
|
|
9034
|
+
level
|
|
9035
|
+
The threshold level to check against. Valid options are: `"warning"` (the least severe
|
|
9036
|
+
threshold level), `"error"` (the middle severity threshold level), and `"critical"` (the
|
|
9037
|
+
most severe threshold level). The default is `"warning"`.
|
|
9038
|
+
i
|
|
9039
|
+
Specific validation step number(s) to check. If a single integer, checks only that step.
|
|
9040
|
+
If a list of integers, checks all specified steps. If `None` (the default), checks all
|
|
9041
|
+
validation steps. Step numbers are 1-based (first step is `1`, not `0`).
|
|
9042
|
+
|
|
9043
|
+
Returns
|
|
9044
|
+
-------
|
|
9045
|
+
bool
|
|
9046
|
+
`True` if any of the specified validation steps exceed the given threshold level,
|
|
9047
|
+
`False` otherwise.
|
|
9048
|
+
|
|
9049
|
+
Raises
|
|
9050
|
+
------
|
|
9051
|
+
ValueError
|
|
9052
|
+
If an invalid threshold level is provided.
|
|
9053
|
+
|
|
9054
|
+
Examples
|
|
9055
|
+
--------
|
|
9056
|
+
```{python}
|
|
9057
|
+
#| echo: false
|
|
9058
|
+
#| output: false
|
|
9059
|
+
import pointblank as pb
|
|
9060
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
9061
|
+
```
|
|
9062
|
+
Below are some examples of how to use the `above_threshold()` method. First, we'll create a
|
|
9063
|
+
simple Polars DataFrame with a single column (`values`).
|
|
9064
|
+
|
|
9065
|
+
```{python}
|
|
9066
|
+
import polars as pl
|
|
9067
|
+
|
|
9068
|
+
tbl = pl.DataFrame({
|
|
9069
|
+
"values": [1, 2, 3, 4, 5, 0, -1]
|
|
9070
|
+
})
|
|
9071
|
+
```
|
|
9072
|
+
|
|
9073
|
+
Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
|
|
9074
|
+
`critical=0.3`). After interrogating, we display the validation report table:
|
|
9075
|
+
|
|
9076
|
+
```{python}
|
|
9077
|
+
import pointblank as pb
|
|
9078
|
+
|
|
9079
|
+
validation = (
|
|
9080
|
+
pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
|
|
9081
|
+
.col_vals_gt(columns="values", value=0)
|
|
9082
|
+
.col_vals_lt(columns="values", value=10)
|
|
9083
|
+
.col_vals_between(columns="values", left=0, right=5)
|
|
9084
|
+
.interrogate()
|
|
9085
|
+
)
|
|
9086
|
+
|
|
9087
|
+
validation
|
|
9088
|
+
```
|
|
9089
|
+
|
|
9090
|
+
Let's check if any steps exceed the 'warning' threshold with the `above_threshold()` method.
|
|
9091
|
+
A message will be printed if that's the case:
|
|
9092
|
+
|
|
9093
|
+
```{python}
|
|
9094
|
+
if validation.above_threshold(level="warning"):
|
|
9095
|
+
print("Some steps have exceeded the warning threshold")
|
|
9096
|
+
```
|
|
9097
|
+
|
|
9098
|
+
Check only steps 2 and 3 against the 'error' threshold through use of the `i=` argument:
|
|
9099
|
+
|
|
9100
|
+
```{python}
|
|
9101
|
+
if validation.above_threshold(level="error", i=[2, 3]):
|
|
9102
|
+
print("Steps 2 and/or 3 have exceeded the error threshold")
|
|
9103
|
+
```
|
|
9104
|
+
|
|
9105
|
+
You can use this in a workflow to conditionally trigger processes. Here's a snippet of how
|
|
9106
|
+
you might use this in a function:
|
|
9107
|
+
|
|
9108
|
+
```python
|
|
9109
|
+
def process_data(validation_obj):
|
|
9110
|
+
# Only continue processing if no step exceeds the 'critical' threshold
|
|
9111
|
+
if not validation_obj.above_threshold(level="critical"):
|
|
9112
|
+
# Continue with processing
|
|
9113
|
+
print("Data meets critical quality thresholds, proceeding...")
|
|
9114
|
+
return True
|
|
9115
|
+
else:
|
|
9116
|
+
# Log failure and stop processing
|
|
9117
|
+
print("Data fails critical quality checks, aborting...")
|
|
9118
|
+
return False
|
|
9119
|
+
```
|
|
9120
|
+
|
|
9121
|
+
Note that this is just a suggestion for how to implement conditional workflow processes. You
|
|
9122
|
+
should adapt this pattern to your specific requirements, which might include different
|
|
9123
|
+
threshold levels, custom logging mechanisms, or integration with your organization's data
|
|
9124
|
+
pipelines and notification systems.
|
|
9125
|
+
|
|
9126
|
+
See Also
|
|
9127
|
+
--------
|
|
9128
|
+
- [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`): a similar
|
|
9129
|
+
method that raises an exception if thresholds are exceeded
|
|
9130
|
+
- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
|
|
9131
|
+
step
|
|
9132
|
+
- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
|
|
9133
|
+
- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
|
|
9134
|
+
validation step
|
|
9135
|
+
"""
|
|
9136
|
+
# If the validation has not been interrogated there are no results, so report no exceedance
|
|
9137
|
+
if not hasattr(self, "time_start") or self.time_start is None:
|
|
9138
|
+
return False
|
|
9139
|
+
|
|
9140
|
+
# Validate the level parameter
|
|
9141
|
+
level = level.lower()
|
|
9142
|
+
if level not in ["warning", "error", "critical"]:
|
|
9143
|
+
raise ValueError(
|
|
9144
|
+
f"Invalid threshold level: {level}. Must be one of 'warning', 'error', or 'critical'."
|
|
9145
|
+
)
|
|
9146
|
+
|
|
9147
|
+
# Get the threshold status using the appropriate method
|
|
9148
|
+
if level == "warning":
|
|
9149
|
+
status = self.warning(i=i)
|
|
9150
|
+
elif level == "error":
|
|
9151
|
+
status = self.error(i=i)
|
|
9152
|
+
elif level == "critical":
|
|
9153
|
+
status = self.critical(i=i)
|
|
9154
|
+
|
|
9155
|
+
# Return True if any steps exceeded the threshold
|
|
9156
|
+
return any(status.values())
|
|
9157
|
+
|
|
8824
9158
|
def n(self, i: int | list[int] | None = None, scalar: bool = False) -> dict[int, int] | int:
|
|
8825
9159
|
"""
|
|
8826
9160
|
Provides a dictionary of the number of test units for each validation step.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pointblank
|
|
3
|
-
Version: 0.9.
|
|
3
|
+
Version: 0.9.6
|
|
4
4
|
Summary: Find out if your data is what you think it is.
|
|
5
5
|
Author-email: Richard Iannone <riannone@me.com>
|
|
6
6
|
License: MIT License
|
|
@@ -55,6 +55,8 @@ Requires-Dist: chatlas>=0.3.0; extra == "generate"
|
|
|
55
55
|
Requires-Dist: anthropic[bedrock]>=0.45.2; extra == "generate"
|
|
56
56
|
Requires-Dist: openai>=1.63.0; extra == "generate"
|
|
57
57
|
Requires-Dist: shiny>=1.3.0; extra == "generate"
|
|
58
|
+
Provides-Extra: bigquery
|
|
59
|
+
Requires-Dist: ibis-framework[bigquery]>=9.5.0; extra == "bigquery"
|
|
58
60
|
Provides-Extra: databricks
|
|
59
61
|
Requires-Dist: ibis-framework[databricks]>=9.5.0; extra == "databricks"
|
|
60
62
|
Provides-Extra: duckdb
|
|
@@ -103,7 +105,7 @@ _Data validation made beautiful and powerful_
|
|
|
103
105
|
|
|
104
106
|
</div>
|
|
105
107
|
|
|
106
|
-
<div align="
|
|
108
|
+
<div align="center">
|
|
107
109
|
<a href="translations/README.fr.md">Français</a> |
|
|
108
110
|
<a href="translations/README.de.md">Deutsch</a> |
|
|
109
111
|
<a href="translations/README.it.md">Italiano</a> |
|
|
@@ -112,7 +114,9 @@ _Data validation made beautiful and powerful_
|
|
|
112
114
|
<a href="translations/README.nl.md">Nederlands</a> |
|
|
113
115
|
<a href="translations/README.zh-CN.md">简体中文</a> |
|
|
114
116
|
<a href="translations/README.ja.md">日本語</a> |
|
|
115
|
-
<a href="translations/README.ko.md">한국어</a>
|
|
117
|
+
<a href="translations/README.ko.md">한국어</a> |
|
|
118
|
+
<a href="translations/README.hi.md">हिन्दी</a> |
|
|
119
|
+
<a href="translations/README.ar.md">العربية</a>
|
|
116
120
|
</div>
|
|
117
121
|
|
|
118
122
|
## What is Pointblank?
|
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
pointblank/__init__.py,sha256=uHrX-ARZOhvWogXXqKV65RO2DXdYLZNCD1oNcm8hE6o,1585
|
|
2
|
-
pointblank/_constants.py,sha256=
|
|
2
|
+
pointblank/_constants.py,sha256=YeQVYpSkdQ8v7D8ZJnG-M75zqAH3yJuDzzjwWC2I-d8,81227
|
|
3
3
|
pointblank/_constants_docs.py,sha256=JBmtt16zTYQ-zaM4ElLExtKs-dKlnN553Ys2ML1Y1C8,2099
|
|
4
4
|
pointblank/_constants_translations.py,sha256=HXcCYmKoMjoaFv-Ym4UWv3AsIVXik2zDyAy7xvTvv0Y,186710
|
|
5
5
|
pointblank/_interrogation.py,sha256=U4GQ8Ik5rP75BYBkmunBvHKwf3XvLPHcUx18JwiBQZI,89422
|
|
6
6
|
pointblank/_typing.py,sha256=aItbCbzhbzqjK3lCbL27ltRyXoAH1c3-U6xQdRzg-lU,1594
|
|
7
|
-
pointblank/_utils.py,sha256=
|
|
7
|
+
pointblank/_utils.py,sha256=BoIwMEZYBwPEe5xGku1vSmkgAeGgnA4_bQ4MDeYFGrc,24824
|
|
8
8
|
pointblank/_utils_check_args.py,sha256=rFEc1nbCN8ftsQQWVjCNWmQ2QmUDxkfgmoJclrZeTLs,5489
|
|
9
9
|
pointblank/_utils_html.py,sha256=sTcmnBljkPjRZF1hbpoHl4HmnXOazsA91gC9iWVIrRk,2848
|
|
10
|
-
pointblank/actions.py,sha256=
|
|
10
|
+
pointblank/actions.py,sha256=D6o9B2_ES9PNQg9HZwREacrrt-3A5bhdrBkL1UXz__s,18281
|
|
11
11
|
pointblank/assistant.py,sha256=ZIQJKTy9rDwq_Wmr1FMp0J7Q3ekxSgF3_tK0p4PTEUM,14850
|
|
12
|
-
pointblank/column.py,sha256=
|
|
12
|
+
pointblank/column.py,sha256=_FJjpjv760D1p6YGgqbwmKYktouG7AJ2A9uIMYQBTYA,76560
|
|
13
13
|
pointblank/datascan.py,sha256=rRz0hR81uTgd1e9OfLdfsNYXRk8vcpE8PW8exu-GJoE,47697
|
|
14
14
|
pointblank/draft.py,sha256=cusr4fBiNncCKIOU8UwvJcvkBeBuUnqH_UfYp9dtNss,15777
|
|
15
|
-
pointblank/schema.py,sha256=
|
|
15
|
+
pointblank/schema.py,sha256=nHkOXykPw7mTmVGjT67hjx13iKySZ5xsfVgPUQV0yCM,44588
|
|
16
16
|
pointblank/tf.py,sha256=8o_8m4i01teulEe3-YYMotSNf3tImjBMInsvdjSAO5Q,8844
|
|
17
|
-
pointblank/thresholds.py,sha256=
|
|
18
|
-
pointblank/validate.py,sha256=
|
|
19
|
-
pointblank/data/api-docs.txt,sha256=
|
|
17
|
+
pointblank/thresholds.py,sha256=mybeLzTVdmN04NLKoV-jiSBXsWknwHO0Gox0ttVN_MU,25766
|
|
18
|
+
pointblank/validate.py,sha256=dM5U41me38atNDt1Llzv08gdUcnYyvWoHycQPpctidg,621961
|
|
19
|
+
pointblank/data/api-docs.txt,sha256=6cdUIYdVy2XfGRLNNxtcGTaxu2WX4EXEeICayOvJCTs,492756
|
|
20
20
|
pointblank/data/game_revenue-duckdb.zip,sha256=tKIVx48OGLYGsQPS3h5AjA2Nyq_rfEpLCjBiFUWhagU,35880
|
|
21
21
|
pointblank/data/game_revenue.zip,sha256=7c9EvHLyi93CHUd4p3dM4CZ-GucFCtXKSPxgLojL32U,33749
|
|
22
22
|
pointblank/data/global_sales-duckdb.zip,sha256=2ok_cvJ1ZuSkXnw0R6_OkKYRTWhJ-jJEMq2VYsv5fqY,1336390
|
|
@@ -26,8 +26,8 @@ pointblank/data/nycflights.zip,sha256=yVjbUaKUz2LydSdF9cABuir0VReHBBgV7shiNWSd0m
|
|
|
26
26
|
pointblank/data/polars-api-docs.txt,sha256=KGcS-BOtUs9zgpkWfXD-GFdFh4O_zjdkpX7msHjztLg,198045
|
|
27
27
|
pointblank/data/small_table-duckdb.zip,sha256=BhTaZ2CRS4-9Z1uVhOU6HggvW3XCar7etMznfENIcOc,2028
|
|
28
28
|
pointblank/data/small_table.zip,sha256=lmFb90Nb-v5X559Ikjg31YLAXuRyMkD9yLRElkXPMzQ,472
|
|
29
|
-
pointblank-0.9.
|
|
30
|
-
pointblank-0.9.
|
|
31
|
-
pointblank-0.9.
|
|
32
|
-
pointblank-0.9.
|
|
33
|
-
pointblank-0.9.
|
|
29
|
+
pointblank-0.9.6.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
|
|
30
|
+
pointblank-0.9.6.dist-info/METADATA,sha256=_BocxWcU0_AXIiMGBPcxsd9VwrD8uGXjXpjE16hUhVw,14950
|
|
31
|
+
pointblank-0.9.6.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
|
32
|
+
pointblank-0.9.6.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
|
|
33
|
+
pointblank-0.9.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|