pointblank 0.9.1__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +14 -0
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +101 -0
- pointblank/_typing.py +35 -24
- pointblank/_utils.py +1 -0
- pointblank/actions.py +2 -2
- pointblank/data/api-docs.txt +305 -4
- pointblank/data/global_sales-duckdb.zip +0 -0
- pointblank/data/global_sales.zip +0 -0
- pointblank/thresholds.py +3 -2
- pointblank/validate.py +461 -20
- {pointblank-0.9.1.dist-info → pointblank-0.9.4.dist-info}/METADATA +1 -1
- {pointblank-0.9.1.dist-info → pointblank-0.9.4.dist-info}/RECORD +16 -14
- {pointblank-0.9.1.dist-info → pointblank-0.9.4.dist-info}/WHEEL +1 -1
- {pointblank-0.9.1.dist-info → pointblank-0.9.4.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.1.dist-info → pointblank-0.9.4.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -58,6 +58,7 @@ from pointblank._interrogation import (
     RowCountMatch,
     RowsComplete,
     RowsDistinct,
+    SpeciallyValidation,
 )
 from pointblank._typing import SegmentSpec
 from pointblank._utils import (
@@ -384,7 +385,7 @@ def config(
 
 
 def load_dataset(
-    dataset: Literal["small_table", "game_revenue", "nycflights"] = "small_table",
+    dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
     tbl_type: Literal["polars", "pandas", "duckdb"] = "polars",
 ) -> FrameT | Any:
     """
@@ -400,7 +401,7 @@ def load_dataset(
     ----------
     dataset
         The name of the dataset to load. Current options are `"small_table"`, `"game_revenue"`,
-        and `"nycflights"`.
+        `"nycflights"`, and `"global_sales"`.
     tbl_type
         The type of table to generate from the dataset. The named options are `"polars"`,
         `"pandas"`, and `"duckdb"`.
@@ -422,6 +423,8 @@ def load_dataset(
     they purchased, ads viewed, and the revenue generated.
     - `"nycflights"`: A dataset with 336,776 rows and 18 columns. This dataset provides information
     about flights departing from New York City airports (JFK, LGA, or EWR) in 2013.
+    - `"global_sales"`: A dataset with 50,000 rows and 20 columns. Provides information about
+    global sales of products across different regions and countries.
 
     Supported DataFrame Types
     -------------------------
@@ -433,18 +436,18 @@ def load_dataset(
 
     Examples
     --------
-    Load the `"small_table"` dataset as a Polars DataFrame by calling `load_dataset()` with
-
+    Load the `"small_table"` dataset as a Polars DataFrame by calling `load_dataset()` with
+    `dataset="small_table"` and `tbl_type="polars"`:
 
     ```{python}
     import pointblank as pb
 
-    small_table = pb.load_dataset()
+    small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
 
     pb.preview(small_table)
     ```
 
-    Note that the `"small_table"` dataset is a
+    Note that the `"small_table"` dataset is a Polars DataFrame and using the
     [`preview()`](`pointblank.preview`) function will display the table in an HTML viewing
     environment.
 
@@ -472,10 +475,23 @@ def load_dataset(
     The `"nycflights"` dataset is a large dataset with 336,776 rows and 18 columns. This dataset is
     truly a real-world dataset and provides information about flights originating from New York City
     airports in 2013.
+
+    Finally, the `"global_sales"` dataset can be loaded as a Polars table by specifying the dataset
+    name. Since `tbl_type=` is set to `"polars"` by default, we don't need to specify it:
+
+    ```{python}
+    global_sales = pb.load_dataset(dataset="global_sales")
+
+    pb.preview(global_sales)
+    ```
+
+    The `"global_sales"` dataset is a large dataset with 50,000 rows and 20 columns. Each record
+    describes the sales of a particular product to a customer located in one of three global
+    regions: North America, Europe, or Asia.
     """
 
     # Raise an error if the dataset is from the list of provided datasets
-    if dataset not in ["small_table", "game_revenue", "nycflights"]:
+    if dataset not in ["small_table", "game_revenue", "nycflights", "global_sales"]:
         raise ValueError(
             f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
             "- `small_table`\n"
@@ -517,6 +533,7 @@ def load_dataset(
         "small_table": ["date_time", "date"],
         "game_revenue": ["session_start", "time", "start_day"],
         "nycflights": [],
+        "global_sales": ["timestamp"],
     }
 
     dataset = pd.read_csv(data_path, parse_dates=parse_date_columns[dataset])
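
The new `"global_sales"` option slots into `load_dataset()` alongside the existing datasets. As a quick sketch of how it might be used (only the dataset name, the `tbl_type=` options, and the `timestamp` parse-date column are confirmed by this diff; the rest is illustrative):

```python
import pointblank as pb

# The default table type is Polars; pandas and DuckDB are the other named options.
# In the pandas case, the `timestamp` column is parsed as datetimes via the
# `parse_date_columns` entry added above.
sales_pl = pb.load_dataset(dataset="global_sales")
sales_pd = pb.load_dataset(dataset="global_sales", tbl_type="pandas")
sales_db = pb.load_dataset(dataset="global_sales", tbl_type="duckdb")

pb.preview(sales_pl)
```
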
@@ -7633,7 +7650,7 @@ class Validate:
 
         val_info = _ValidationInfo(
             assertion_type=assertion_type,
-            column=None,  # This is
+            column=None,  # This validation is not specific to any column(s)
             values=values,
             pre=pre,
             thresholds=thresholds,
@@ -7646,6 +7663,351 @@ class Validate:
 
         return self
 
+    def specially(
+        self,
+        expr: Callable,
+        pre: Callable | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Perform a specialized validation with customized logic.
+
+        The `specially()` validation method allows for the creation of specialized validation
+        expressions that can be used to validate specific conditions or logic in the data. This
+        method provides maximum flexibility by accepting a custom callable that encapsulates
+        your validation logic.
+
+        The callable function can have one of two signatures:
+
+        - a function accepting a single parameter (the data table): `def validate(data): ...`
+        - a function with no parameters: `def validate(): ...`
+
+        The second form is particularly useful for environment validations that don't need to
+        inspect the data table.
+
+        The callable function must ultimately return one of:
+
+        1. a single boolean value or boolean list
+        2. a table where the final column contains boolean values (column name is unimportant)
+
+        The validation will operate over the number of test units that is equal to the number of
+        rows in the data table (if returning a table with boolean values). If returning a scalar
+        boolean value, the validation will operate over a single test unit. For a return of a list
+        of boolean values, the length of the list constitutes the number of test units.
+
+        Parameters
+        ----------
+        expr
+            A callable function that defines the specialized validation logic. This function should:
+            (1) accept the target data table as its single argument (though it may ignore it), or
+            (2) take no parameters at all (for environment validations). The function must
+            ultimately return boolean values representing validation results. Design your function
+            to incorporate any custom parameters directly within the function itself using closure
+            variables or default parameters.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedences of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Regarding the lifetime of the
+        transformed table, it only exists during the validation step and is not stored in the
+        `Validate` object or used in subsequent validation steps.
+
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` to `1`),
+        or, the absolute number of failing test units (as integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+        set, you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        The `specially()` method offers maximum flexibility for validation, allowing you to create
+        custom validation logic that fits your specific needs. The following examples demonstrate
+        different patterns and use cases for this powerful validation approach.
+
+        ### Simple validation with direct table access
+
+        This example shows the most straightforward use case where we create a function that
+        directly checks if the sum of two columns is positive.
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        simple_tbl = pl.DataFrame({
+            "a": [5, 7, 1, 3, 9, 4],
+            "b": [6, 3, 0, 5, 8, 2]
+        })
+
+        # Simple function that validates directly on the table
+        def validate_sum_positive(data):
+            return data.select(pl.col("a") + pl.col("b") > 0)
+
+        (
+            pb.Validate(data=simple_tbl)
+            .specially(expr=validate_sum_positive)
+            .interrogate()
+        )
+        ```
+
+        The function returns a Polars DataFrame with a single boolean column indicating whether
+        the sum of columns `a` and `b` is positive for each row. Each row in the resulting DataFrame
+        is a distinct test unit. This pattern works well for simple validations where you don't need
+        configurable parameters.
+
+        ### Advanced validation with closure variables for parameters
+
+        When you need to make your validation configurable, you can use the function factory pattern
+        (also known as closures) to create parameterized validations:
+
+        ```{python}
+        # Create a parameterized validation function using closures
+        def make_column_ratio_validator(col1, col2, min_ratio):
+            def validate_column_ratio(data):
+                return data.select((pl.col(col1) / pl.col(col2)) > min_ratio)
+            return validate_column_ratio
+
+        (
+            pb.Validate(data=simple_tbl)
+            .specially(
+                expr=make_column_ratio_validator(col1="a", col2="b", min_ratio=0.5)
+            )
+            .interrogate()
+        )
+        ```
+
+        This approach allows you to create reusable validation functions that can be configured with
+        different parameters without modifying the function itself.
+
+        ### Validation function returning a list of booleans
+
+        This example demonstrates how to create a validation function that returns a list of boolean
+        values, where each element represents a separate test unit:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+        import random
+
+        # Create sample data
+        transaction_tbl = pl.DataFrame({
+            "transaction_id": [f"TX{i:04d}" for i in range(1, 11)],
+            "amount": [120.50, 85.25, 50.00, 240.75, 35.20, 150.00, 85.25, 65.00, 210.75, 90.50],
+            "category": ["food", "shopping", "entertainment", "travel", "utilities",
+                         "food", "shopping", "entertainment", "travel", "utilities"]
+        })
+
+        # Define a validation function that returns a list of booleans
+        def validate_transaction_rules(data):
+            # Create a list to store individual test results
+            test_results = []
+
+            # Check each row individually against multiple business rules
+            for row in data.iter_rows(named=True):
+                # Rule: transaction IDs must start with "TX" and be 6 chars long
+                valid_id = row["transaction_id"].startswith("TX") and len(row["transaction_id"]) == 6
+
+                # Rule: Amounts must be appropriate for their category
+                valid_amount = True
+                if row["category"] == "food" and (row["amount"] < 10 or row["amount"] > 200):
+                    valid_amount = False
+                elif row["category"] == "utilities" and (row["amount"] < 20 or row["amount"] > 300):
+                    valid_amount = False
+                elif row["category"] == "entertainment" and row["amount"] > 100:
+                    valid_amount = False
+
+                # A transaction passes if it satisfies both rules
+                test_results.append(valid_id and valid_amount)
+
+            return test_results
+
+        (
+            pb.Validate(data=transaction_tbl)
+            .specially(
+                expr=validate_transaction_rules,
+                brief="Validate transaction IDs and amounts by category."
+            )
+            .interrogate()
+        )
+        ```
+
+        This example shows how to create a validation function that applies multiple business rules
+        to each row and returns a list of boolean results. Each boolean in the list represents a
+        separate test unit, and a test unit passes only if all rules are satisfied for a given row.
+
+        The function iterates through each row in the data table, checking:
+
+        1. if transaction IDs follow the required format
+        2. if transaction amounts are appropriate for their respective categories
+
+        This approach is powerful when you need to apply complex, conditional logic that can't be
+        easily expressed using the built-in validation functions.
+
+        ### Table-level validation returning a single boolean
+
+        Sometimes you need to validate properties of the entire table rather than row-by-row. In
+        these cases, your function can return a single boolean value:
+
+        ```{python}
+        def validate_table_properties(data):
+            # Check if table has at least one row with column 'a' > 10
+            has_large_values = data.filter(pl.col("a") > 10).height > 0
+
+            # Check if mean of column 'b' is positive
+            has_positive_mean = data.select(pl.mean("b")).item() > 0
+
+            # Return a single boolean for the entire table
+            return has_large_values and has_positive_mean
+
+        (
+            pb.Validate(data=simple_tbl)
+            .specially(expr=validate_table_properties)
+            .interrogate()
+        )
+        ```
+
+        This example demonstrates how to perform multiple checks on the table as a whole and combine
+        them into a single validation result.
+
+        ### Environment validation that doesn't use the data table
+
+        The `specially()` validation method can even be used to validate aspects of your environment
+        that are completely independent of the data:
+
+        ```{python}
+        def validate_pointblank_version():
+            try:
+                import importlib.metadata
+                version = importlib.metadata.version("pointblank")
+                version_parts = version.split(".")
+
+                # Get major and minor components regardless of how many parts there are
+                major = int(version_parts[0])
+                minor = int(version_parts[1])
+
+                # Check both major and minor components for version `0.9+`
+                return (major > 0) or (major == 0 and minor >= 9)
+
+            except Exception as e:
+                # More specific error handling could be added here
+                print(f"Version check failed: {e}")
+                return False
+
+        (
+            pb.Validate(data=simple_tbl)
+            .specially(
+                expr=validate_pointblank_version,
+                brief="Check Pointblank version `>=0.9.0`."
+            )
+            .interrogate()
+        )
+        ```
+
+        This pattern shows how to validate external dependencies or environment conditions as part
+        of your validation workflow. Notice that the function doesn't take any parameters at all,
+        which makes it cleaner when the validation doesn't need to access the data table.
+
+        By combining these patterns, you can create sophisticated validation workflows that address
+        virtually any data quality requirement in your organization.
+        """
+
+        assertion_type = _get_fn_name()
+
+        # TODO: add a check for the expression to be a callable
+        # _check_expr_specially(expr=expr)
+        _check_pre(pre=pre)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            column=None,  # This validation is not specific to any column(s)
+            values=expr,
+            pre=pre,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+
     def interrogate(
         self,
         collect_extracts: bool = True,
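
None of the docstring examples above attach step-level thresholds to a `specially()` step, so here is a minimal sketch combining the tuple threshold scheme and the `{step}`/`{thresholds}` brief templates described elsewhere in this diff (the data and the check itself are invented for illustration):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [5, 7, 1, 3, 9, 4], "b": [6, 3, 0, 5, 8, 2]})

# One test unit per row: passes where `a` is strictly greater than `b`
def a_greater_than_b(data):
    return data.select(pl.col("a") > pl.col("b"))

(
    pb.Validate(data=tbl)
    .specially(
        expr=a_greater_than_b,
        # Tuple scheme: positions are (warning, error, critical); values below 1
        # are proportions of failing test units
        thresholds=(0.1, 0.25, 0.5),
        brief="Step {step}: custom `a > b` check (thresholds {thresholds}).",
    )
    .interrogate()
)
```
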
@@ -7796,6 +8158,7 @@ class Validate:
             inclusive = validation.inclusive
             na_pass = validation.na_pass
             threshold = validation.thresholds
+            segment = validation.segments
 
             assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
             assertion_category = METHOD_CATEGORY_MAP[assertion_method]
@@ -7803,7 +8166,14 @@
 
             # Process the `brief` text for the validation step by including template variables to
             # the user-supplied text
-            validation.brief = _process_brief(
+            validation.brief = _process_brief(
+                brief=validation.brief,
+                step=validation.i,
+                col=column,
+                values=value,
+                thresholds=threshold,
+                segment=segment,
+            )
 
             # Generate the autobrief description for the validation step; it's important to perform
             # that here since text components like the column and the value(s) have been resolved
@@ -8060,12 +8430,39 @@
                     tbl_type=tbl_type,
                 ).get_test_results()
 
-                if assertion_category
-
-
-
-
-
+                if assertion_category == "SPECIALLY":
+                    results_tbl_list = SpeciallyValidation(
+                        data_tbl=data_tbl_step,
+                        expression=value,
+                        threshold=threshold,
+                        tbl_type=tbl_type,
+                    ).get_test_results()
+
+                    #
+                    # The result from this could either be a table in the conventional form, or,
+                    # a list of boolean values; handle both cases
+                    #
+
+                    if isinstance(results_tbl_list, list):
+                        # If the result is a list of boolean values, then we need to convert it to a
+                        # set the validation results from the list
+                        validation.all_passed = all(results_tbl_list)
+                        validation.n = len(results_tbl_list)
+                        validation.n_passed = results_tbl_list.count(True)
+                        validation.n_failed = results_tbl_list.count(False)
+
+                        results_tbl = None
+
+                    else:
+                        # If the result is not a list, then we assume it's a table in the conventional
+                        # form (where the column is `pb_is_good_` exists, with boolean values)
+
+                        results_tbl = results_tbl_list
+
+                # If the results table is not `None`, then we assume there is a table with a column
+                # called `pb_is_good_` that contains boolean values; we can then use this table to
+                # determine the number of test units that passed and failed
+                if results_tbl is not None:
                     # Extract the `pb_is_good_` column from the table as a results list
                     if tbl_type in IBIS_BACKENDS:
                         # Select the DataFrame library to use for getting the results list
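
The branch above accepts either return shape from the user's callable. A hypothetical, standalone mirror of that counting logic (not part of the package's API) makes the two paths concrete:

```python
import polars as pl

def summarize_specially_result(result):
    # List of booleans: each element is one test unit, tallied directly
    if isinstance(result, list):
        return {"n": len(result), "n_passed": result.count(True), "n_failed": result.count(False)}
    # Otherwise assume a table whose final (boolean) column holds the results,
    # `pb_is_good_` in the conventional form
    bool_col = result.columns[-1]
    n_passed = int(result[bool_col].sum())
    return {"n": result.height, "n_passed": n_passed, "n_failed": result.height - n_passed}

print(summarize_specially_result([True, True, False]))
print(summarize_specially_result(pl.DataFrame({"pb_is_good_": [True, False, True]})))
```
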
@@ -9967,7 +10364,7 @@
                 # With a column subset list, format with commas between the column names
                 columns_upd.append(", ".join(column))
 
-            elif assertion_type[i] in ["conjointly"]:
+            elif assertion_type[i] in ["conjointly", "specially"]:
                 columns_upd.append("")
             else:
                 columns_upd.append(str(column))
@@ -10029,7 +10426,7 @@
             elif assertion_type[i] in ["col_schema_match"]:
                 values_upd.append("SCHEMA")
 
-            elif assertion_type[i] in ["col_vals_expr"]:
+            elif assertion_type[i] in ["col_vals_expr", "conjointly"]:
                 values_upd.append("COLUMN EXPR")
 
             elif assertion_type[i] in ["row_count_match", "col_count_match"]:
@@ -10041,8 +10438,8 @@
 
                 values_upd.append(str(count))
 
-            elif assertion_type[i] in ["
-                values_upd.append("
+            elif assertion_type[i] in ["specially"]:
+                values_upd.append("EXPR")
 
             # If the assertion type is not recognized, add the value as a string
             else:
@@ -11256,7 +11653,14 @@ def _string_date_dttm_conversion(value: any) -> any:
     return value
 
 
-def _process_brief(brief: str | None, step: int, col: str | list[str] | None) -> str:
+def _process_brief(
+    brief: str | None,
+    step: int,
+    col: str | list[str] | None,
+    values: any | None,
+    thresholds: any | None,
+    segment: any | None,
+) -> str:
     # If there is no brief, return `None`
     if brief is None:
         return None
@@ -11276,6 +11680,34 @@ def _process_brief(brief: str | None, step: int, col: str | list[str] | None) ->
         brief = brief.replace("{col}", col)
         brief = brief.replace("{column}", col)
 
+    if values is not None:
+        # If the value is a list, then join the values into a comma-separated string
+        if isinstance(values, list):
+            values = ", ".join([str(v) for v in values])
+
+        brief = brief.replace("{value}", str(values))
+
+    if thresholds is not None:
+        # Get the string representation of thresholds in the form of:
+        # "W: 0.20 / C: 0.40 / E: 1.00"
+
+        warning_val = thresholds._get_threshold_value(level="warning")
+        error_val = thresholds._get_threshold_value(level="error")
+        critical_val = thresholds._get_threshold_value(level="critical")
+
+        thresholds_fmt = f"W: {warning_val} / E: {error_val} / C: {critical_val}"
+
+        brief = brief.replace("{thresholds}", thresholds_fmt)
+
+    if segment is not None:
+        # The segment is always a tuple of the form ("{column}", "{value}")
+
+        segment_fmt = f"{segment[0]} / {segment[1]}"
+
+        brief = brief.replace("{segment}", segment_fmt)
+        brief = brief.replace("{segment_column}", segment[0])
+        brief = brief.replace("{segment_value}", segment[1])
+
     return brief
 
 
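
The expanded `_process_brief()` adds `{value}`, `{thresholds}`, `{segment}`, `{segment_column}`, and `{segment_value}` to the `{step}`/`{col}` placeholders that already existed. A small, hypothetical mirror of the substitution (the threshold numbers are made-up stand-ins for what `Thresholds._get_threshold_value()` would return):

```python
brief = "Step {step} on {col}: values {value} / thresholds {thresholds} / segment {segment}"

brief = brief.replace("{step}", "3").replace("{col}", "amount")
brief = brief.replace("{value}", ", ".join(str(v) for v in [10, 20]))
brief = brief.replace("{thresholds}", "W: 0.1 / E: 0.25 / C: 0.5")

# Segments arrive as ("column", "value") tuples, per the comment in the diff
segment = ("region", "North America")
brief = brief.replace("{segment}", f"{segment[0]} / {segment[1]}")

print(brief)
```
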
@@ -11470,6 +11902,9 @@ def _create_autobrief_or_failure_text(
     if assertion_type == "conjointly":
         return _create_text_conjointly(lang=lang, for_failure=for_failure)
 
+    if assertion_type == "specially":
+        return _create_text_specially(lang=lang, for_failure=for_failure)
+
     return None  # pragma: no cover
 
 
@@ -11668,6 +12103,12 @@ def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
     return EXPECT_FAIL_TEXT[f"conjointly_{type_}_text"][lang]
 
 
+def _create_text_specially(lang: str, for_failure: bool = False) -> str:
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    return EXPECT_FAIL_TEXT[f"specially_{type_}_text"][lang]
+
+
 def _prep_column_text(column: str | list[str]) -> str:
     if isinstance(column, list):
         return "`" + str(column[0]) + "`"
{pointblank-0.9.1.dist-info → pointblank-0.9.4.dist-info}/RECORD
CHANGED
@@ -1,31 +1,33 @@
 pointblank/__init__.py,sha256=uHrX-ARZOhvWogXXqKV65RO2DXdYLZNCD1oNcm8hE6o,1585
-pointblank/_constants.py,sha256=
+pointblank/_constants.py,sha256=D4HF0NrNAd-mdb88gZ6VatkRYfVX-9gC6C7TOQjjAw4,81128
 pointblank/_constants_docs.py,sha256=JBmtt16zTYQ-zaM4ElLExtKs-dKlnN553Ys2ML1Y1C8,2099
-pointblank/_constants_translations.py,sha256=
-pointblank/_interrogation.py,sha256=
-pointblank/_typing.py,sha256=
-pointblank/_utils.py,sha256=
+pointblank/_constants_translations.py,sha256=HXcCYmKoMjoaFv-Ym4UWv3AsIVXik2zDyAy7xvTvv0Y,186710
+pointblank/_interrogation.py,sha256=U4GQ8Ik5rP75BYBkmunBvHKwf3XvLPHcUx18JwiBQZI,89422
+pointblank/_typing.py,sha256=aItbCbzhbzqjK3lCbL27ltRyXoAH1c3-U6xQdRzg-lU,1594
+pointblank/_utils.py,sha256=CsuUYXNzox-Nc5CjQNhyy2XnmnvYJVJrS5cZxklzIFo,24745
 pointblank/_utils_check_args.py,sha256=rFEc1nbCN8ftsQQWVjCNWmQ2QmUDxkfgmoJclrZeTLs,5489
 pointblank/_utils_html.py,sha256=sTcmnBljkPjRZF1hbpoHl4HmnXOazsA91gC9iWVIrRk,2848
-pointblank/actions.py,sha256=
+pointblank/actions.py,sha256=ilk__kbQiS4ieJp-4dM7SDGuobQihUxLyS5ahgiP7qE,18272
 pointblank/assistant.py,sha256=ZIQJKTy9rDwq_Wmr1FMp0J7Q3ekxSgF3_tK0p4PTEUM,14850
 pointblank/column.py,sha256=LumGbnterw5VM7-2-7Za3jdlug1VVS9a3TOH0Y1E5eg,76548
 pointblank/datascan.py,sha256=rRz0hR81uTgd1e9OfLdfsNYXRk8vcpE8PW8exu-GJoE,47697
 pointblank/draft.py,sha256=cusr4fBiNncCKIOU8UwvJcvkBeBuUnqH_UfYp9dtNss,15777
 pointblank/schema.py,sha256=gzUCmtccO2v15MH2bo9uHUYjkKEEne1okQucxcH39pc,44291
 pointblank/tf.py,sha256=8o_8m4i01teulEe3-YYMotSNf3tImjBMInsvdjSAO5Q,8844
-pointblank/thresholds.py,sha256=
-pointblank/validate.py,sha256=
-pointblank/data/api-docs.txt,sha256=
+pointblank/thresholds.py,sha256=cweex25DwBPrsvPW12pRoaTQnwFpUUwqTdHyFJXTnN0,25760
+pointblank/validate.py,sha256=9dIWFetyBm70f_Ps0UkroT1gO4b5qACGs8trhObKUHg,608551
+pointblank/data/api-docs.txt,sha256=jKjPSq6X_vU_RRSJAydnVc3C35WvTqNvu-lLKroVO4I,482044
 pointblank/data/game_revenue-duckdb.zip,sha256=tKIVx48OGLYGsQPS3h5AjA2Nyq_rfEpLCjBiFUWhagU,35880
 pointblank/data/game_revenue.zip,sha256=7c9EvHLyi93CHUd4p3dM4CZ-GucFCtXKSPxgLojL32U,33749
+pointblank/data/global_sales-duckdb.zip,sha256=2ok_cvJ1ZuSkXnw0R6_OkKYRTWhJ-jJEMq2VYsv5fqY,1336390
+pointblank/data/global_sales.zip,sha256=JeUnR1apKQ35PPwEcvTKCEIEiYeYQtoGmYjmzbz99DM,2138604
 pointblank/data/nycflights-duckdb.zip,sha256=GQrHO9tp7d9cNGFNSbA9EKF19MLf6t2wZE0U9-hIKow,5293077
 pointblank/data/nycflights.zip,sha256=yVjbUaKUz2LydSdF9cABuir0VReHBBgV7shiNWSd0mU,7828965
 pointblank/data/polars-api-docs.txt,sha256=KGcS-BOtUs9zgpkWfXD-GFdFh4O_zjdkpX7msHjztLg,198045
 pointblank/data/small_table-duckdb.zip,sha256=BhTaZ2CRS4-9Z1uVhOU6HggvW3XCar7etMznfENIcOc,2028
 pointblank/data/small_table.zip,sha256=lmFb90Nb-v5X559Ikjg31YLAXuRyMkD9yLRElkXPMzQ,472
-pointblank-0.9.
-pointblank-0.9.
-pointblank-0.9.
-pointblank-0.9.
-pointblank-0.9.
+pointblank-0.9.4.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
+pointblank-0.9.4.dist-info/METADATA,sha256=TO7kSRz1e8_lhuqkF6st8ompJq-I0i5mevVfsCiHumU,14732
+pointblank-0.9.4.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+pointblank-0.9.4.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
+pointblank-0.9.4.dist-info/RECORD,,