pointblank 0.8.7__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +26 -10
- pointblank/_constants_translations.py +162 -0
- pointblank/_interrogation.py +117 -0
- pointblank/_typing.py +19 -3
- pointblank/_utils.py +1 -0
- pointblank/data/api-docs.txt +1022 -52
- pointblank/datascan.py +4 -4
- pointblank/draft.py +1 -1
- pointblank/thresholds.py +10 -0
- pointblank/validate.py +1462 -55
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/METADATA +6 -2
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/RECORD +15 -15
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/WHEEL +1 -1
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.8.7.dist-info → pointblank-0.9.1.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
|
@@ -7,6 +7,7 @@ import datetime
|
|
|
7
7
|
import inspect
|
|
8
8
|
import json
|
|
9
9
|
import re
|
|
10
|
+
import tempfile
|
|
10
11
|
import threading
|
|
11
12
|
from dataclasses import dataclass
|
|
12
13
|
from importlib.metadata import version
|
|
@@ -55,8 +56,10 @@ from pointblank._interrogation import (
|
|
|
55
56
|
ConjointlyValidation,
|
|
56
57
|
NumberOfTestUnits,
|
|
57
58
|
RowCountMatch,
|
|
59
|
+
RowsComplete,
|
|
58
60
|
RowsDistinct,
|
|
59
61
|
)
|
|
62
|
+
from pointblank._typing import SegmentSpec
|
|
60
63
|
from pointblank._utils import (
|
|
61
64
|
_check_any_df_lib,
|
|
62
65
|
_check_invalid_fields,
|
|
@@ -119,16 +122,18 @@ def _action_context_manager(metadata):
|
|
|
119
122
|
delattr(_action_context, "metadata")
|
|
120
123
|
|
|
121
124
|
|
|
122
|
-
def get_action_metadata():
|
|
125
|
+
def get_action_metadata() -> dict | None:
|
|
123
126
|
"""Access step-level metadata when authoring custom actions.
|
|
124
127
|
|
|
125
128
|
Get the metadata for the validation step where an action was triggered. This can be called by
|
|
126
|
-
user functions to get the metadata for the current action.
|
|
129
|
+
user functions to get the metadata for the current action. This function can only be used within
|
|
130
|
+
callables crafted for the [`Actions`](`pointblank.Actions`) class.
|
|
127
131
|
|
|
128
132
|
Returns
|
|
129
133
|
-------
|
|
130
|
-
dict
|
|
131
|
-
A dictionary containing the metadata for the current step.
|
|
134
|
+
dict | None
|
|
135
|
+
A dictionary containing the metadata for the current step. If called outside of an action
|
|
136
|
+
(i.e., when no action is being executed), this function will return `None`.
|
|
132
137
|
|
|
133
138
|
Description of the Metadata Fields
|
|
134
139
|
----------------------------------
|
|
@@ -163,7 +168,7 @@ def get_action_metadata():
|
|
|
163
168
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
164
169
|
actions=pb.Actions(warning=log_issue),
|
|
165
170
|
)
|
|
166
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
171
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
167
172
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
168
173
|
.col_vals_gt(
|
|
169
174
|
columns="session_duration",
|
|
@@ -181,6 +186,11 @@ def get_action_metadata():
|
|
|
181
186
|
- the `metadata` is a dictionary that is used to craft the log message
|
|
182
187
|
- the action is passed as a bare function to the `Actions` object within the `Validate` object
|
|
183
188
|
(placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
|
|
189
|
+
|
|
190
|
+
See Also
|
|
191
|
+
--------
|
|
192
|
+
Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
|
|
193
|
+
actions for validation steps that exceed a set threshold value.
|
|
184
194
|
"""
|
|
185
195
|
if hasattr(_action_context, "metadata"): # pragma: no cover
|
|
186
196
|
return _action_context.metadata # pragma: no cover
|
|
@@ -204,17 +214,19 @@ def _final_action_context_manager(summary):
|
|
|
204
214
|
delattr(_final_action_context, "summary")
|
|
205
215
|
|
|
206
216
|
|
|
207
|
-
def get_validation_summary():
|
|
217
|
+
def get_validation_summary() -> dict | None:
|
|
208
218
|
"""Access validation summary information when authoring final actions.
|
|
209
219
|
|
|
210
220
|
This function provides a convenient way to access summary information about the validation
|
|
211
221
|
process within a final action. It returns a dictionary with key metrics from the validation
|
|
212
|
-
process.
|
|
222
|
+
process. This function can only be used within callables crafted for the
|
|
223
|
+
[`FinalActions`](`pointblank.FinalActions`) class.
|
|
213
224
|
|
|
214
225
|
Returns
|
|
215
226
|
-------
|
|
216
227
|
dict | None
|
|
217
|
-
A dictionary containing validation metrics
|
|
228
|
+
A dictionary containing validation metrics. If called outside of a final action context,
|
|
229
|
+
this function will return `None`.
|
|
218
230
|
|
|
219
231
|
Description of the Summary Fields
|
|
220
232
|
--------------------------------
|
|
@@ -304,6 +316,11 @@ def get_validation_summary():
|
|
|
304
316
|
|
|
305
317
|
Final actions work well with both simple logging and more complex notification systems, allowing
|
|
306
318
|
you to integrate validation results into your broader data quality workflows.
|
|
319
|
+
|
|
320
|
+
See Also
|
|
321
|
+
--------
|
|
322
|
+
Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
|
|
323
|
+
custom actions that are executed after all validation steps have been completed.
|
|
307
324
|
"""
|
|
308
325
|
if hasattr(_final_action_context, "summary"):
|
|
309
326
|
return _final_action_context.summary
|
|
@@ -516,10 +533,10 @@ def load_dataset(
|
|
|
516
533
|
data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
|
|
517
534
|
|
|
518
535
|
# Unzip the DuckDB dataset to a temporary directory
|
|
519
|
-
with ZipFile(data_path, "r") as z:
|
|
520
|
-
z.extractall(path=
|
|
536
|
+
with tempfile.TemporaryDirectory() as tmp, ZipFile(data_path, "r") as z:
|
|
537
|
+
z.extractall(path=tmp)
|
|
521
538
|
|
|
522
|
-
data_path = f"
|
|
539
|
+
data_path = f"{tmp}/{dataset}.ddb"
|
|
523
540
|
|
|
524
541
|
dataset = ibis.connect(f"duckdb://{data_path}").table(dataset)
|
|
525
542
|
|
|
@@ -1783,14 +1800,15 @@ class _ValidationInfo:
|
|
|
1783
1800
|
assertion_type
|
|
1784
1801
|
The type of assertion. This is the method name of the validation (e.g., `"col_vals_gt"`).
|
|
1785
1802
|
column
|
|
1786
|
-
The column to validate.
|
|
1787
|
-
multiple columns).
|
|
1803
|
+
The column(s) to validate.
|
|
1788
1804
|
values
|
|
1789
1805
|
The value or values to compare against.
|
|
1790
1806
|
na_pass
|
|
1791
1807
|
Whether to pass test units that hold missing values.
|
|
1792
1808
|
pre
|
|
1793
1809
|
A preprocessing function or lambda to apply to the data table for the validation step.
|
|
1810
|
+
segments
|
|
1811
|
+
The segments to use for the validation step.
|
|
1794
1812
|
thresholds
|
|
1795
1813
|
The threshold values for the validation.
|
|
1796
1814
|
actions
|
|
@@ -1841,11 +1859,12 @@ class _ValidationInfo:
|
|
|
1841
1859
|
step_id: str | None = None
|
|
1842
1860
|
sha1: str | None = None
|
|
1843
1861
|
assertion_type: str | None = None
|
|
1844
|
-
column:
|
|
1862
|
+
column: any | None = None
|
|
1845
1863
|
values: any | list[any] | tuple | None = None
|
|
1846
1864
|
inclusive: tuple[bool, bool] | None = None
|
|
1847
1865
|
na_pass: bool | None = None
|
|
1848
1866
|
pre: Callable | None = None
|
|
1867
|
+
segments: any | None = None
|
|
1849
1868
|
thresholds: Thresholds | None = None
|
|
1850
1869
|
actions: Actions | None = None
|
|
1851
1870
|
label: str | None = None
|
|
@@ -1909,7 +1928,7 @@ class Validate:
|
|
|
1909
1928
|
The table to validate, which could be a DataFrame object or an Ibis table object. Read the
|
|
1910
1929
|
*Supported Input Table Types* section for details on the supported table types.
|
|
1911
1930
|
tbl_name
|
|
1912
|
-
|
|
1931
|
+
An optional name to assign to the input table object. If no value is provided, a name will
|
|
1913
1932
|
be generated based on whatever information is available. This table name will be displayed
|
|
1914
1933
|
in the header area of the tabular report.
|
|
1915
1934
|
label
|
|
@@ -2323,6 +2342,7 @@ class Validate:
|
|
|
2323
2342
|
value: float | int | Column,
|
|
2324
2343
|
na_pass: bool = False,
|
|
2325
2344
|
pre: Callable | None = None,
|
|
2345
|
+
segments: SegmentSpec | None = None,
|
|
2326
2346
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2327
2347
|
actions: Actions | None = None,
|
|
2328
2348
|
brief: str | bool | None = None,
|
|
@@ -2354,10 +2374,15 @@ class Validate:
|
|
|
2354
2374
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2355
2375
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2356
2376
|
pre
|
|
2357
|
-
|
|
2377
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2358
2378
|
interrogation. This function should take a table as input and return a modified table.
|
|
2359
2379
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2360
2380
|
argument.
|
|
2381
|
+
segments
|
|
2382
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2383
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2384
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2385
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2361
2386
|
thresholds
|
|
2362
2387
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2363
2388
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2420,6 +2445,42 @@ class Validate:
|
|
|
2420
2445
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2421
2446
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2422
2447
|
|
|
2448
|
+
Segmentation
|
|
2449
|
+
------------
|
|
2450
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2451
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2452
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
2453
|
+
column.
|
|
2454
|
+
|
|
2455
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2456
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2457
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2458
|
+
region.
|
|
2459
|
+
|
|
2460
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2461
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2462
|
+
segment on only specific dates, you can provide a tuple like
|
|
2463
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2464
|
+
(i.e., no validation steps will be created for them).
|
|
2465
|
+
|
|
2466
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2467
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2468
|
+
|
|
2469
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2470
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2471
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2472
|
+
columns
|
|
2473
|
+
|
|
2474
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2475
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2476
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2477
|
+
identify issues within specific segments.
|
|
2478
|
+
|
|
2479
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2480
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2481
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2482
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2483
|
+
|
|
2423
2484
|
Thresholds
|
|
2424
2485
|
----------
|
|
2425
2486
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2518,6 +2579,8 @@ class Validate:
|
|
|
2518
2579
|
_check_column(column=columns)
|
|
2519
2580
|
# _check_value_float_int(value=value)
|
|
2520
2581
|
_check_pre(pre=pre)
|
|
2582
|
+
# TODO: add check for segments
|
|
2583
|
+
# _check_segments(segments=segments)
|
|
2521
2584
|
_check_thresholds(thresholds=thresholds)
|
|
2522
2585
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
2523
2586
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -2550,6 +2613,7 @@ class Validate:
|
|
|
2550
2613
|
values=value,
|
|
2551
2614
|
na_pass=na_pass,
|
|
2552
2615
|
pre=pre,
|
|
2616
|
+
segments=segments,
|
|
2553
2617
|
thresholds=thresholds,
|
|
2554
2618
|
actions=actions,
|
|
2555
2619
|
brief=brief,
|
|
@@ -2566,6 +2630,7 @@ class Validate:
|
|
|
2566
2630
|
value: float | int | Column,
|
|
2567
2631
|
na_pass: bool = False,
|
|
2568
2632
|
pre: Callable | None = None,
|
|
2633
|
+
segments: SegmentSpec | None = None,
|
|
2569
2634
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2570
2635
|
actions: Actions | None = None,
|
|
2571
2636
|
brief: str | bool | None = None,
|
|
@@ -2597,10 +2662,15 @@ class Validate:
|
|
|
2597
2662
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2598
2663
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2599
2664
|
pre
|
|
2600
|
-
|
|
2665
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2601
2666
|
interrogation. This function should take a table as input and return a modified table.
|
|
2602
2667
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2603
2668
|
argument.
|
|
2669
|
+
segments
|
|
2670
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2671
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2672
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2673
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2604
2674
|
thresholds
|
|
2605
2675
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2606
2676
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2663,6 +2733,42 @@ class Validate:
|
|
|
2663
2733
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2664
2734
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2665
2735
|
|
|
2736
|
+
Segmentation
|
|
2737
|
+
------------
|
|
2738
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2739
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2740
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
2741
|
+
column.
|
|
2742
|
+
|
|
2743
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2744
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2745
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2746
|
+
region.
|
|
2747
|
+
|
|
2748
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2749
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2750
|
+
segment on only specific dates, you can provide a tuple like
|
|
2751
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2752
|
+
(i.e., no validation steps will be created for them).
|
|
2753
|
+
|
|
2754
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2755
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2756
|
+
|
|
2757
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2758
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2759
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2760
|
+
columns
|
|
2761
|
+
|
|
2762
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2763
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2764
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2765
|
+
identify issues within specific segments.
|
|
2766
|
+
|
|
2767
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2768
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2769
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2770
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2771
|
+
|
|
2666
2772
|
Thresholds
|
|
2667
2773
|
----------
|
|
2668
2774
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2760,6 +2866,8 @@ class Validate:
|
|
|
2760
2866
|
_check_column(column=columns)
|
|
2761
2867
|
# _check_value_float_int(value=value)
|
|
2762
2868
|
_check_pre(pre=pre)
|
|
2869
|
+
# TODO: add check for segments
|
|
2870
|
+
# _check_segments(segments=segments)
|
|
2763
2871
|
_check_thresholds(thresholds=thresholds)
|
|
2764
2872
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
2765
2873
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -2792,6 +2900,7 @@ class Validate:
|
|
|
2792
2900
|
values=value,
|
|
2793
2901
|
na_pass=na_pass,
|
|
2794
2902
|
pre=pre,
|
|
2903
|
+
segments=segments,
|
|
2795
2904
|
thresholds=thresholds,
|
|
2796
2905
|
actions=actions,
|
|
2797
2906
|
brief=brief,
|
|
@@ -2808,6 +2917,7 @@ class Validate:
|
|
|
2808
2917
|
value: float | int | Column,
|
|
2809
2918
|
na_pass: bool = False,
|
|
2810
2919
|
pre: Callable | None = None,
|
|
2920
|
+
segments: SegmentSpec | None = None,
|
|
2811
2921
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2812
2922
|
actions: Actions | None = None,
|
|
2813
2923
|
brief: str | bool | None = None,
|
|
@@ -2839,10 +2949,15 @@ class Validate:
|
|
|
2839
2949
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2840
2950
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2841
2951
|
pre
|
|
2842
|
-
|
|
2952
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2843
2953
|
interrogation. This function should take a table as input and return a modified table.
|
|
2844
2954
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2845
2955
|
argument.
|
|
2956
|
+
segments
|
|
2957
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2958
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2959
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2960
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2846
2961
|
thresholds
|
|
2847
2962
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2848
2963
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2905,6 +3020,42 @@ class Validate:
|
|
|
2905
3020
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2906
3021
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2907
3022
|
|
|
3023
|
+
Segmentation
|
|
3024
|
+
------------
|
|
3025
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3026
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3027
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3028
|
+
column.
|
|
3029
|
+
|
|
3030
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3031
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3032
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3033
|
+
region.
|
|
3034
|
+
|
|
3035
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3036
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3037
|
+
segment on only specific dates, you can provide a tuple like
|
|
3038
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3039
|
+
(i.e., no validation steps will be created for them).
|
|
3040
|
+
|
|
3041
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3042
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3043
|
+
|
|
3044
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3045
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3046
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3047
|
+
columns
|
|
3048
|
+
|
|
3049
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3050
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3051
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3052
|
+
identify issues within specific segments.
|
|
3053
|
+
|
|
3054
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3055
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3056
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3057
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3058
|
+
|
|
2908
3059
|
Thresholds
|
|
2909
3060
|
----------
|
|
2910
3061
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3001,6 +3152,8 @@ class Validate:
|
|
|
3001
3152
|
_check_column(column=columns)
|
|
3002
3153
|
# _check_value_float_int(value=value)
|
|
3003
3154
|
_check_pre(pre=pre)
|
|
3155
|
+
# TODO: add check for segments
|
|
3156
|
+
# _check_segments(segments=segments)
|
|
3004
3157
|
_check_thresholds(thresholds=thresholds)
|
|
3005
3158
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3006
3159
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3033,6 +3186,7 @@ class Validate:
|
|
|
3033
3186
|
values=value,
|
|
3034
3187
|
na_pass=na_pass,
|
|
3035
3188
|
pre=pre,
|
|
3189
|
+
segments=segments,
|
|
3036
3190
|
thresholds=thresholds,
|
|
3037
3191
|
actions=actions,
|
|
3038
3192
|
brief=brief,
|
|
@@ -3049,6 +3203,7 @@ class Validate:
|
|
|
3049
3203
|
value: float | int | Column,
|
|
3050
3204
|
na_pass: bool = False,
|
|
3051
3205
|
pre: Callable | None = None,
|
|
3206
|
+
segments: SegmentSpec | None = None,
|
|
3052
3207
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3053
3208
|
actions: Actions | None = None,
|
|
3054
3209
|
brief: str | bool | None = None,
|
|
@@ -3080,10 +3235,15 @@ class Validate:
|
|
|
3080
3235
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3081
3236
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3082
3237
|
pre
|
|
3083
|
-
|
|
3238
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3084
3239
|
interrogation. This function should take a table as input and return a modified table.
|
|
3085
3240
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3086
3241
|
argument.
|
|
3242
|
+
segments
|
|
3243
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3244
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3245
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3246
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3087
3247
|
thresholds
|
|
3088
3248
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3089
3249
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3146,6 +3306,42 @@ class Validate:
|
|
|
3146
3306
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3147
3307
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3148
3308
|
|
|
3309
|
+
Segmentation
|
|
3310
|
+
------------
|
|
3311
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3312
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3313
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3314
|
+
column.
|
|
3315
|
+
|
|
3316
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3317
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3318
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3319
|
+
region.
|
|
3320
|
+
|
|
3321
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3322
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3323
|
+
segment on only specific dates, you can provide a tuple like
|
|
3324
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3325
|
+
(i.e., no validation steps will be created for them).
|
|
3326
|
+
|
|
3327
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3328
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3329
|
+
|
|
3330
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3331
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3332
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3333
|
+
columns
|
|
3334
|
+
|
|
3335
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3336
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3337
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3338
|
+
identify issues within specific segments.
|
|
3339
|
+
|
|
3340
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3341
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3342
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3343
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3344
|
+
|
|
3149
3345
|
Thresholds
|
|
3150
3346
|
----------
|
|
3151
3347
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3240,6 +3436,8 @@ class Validate:
|
|
|
3240
3436
|
_check_column(column=columns)
|
|
3241
3437
|
# _check_value_float_int(value=value)
|
|
3242
3438
|
_check_pre(pre=pre)
|
|
3439
|
+
# TODO: add check for segments
|
|
3440
|
+
# _check_segments(segments=segments)
|
|
3243
3441
|
_check_thresholds(thresholds=thresholds)
|
|
3244
3442
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3245
3443
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3272,6 +3470,7 @@ class Validate:
|
|
|
3272
3470
|
values=value,
|
|
3273
3471
|
na_pass=na_pass,
|
|
3274
3472
|
pre=pre,
|
|
3473
|
+
segments=segments,
|
|
3275
3474
|
thresholds=thresholds,
|
|
3276
3475
|
actions=actions,
|
|
3277
3476
|
brief=brief,
|
|
@@ -3288,6 +3487,7 @@ class Validate:
|
|
|
3288
3487
|
value: float | int | Column,
|
|
3289
3488
|
na_pass: bool = False,
|
|
3290
3489
|
pre: Callable | None = None,
|
|
3490
|
+
segments: SegmentSpec | None = None,
|
|
3291
3491
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3292
3492
|
actions: Actions | None = None,
|
|
3293
3493
|
brief: str | bool | None = None,
|
|
@@ -3319,10 +3519,15 @@ class Validate:
|
|
|
3319
3519
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3320
3520
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3321
3521
|
pre
|
|
3322
|
-
|
|
3522
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3323
3523
|
interrogation. This function should take a table as input and return a modified table.
|
|
3324
3524
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3325
3525
|
argument.
|
|
3526
|
+
segments
|
|
3527
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3528
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3529
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3530
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3326
3531
|
thresholds
|
|
3327
3532
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3328
3533
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3385,6 +3590,42 @@ class Validate:
|
|
|
3385
3590
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3386
3591
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3387
3592
|
|
|
3593
|
+
Segmentation
|
|
3594
|
+
------------
|
|
3595
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3596
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3597
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3598
|
+
column.
|
|
3599
|
+
|
|
3600
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3601
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3602
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3603
|
+
region.
|
|
3604
|
+
|
|
3605
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3606
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3607
|
+
segment on only specific dates, you can provide a tuple like
|
|
3608
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3609
|
+
(i.e., no validation steps will be created for them).
|
|
3610
|
+
|
|
3611
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3612
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3613
|
+
|
|
3614
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3615
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3616
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3617
|
+
columns
|
|
3618
|
+
|
|
3619
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3620
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3621
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3622
|
+
identify issues within specific segments.
|
|
3623
|
+
|
|
3624
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3625
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3626
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3627
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3628
|
+
|
|
3388
3629
|
Thresholds
|
|
3389
3630
|
----------
|
|
3390
3631
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3483,6 +3724,8 @@ class Validate:
|
|
|
3483
3724
|
_check_column(column=columns)
|
|
3484
3725
|
# _check_value_float_int(value=value)
|
|
3485
3726
|
_check_pre(pre=pre)
|
|
3727
|
+
# TODO: add check for segments
|
|
3728
|
+
# _check_segments(segments=segments)
|
|
3486
3729
|
_check_thresholds(thresholds=thresholds)
|
|
3487
3730
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3488
3731
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3515,6 +3758,7 @@ class Validate:
|
|
|
3515
3758
|
values=value,
|
|
3516
3759
|
na_pass=na_pass,
|
|
3517
3760
|
pre=pre,
|
|
3761
|
+
segments=segments,
|
|
3518
3762
|
thresholds=thresholds,
|
|
3519
3763
|
actions=actions,
|
|
3520
3764
|
brief=brief,
|
|
@@ -3531,6 +3775,7 @@ class Validate:
|
|
|
3531
3775
|
value: float | int | Column,
|
|
3532
3776
|
na_pass: bool = False,
|
|
3533
3777
|
pre: Callable | None = None,
|
|
3778
|
+
segments: SegmentSpec | None = None,
|
|
3534
3779
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3535
3780
|
actions: Actions | None = None,
|
|
3536
3781
|
brief: str | bool | None = None,
|
|
@@ -3562,10 +3807,15 @@ class Validate:
|
|
|
3562
3807
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3563
3808
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3564
3809
|
pre
|
|
3565
|
-
|
|
3810
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3566
3811
|
interrogation. This function should take a table as input and return a modified table.
|
|
3567
3812
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3568
3813
|
argument.
|
|
3814
|
+
segments
|
|
3815
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3816
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3817
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3818
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3569
3819
|
thresholds
|
|
3570
3820
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3571
3821
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3628,6 +3878,42 @@ class Validate:
|
|
|
3628
3878
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3629
3879
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3630
3880
|
|
|
3881
|
+
Segmentation
|
|
3882
|
+
------------
|
|
3883
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3884
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3885
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3886
|
+
column.
|
|
3887
|
+
|
|
3888
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3889
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3890
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3891
|
+
region.
|
|
3892
|
+
|
|
3893
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3894
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3895
|
+
segment on only specific dates, you can provide a tuple like
|
|
3896
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3897
|
+
(i.e., no validation steps will be created for them).
|
|
3898
|
+
|
|
3899
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3900
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3901
|
+
|
|
3902
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3903
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3904
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3905
|
+
columns
|
|
3906
|
+
|
|
3907
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3908
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3909
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3910
|
+
identify issues within specific segments.
|
|
3911
|
+
|
|
3912
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3913
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3914
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3915
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3916
|
+
|
|
3631
3917
|
Thresholds
|
|
3632
3918
|
----------
|
|
3633
3919
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3726,6 +4012,8 @@ class Validate:
|
|
|
3726
4012
|
_check_column(column=columns)
|
|
3727
4013
|
# _check_value_float_int(value=value)
|
|
3728
4014
|
_check_pre(pre=pre)
|
|
4015
|
+
# TODO: add check for segments
|
|
4016
|
+
# _check_segments(segments=segments)
|
|
3729
4017
|
_check_thresholds(thresholds=thresholds)
|
|
3730
4018
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3731
4019
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3758,6 +4046,7 @@ class Validate:
|
|
|
3758
4046
|
values=value,
|
|
3759
4047
|
na_pass=na_pass,
|
|
3760
4048
|
pre=pre,
|
|
4049
|
+
segments=segments,
|
|
3761
4050
|
thresholds=thresholds,
|
|
3762
4051
|
actions=actions,
|
|
3763
4052
|
brief=brief,
|
|
@@ -3776,6 +4065,7 @@ class Validate:
|
|
|
3776
4065
|
inclusive: tuple[bool, bool] = (True, True),
|
|
3777
4066
|
na_pass: bool = False,
|
|
3778
4067
|
pre: Callable | None = None,
|
|
4068
|
+
segments: SegmentSpec | None = None,
|
|
3779
4069
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3780
4070
|
actions: Actions | None = None,
|
|
3781
4071
|
brief: str | bool | None = None,
|
|
@@ -3817,10 +4107,15 @@ class Validate:
|
|
|
3817
4107
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3818
4108
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3819
4109
|
pre
|
|
3820
|
-
|
|
4110
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3821
4111
|
interrogation. This function should take a table as input and return a modified table.
|
|
3822
4112
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3823
4113
|
argument.
|
|
4114
|
+
segments
|
|
4115
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4116
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4117
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4118
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3824
4119
|
thresholds
|
|
3825
4120
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3826
4121
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3885,6 +4180,42 @@ class Validate:
|
|
|
3885
4180
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3886
4181
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3887
4182
|
|
|
4183
|
+
Segmentation
|
|
4184
|
+
------------
|
|
4185
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4186
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4187
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4188
|
+
column.
|
|
4189
|
+
|
|
4190
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4191
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4192
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4193
|
+
region.
|
|
4194
|
+
|
|
4195
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4196
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4197
|
+
segment on only specific dates, you can provide a tuple like
|
|
4198
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4199
|
+
(i.e., no validation steps will be created for them).
|
|
4200
|
+
|
|
4201
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4202
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4203
|
+
|
|
4204
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4205
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4206
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4207
|
+
columns
|
|
4208
|
+
|
|
4209
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4210
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4211
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4212
|
+
identify issues within specific segments.
|
|
4213
|
+
|
|
4214
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4215
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4216
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4217
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4218
|
+
|
|
3888
4219
|
Thresholds
|
|
3889
4220
|
----------
|
|
3890
4221
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3992,6 +4323,8 @@ class Validate:
|
|
|
3992
4323
|
# _check_value_float_int(value=left)
|
|
3993
4324
|
# _check_value_float_int(value=right)
|
|
3994
4325
|
_check_pre(pre=pre)
|
|
4326
|
+
# TODO: add check for segments
|
|
4327
|
+
# _check_segments(segments=segments)
|
|
3995
4328
|
_check_thresholds(thresholds=thresholds)
|
|
3996
4329
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3997
4330
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -4029,6 +4362,7 @@ class Validate:
|
|
|
4029
4362
|
inclusive=inclusive,
|
|
4030
4363
|
na_pass=na_pass,
|
|
4031
4364
|
pre=pre,
|
|
4365
|
+
segments=segments,
|
|
4032
4366
|
thresholds=thresholds,
|
|
4033
4367
|
actions=actions,
|
|
4034
4368
|
brief=brief,
|
|
@@ -4047,6 +4381,7 @@ class Validate:
|
|
|
4047
4381
|
inclusive: tuple[bool, bool] = (True, True),
|
|
4048
4382
|
na_pass: bool = False,
|
|
4049
4383
|
pre: Callable | None = None,
|
|
4384
|
+
segments: SegmentSpec | None = None,
|
|
4050
4385
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4051
4386
|
actions: Actions | None = None,
|
|
4052
4387
|
brief: str | bool | None = None,
|
|
@@ -4088,10 +4423,15 @@ class Validate:
|
|
|
4088
4423
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
4089
4424
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
4090
4425
|
pre
|
|
4091
|
-
|
|
4426
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4092
4427
|
interrogation. This function should take a table as input and return a modified table.
|
|
4093
4428
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4094
4429
|
argument.
|
|
4430
|
+
segments
|
|
4431
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4432
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4433
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4434
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4095
4435
|
thresholds
|
|
4096
4436
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4097
4437
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4156,6 +4496,42 @@ class Validate:
|
|
|
4156
4496
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
4157
4497
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
4158
4498
|
|
|
4499
|
+
Segmentation
|
|
4500
|
+
------------
|
|
4501
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4502
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4503
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4504
|
+
column.
|
|
4505
|
+
|
|
4506
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4507
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4508
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4509
|
+
region.
|
|
4510
|
+
|
|
4511
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4512
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4513
|
+
segment on only specific dates, you can provide a tuple like
|
|
4514
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4515
|
+
(i.e., no validation steps will be created for them).
|
|
4516
|
+
|
|
4517
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4518
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4519
|
+
|
|
4520
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4521
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4522
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4523
|
+
columns
|
|
4524
|
+
|
|
4525
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4526
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4527
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4528
|
+
identify issues within specific segments.
|
|
4529
|
+
|
|
4530
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4531
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4532
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4533
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4534
|
+
|
|
4159
4535
|
Thresholds
|
|
4160
4536
|
----------
|
|
4161
4537
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4263,6 +4639,8 @@ class Validate:
|
|
|
4263
4639
|
# _check_value_float_int(value=left)
|
|
4264
4640
|
# _check_value_float_int(value=right)
|
|
4265
4641
|
_check_pre(pre=pre)
|
|
4642
|
+
# TODO: add check for segments
|
|
4643
|
+
# _check_segments(segments=segments)
|
|
4266
4644
|
_check_thresholds(thresholds=thresholds)
|
|
4267
4645
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
4268
4646
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -4300,6 +4678,7 @@ class Validate:
|
|
|
4300
4678
|
inclusive=inclusive,
|
|
4301
4679
|
na_pass=na_pass,
|
|
4302
4680
|
pre=pre,
|
|
4681
|
+
segments=segments,
|
|
4303
4682
|
thresholds=thresholds,
|
|
4304
4683
|
actions=actions,
|
|
4305
4684
|
brief=brief,
|
|
@@ -4315,6 +4694,7 @@ class Validate:
|
|
|
4315
4694
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4316
4695
|
set: Collection[Any],
|
|
4317
4696
|
pre: Callable | None = None,
|
|
4697
|
+
segments: SegmentSpec | None = None,
|
|
4318
4698
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4319
4699
|
actions: Actions | None = None,
|
|
4320
4700
|
brief: str | bool | None = None,
|
|
@@ -4338,10 +4718,15 @@ class Validate:
|
|
|
4338
4718
|
set
|
|
4339
4719
|
A list of values to compare against.
|
|
4340
4720
|
pre
|
|
4341
|
-
|
|
4721
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4342
4722
|
interrogation. This function should take a table as input and return a modified table.
|
|
4343
4723
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4344
4724
|
argument.
|
|
4725
|
+
segments
|
|
4726
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4727
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4728
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4729
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4345
4730
|
thresholds
|
|
4346
4731
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4347
4732
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4383,6 +4768,42 @@ class Validate:
|
|
|
4383
4768
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4384
4769
|
subsequent validation steps.
|
|
4385
4770
|
|
|
4771
|
+
Segmentation
|
|
4772
|
+
------------
|
|
4773
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4774
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4775
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4776
|
+
column.
|
|
4777
|
+
|
|
4778
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4779
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4780
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4781
|
+
region.
|
|
4782
|
+
|
|
4783
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4784
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4785
|
+
segment on only specific dates, you can provide a tuple like
|
|
4786
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4787
|
+
(i.e., no validation steps will be created for them).
|
|
4788
|
+
|
|
4789
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4790
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4791
|
+
|
|
4792
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4793
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4794
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4795
|
+
columns
|
|
4796
|
+
|
|
4797
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4798
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4799
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4800
|
+
identify issues within specific segments.
|
|
4801
|
+
|
|
4802
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4803
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4804
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4805
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4806
|
+
|
|
4386
4807
|
Thresholds
|
|
4387
4808
|
----------
|
|
4388
4809
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4481,6 +4902,8 @@ class Validate:
|
|
|
4481
4902
|
raise ValueError("`set=` must be a list of floats, integers, or strings.")
|
|
4482
4903
|
|
|
4483
4904
|
_check_pre(pre=pre)
|
|
4905
|
+
# TODO: add check for segments
|
|
4906
|
+
# _check_segments(segments=segments)
|
|
4484
4907
|
_check_thresholds(thresholds=thresholds)
|
|
4485
4908
|
_check_boolean_input(param=active, param_name="active")
|
|
4486
4909
|
|
|
@@ -4508,6 +4931,7 @@ class Validate:
|
|
|
4508
4931
|
column=column,
|
|
4509
4932
|
values=set,
|
|
4510
4933
|
pre=pre,
|
|
4934
|
+
segments=segments,
|
|
4511
4935
|
thresholds=thresholds,
|
|
4512
4936
|
actions=actions,
|
|
4513
4937
|
brief=brief,
|
|
@@ -4523,6 +4947,7 @@ class Validate:
|
|
|
4523
4947
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4524
4948
|
set: list[float | int],
|
|
4525
4949
|
pre: Callable | None = None,
|
|
4950
|
+
segments: SegmentSpec | None = None,
|
|
4526
4951
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4527
4952
|
actions: Actions | None = None,
|
|
4528
4953
|
brief: str | bool | None = None,
|
|
@@ -4546,10 +4971,15 @@ class Validate:
|
|
|
4546
4971
|
set
|
|
4547
4972
|
A list of values to compare against.
|
|
4548
4973
|
pre
|
|
4549
|
-
|
|
4974
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4550
4975
|
interrogation. This function should take a table as input and return a modified table.
|
|
4551
4976
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4552
4977
|
argument.
|
|
4978
|
+
segments
|
|
4979
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4980
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4981
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4982
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4553
4983
|
thresholds
|
|
4554
4984
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4555
4985
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4591,6 +5021,42 @@ class Validate:
|
|
|
4591
5021
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4592
5022
|
subsequent validation steps.
|
|
4593
5023
|
|
|
5024
|
+
Segmentation
|
|
5025
|
+
------------
|
|
5026
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5027
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5028
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5029
|
+
column.
|
|
5030
|
+
|
|
5031
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5032
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5033
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5034
|
+
region.
|
|
5035
|
+
|
|
5036
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5037
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5038
|
+
segment on only specific dates, you can provide a tuple like
|
|
5039
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5040
|
+
(i.e., no validation steps will be created for them).
|
|
5041
|
+
|
|
5042
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5043
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5044
|
+
|
|
5045
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5046
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5047
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5048
|
+
columns
|
|
5049
|
+
|
|
5050
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5051
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5052
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5053
|
+
identify issues within specific segments.
|
|
5054
|
+
|
|
5055
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5056
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5057
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5058
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5059
|
+
|
|
4594
5060
|
Thresholds
|
|
4595
5061
|
----------
|
|
4596
5062
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4684,6 +5150,8 @@ class Validate:
|
|
|
4684
5150
|
_check_column(column=columns)
|
|
4685
5151
|
_check_set_types(set=set)
|
|
4686
5152
|
_check_pre(pre=pre)
|
|
5153
|
+
# TODO: add check for segments
|
|
5154
|
+
# _check_segments(segments=segments)
|
|
4687
5155
|
_check_thresholds(thresholds=thresholds)
|
|
4688
5156
|
_check_boolean_input(param=active, param_name="active")
|
|
4689
5157
|
|
|
@@ -4711,6 +5179,7 @@ class Validate:
|
|
|
4711
5179
|
column=column,
|
|
4712
5180
|
values=set,
|
|
4713
5181
|
pre=pre,
|
|
5182
|
+
segments=segments,
|
|
4714
5183
|
thresholds=thresholds,
|
|
4715
5184
|
actions=actions,
|
|
4716
5185
|
brief=brief,
|
|
@@ -4725,6 +5194,7 @@ class Validate:
|
|
|
4725
5194
|
self,
|
|
4726
5195
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4727
5196
|
pre: Callable | None = None,
|
|
5197
|
+
segments: SegmentSpec | None = None,
|
|
4728
5198
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4729
5199
|
actions: Actions | None = None,
|
|
4730
5200
|
brief: str | bool | None = None,
|
|
@@ -4745,10 +5215,15 @@ class Validate:
|
|
|
4745
5215
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
4746
5216
|
generated for each column.
|
|
4747
5217
|
pre
|
|
4748
|
-
|
|
5218
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4749
5219
|
interrogation. This function should take a table as input and return a modified table.
|
|
4750
5220
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4751
5221
|
argument.
|
|
5222
|
+
segments
|
|
5223
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5224
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5225
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5226
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4752
5227
|
thresholds
|
|
4753
5228
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4754
5229
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4790,6 +5265,42 @@ class Validate:
|
|
|
4790
5265
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4791
5266
|
subsequent validation steps.
|
|
4792
5267
|
|
|
5268
|
+
Segmentation
|
|
5269
|
+
------------
|
|
5270
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5271
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5272
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5273
|
+
column.
|
|
5274
|
+
|
|
5275
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5276
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5277
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5278
|
+
region.
|
|
5279
|
+
|
|
5280
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5281
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5282
|
+
segment on only specific dates, you can provide a tuple like
|
|
5283
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5284
|
+
(i.e., no validation steps will be created for them).
|
|
5285
|
+
|
|
5286
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5287
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5288
|
+
|
|
5289
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5290
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5291
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5292
|
+
columns
|
|
5293
|
+
|
|
5294
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5295
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5296
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5297
|
+
identify issues within specific segments.
|
|
5298
|
+
|
|
5299
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5300
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5301
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5302
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5303
|
+
|
|
4793
5304
|
Thresholds
|
|
4794
5305
|
----------
|
|
4795
5306
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4879,6 +5390,8 @@ class Validate:
|
|
|
4879
5390
|
|
|
4880
5391
|
_check_column(column=columns)
|
|
4881
5392
|
_check_pre(pre=pre)
|
|
5393
|
+
# TODO: add check for segments
|
|
5394
|
+
# _check_segments(segments=segments)
|
|
4882
5395
|
_check_thresholds(thresholds=thresholds)
|
|
4883
5396
|
_check_boolean_input(param=active, param_name="active")
|
|
4884
5397
|
|
|
@@ -4905,6 +5418,7 @@ class Validate:
|
|
|
4905
5418
|
assertion_type=assertion_type,
|
|
4906
5419
|
column=column,
|
|
4907
5420
|
pre=pre,
|
|
5421
|
+
segments=segments,
|
|
4908
5422
|
thresholds=thresholds,
|
|
4909
5423
|
actions=actions,
|
|
4910
5424
|
brief=brief,
|
|
@@ -4919,6 +5433,7 @@ class Validate:
|
|
|
4919
5433
|
self,
|
|
4920
5434
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4921
5435
|
pre: Callable | None = None,
|
|
5436
|
+
segments: SegmentSpec | None = None,
|
|
4922
5437
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4923
5438
|
actions: Actions | None = None,
|
|
4924
5439
|
brief: str | bool | None = None,
|
|
@@ -4939,10 +5454,15 @@ class Validate:
|
|
|
4939
5454
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
4940
5455
|
generated for each column.
|
|
4941
5456
|
pre
|
|
4942
|
-
|
|
5457
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4943
5458
|
interrogation. This function should take a table as input and return a modified table.
|
|
4944
5459
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4945
5460
|
argument.
|
|
5461
|
+
segments
|
|
5462
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5463
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5464
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5465
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4946
5466
|
thresholds
|
|
4947
5467
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4948
5468
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4984,6 +5504,42 @@ class Validate:
|
|
|
4984
5504
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4985
5505
|
subsequent validation steps.
|
|
4986
5506
|
|
|
5507
|
+
Segmentation
|
|
5508
|
+
------------
|
|
5509
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5510
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5511
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5512
|
+
column.
|
|
5513
|
+
|
|
5514
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5515
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5516
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5517
|
+
region.
|
|
5518
|
+
|
|
5519
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5520
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5521
|
+
segment on only specific dates, you can provide a tuple like
|
|
5522
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5523
|
+
(i.e., no validation steps will be created for them).
|
|
5524
|
+
|
|
5525
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5526
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5527
|
+
|
|
5528
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5529
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5530
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5531
|
+
columns
|
|
5532
|
+
|
|
5533
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5534
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5535
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5536
|
+
identify issues within specific segments.
|
|
5537
|
+
|
|
5538
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5539
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5540
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5541
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5542
|
+
|
|
4987
5543
|
Thresholds
|
|
4988
5544
|
----------
|
|
4989
5545
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5073,6 +5629,8 @@ class Validate:
|
|
|
5073
5629
|
|
|
5074
5630
|
_check_column(column=columns)
|
|
5075
5631
|
_check_pre(pre=pre)
|
|
5632
|
+
# TODO: add check for segments
|
|
5633
|
+
# _check_segments(segments=segments)
|
|
5076
5634
|
_check_thresholds(thresholds=thresholds)
|
|
5077
5635
|
_check_boolean_input(param=active, param_name="active")
|
|
5078
5636
|
|
|
@@ -5099,6 +5657,7 @@ class Validate:
|
|
|
5099
5657
|
assertion_type=assertion_type,
|
|
5100
5658
|
column=column,
|
|
5101
5659
|
pre=pre,
|
|
5660
|
+
segments=segments,
|
|
5102
5661
|
thresholds=thresholds,
|
|
5103
5662
|
actions=actions,
|
|
5104
5663
|
brief=brief,
|
|
@@ -5115,6 +5674,7 @@ class Validate:
|
|
|
5115
5674
|
pattern: str,
|
|
5116
5675
|
na_pass: bool = False,
|
|
5117
5676
|
pre: Callable | None = None,
|
|
5677
|
+
segments: SegmentSpec | None = None,
|
|
5118
5678
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5119
5679
|
actions: Actions | None = None,
|
|
5120
5680
|
brief: str | bool | None = None,
|
|
@@ -5141,10 +5701,15 @@ class Validate:
|
|
|
5141
5701
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
5142
5702
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
5143
5703
|
pre
|
|
5144
|
-
|
|
5704
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5145
5705
|
interrogation. This function should take a table as input and return a modified table.
|
|
5146
5706
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5147
5707
|
argument.
|
|
5708
|
+
segments
|
|
5709
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5710
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5711
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5712
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5148
5713
|
thresholds
|
|
5149
5714
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5150
5715
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5186,6 +5751,42 @@ class Validate:
|
|
|
5186
5751
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
5187
5752
|
subsequent validation steps.
|
|
5188
5753
|
|
|
5754
|
+
Segmentation
|
|
5755
|
+
------------
|
|
5756
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5757
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5758
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5759
|
+
column.
|
|
5760
|
+
|
|
5761
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5762
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5763
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5764
|
+
region.
|
|
5765
|
+
|
|
5766
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5767
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5768
|
+
segment on only specific dates, you can provide a tuple like
|
|
5769
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5770
|
+
(i.e., no validation steps will be created for them).
|
|
5771
|
+
|
|
5772
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5773
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5774
|
+
|
|
5775
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5776
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5777
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5778
|
+
columns
|
|
5779
|
+
|
|
5780
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5781
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5782
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5783
|
+
identify issues within specific segments.
|
|
5784
|
+
|
|
5785
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5786
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5787
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5788
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5789
|
+
|
|
5189
5790
|
Thresholds
|
|
5190
5791
|
----------
|
|
5191
5792
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5277,6 +5878,8 @@ class Validate:
|
|
|
5277
5878
|
|
|
5278
5879
|
_check_column(column=columns)
|
|
5279
5880
|
_check_pre(pre=pre)
|
|
5881
|
+
# TODO: add check for segments
|
|
5882
|
+
# _check_segments(segments=segments)
|
|
5280
5883
|
_check_thresholds(thresholds=thresholds)
|
|
5281
5884
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
5282
5885
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -5306,6 +5909,7 @@ class Validate:
|
|
|
5306
5909
|
values=pattern,
|
|
5307
5910
|
na_pass=na_pass,
|
|
5308
5911
|
pre=pre,
|
|
5912
|
+
segments=segments,
|
|
5309
5913
|
thresholds=thresholds,
|
|
5310
5914
|
actions=actions,
|
|
5311
5915
|
brief=brief,
|
|
@@ -5320,6 +5924,7 @@ class Validate:
|
|
|
5320
5924
|
self,
|
|
5321
5925
|
expr: any,
|
|
5322
5926
|
pre: Callable | None = None,
|
|
5927
|
+
segments: SegmentSpec | None = None,
|
|
5323
5928
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5324
5929
|
actions: Actions | None = None,
|
|
5325
5930
|
brief: str | bool | None = None,
|
|
@@ -5341,10 +5946,15 @@ class Validate:
|
|
|
5341
5946
|
be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
|
|
5342
5947
|
should either be a lambda expression or a Narwhals column expression.
|
|
5343
5948
|
pre
|
|
5344
|
-
|
|
5949
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5345
5950
|
interrogation. This function should take a table as input and return a modified table.
|
|
5346
5951
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5347
5952
|
argument.
|
|
5953
|
+
segments
|
|
5954
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5955
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5956
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5957
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5348
5958
|
thresholds
|
|
5349
5959
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5350
5960
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5384,6 +5994,42 @@ class Validate:
|
|
|
5384
5994
|
transformed table, it only exists during the validation step and is not stored in the
|
|
5385
5995
|
`Validate` object or used in subsequent validation steps.
|
|
5386
5996
|
|
|
5997
|
+
Segmentation
|
|
5998
|
+
------------
|
|
5999
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
6000
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
6001
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
6002
|
+
column.
|
|
6003
|
+
|
|
6004
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
6005
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
6006
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
6007
|
+
region.
|
|
6008
|
+
|
|
6009
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
6010
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
6011
|
+
segment on only specific dates, you can provide a tuple like
|
|
6012
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
6013
|
+
(i.e., no validation steps will be created for them).
|
|
6014
|
+
|
|
6015
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6016
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
6017
|
+
|
|
6018
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
6019
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
6020
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
6021
|
+
columns
|
|
6022
|
+
|
|
6023
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6024
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
6025
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
6026
|
+
identify issues within specific segments.
|
|
6027
|
+
|
|
6028
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
6029
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
6030
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
6031
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
6032
|
+
|
|
5387
6033
|
Thresholds
|
|
5388
6034
|
----------
|
|
5389
6035
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5461,6 +6107,8 @@ class Validate:
|
|
|
5461
6107
|
# TODO: Add a check for the expression to ensure it's a valid expression object
|
|
5462
6108
|
# _check_expr(expr=expr)
|
|
5463
6109
|
_check_pre(pre=pre)
|
|
6110
|
+
# TODO: add check for segments
|
|
6111
|
+
# _check_segments(segments=segments)
|
|
5464
6112
|
_check_thresholds(thresholds=thresholds)
|
|
5465
6113
|
_check_boolean_input(param=active, param_name="active")
|
|
5466
6114
|
|
|
@@ -5477,6 +6125,7 @@ class Validate:
|
|
|
5477
6125
|
column=None,
|
|
5478
6126
|
values=expr,
|
|
5479
6127
|
pre=pre,
|
|
6128
|
+
segments=segments,
|
|
5480
6129
|
thresholds=thresholds,
|
|
5481
6130
|
actions=actions,
|
|
5482
6131
|
brief=brief,
|
|
@@ -5665,6 +6314,7 @@ class Validate:
|
|
|
5665
6314
|
self,
|
|
5666
6315
|
columns_subset: str | list[str] | None = None,
|
|
5667
6316
|
pre: Callable | None = None,
|
|
6317
|
+
segments: SegmentSpec | None = None,
|
|
5668
6318
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5669
6319
|
actions: Actions | None = None,
|
|
5670
6320
|
brief: str | bool | None = None,
|
|
@@ -5685,10 +6335,15 @@ class Validate:
|
|
|
5685
6335
|
columns are supplied, the distinct comparison will be made over the combination of
|
|
5686
6336
|
values in those columns.
|
|
5687
6337
|
pre
|
|
5688
|
-
|
|
6338
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5689
6339
|
interrogation. This function should take a table as input and return a modified table.
|
|
5690
6340
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5691
6341
|
argument.
|
|
6342
|
+
segments
|
|
6343
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
6344
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
6345
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
6346
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5692
6347
|
thresholds
|
|
5693
6348
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5694
6349
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5730,6 +6385,42 @@ class Validate:
|
|
|
5730
6385
|
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
5731
6386
|
or used in subsequent validation steps.
|
|
5732
6387
|
|
|
6388
|
+
Segmentation
|
|
6389
|
+
------------
|
|
6390
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
6391
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
6392
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
6393
|
+
column.
|
|
6394
|
+
|
|
6395
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
6396
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
6397
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
6398
|
+
region.
|
|
6399
|
+
|
|
6400
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
6401
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
6402
|
+
segment on only specific dates, you can provide a tuple like
|
|
6403
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
6404
|
+
(i.e., no validation steps will be created for them).
|
|
6405
|
+
|
|
6406
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6407
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
6408
|
+
|
|
6409
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
6410
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
6411
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
6412
|
+
columns
|
|
6413
|
+
|
|
6414
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6415
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
6416
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
6417
|
+
identify issues within specific segments.
|
|
6418
|
+
|
|
6419
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
6420
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
6421
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
6422
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
6423
|
+
|
|
5733
6424
|
Thresholds
|
|
5734
6425
|
----------
|
|
5735
6426
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5823,6 +6514,8 @@ class Validate:
|
|
|
5823
6514
|
assertion_type = _get_fn_name()
|
|
5824
6515
|
|
|
5825
6516
|
_check_pre(pre=pre)
|
|
6517
|
+
# TODO: add check for segments
|
|
6518
|
+
# _check_segments(segments=segments)
|
|
5826
6519
|
_check_thresholds(thresholds=thresholds)
|
|
5827
6520
|
_check_boolean_input(param=active, param_name="active")
|
|
5828
6521
|
|
|
@@ -5843,6 +6536,244 @@ class Validate:
|
|
|
5843
6536
|
assertion_type=assertion_type,
|
|
5844
6537
|
column=columns_subset,
|
|
5845
6538
|
pre=pre,
|
|
6539
|
+
segments=segments,
|
|
6540
|
+
thresholds=thresholds,
|
|
6541
|
+
actions=actions,
|
|
6542
|
+
brief=brief,
|
|
6543
|
+
active=active,
|
|
6544
|
+
)
|
|
6545
|
+
|
|
6546
|
+
self._add_validation(validation_info=val_info)
|
|
6547
|
+
|
|
6548
|
+
return self
|
|
6549
|
+
|
|
6550
|
+
def rows_complete(
|
|
6551
|
+
self,
|
|
6552
|
+
columns_subset: str | list[str] | None = None,
|
|
6553
|
+
pre: Callable | None = None,
|
|
6554
|
+
segments: SegmentSpec | None = None,
|
|
6555
|
+
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
6556
|
+
actions: Actions | None = None,
|
|
6557
|
+
brief: str | bool | None = None,
|
|
6558
|
+
active: bool = True,
|
|
6559
|
+
) -> Validate:
|
|
6560
|
+
"""
|
|
6561
|
+
Validate whether row data are complete by having no missing values.
|
|
6562
|
+
|
|
6563
|
+
The `rows_complete()` method checks whether rows in the table are complete. Completeness
|
|
6564
|
+
of a row means that there are no missing values within the row. This validation will operate
|
|
6565
|
+
over the number of test units that is equal to the number of rows in the table (determined
|
|
6566
|
+
after any `pre=` mutation has been applied). A subset of columns can be specified for the
|
|
6567
|
+
completeness check. If no subset is provided, all columns in the table will be used.
|
|
6568
|
+
|
|
6569
|
+
Parameters
|
|
6570
|
+
----------
|
|
6571
|
+
columns_subset
|
|
6572
|
+
A single column or a list of columns to use as a subset for the completeness check. If
|
|
6573
|
+
`None` (the default), then all columns in the table will be used.
|
|
6574
|
+
pre
|
|
6575
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6576
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
6577
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6578
|
+
argument.
|
|
6579
|
+
segments
|
|
6580
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
6581
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
6582
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
6583
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
6584
|
+
thresholds
|
|
6585
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
6586
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
6587
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
6588
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
6589
|
+
section for information on how to set threshold levels.
|
|
6590
|
+
actions
|
|
6591
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6592
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6593
|
+
define the actions.
|
|
6594
|
+
brief
|
|
6595
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6596
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6597
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6598
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6599
|
+
won't be a brief.
|
|
6600
|
+
active
|
|
6601
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6602
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6603
|
+
for the steps unchanged).
|
|
6604
|
+
|
|
6605
|
+
Returns
|
|
6606
|
+
-------
|
|
6607
|
+
Validate
|
|
6608
|
+
The `Validate` object with the added validation step.
|
|
6609
|
+
|
|
6610
|
+
Preprocessing
|
|
6611
|
+
-------------
|
|
6612
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
6613
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
6614
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
6615
|
+
before the validation step is applied.
|
|
6616
|
+
|
|
6617
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
6618
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
6619
|
+
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
6620
|
+
columns via `columns_subset=` that are expected to be present in the transformed table, but
|
|
6621
|
+
may not exist in the table before preprocessing. Regarding the lifetime of the transformed
|
|
6622
|
+
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
6623
|
+
or used in subsequent validation steps.
|
|
6624
|
+
|
|
6625
|
+
Segmentation
|
|
6626
|
+
------------
|
|
6627
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
6628
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
6629
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
6630
|
+
column.
|
|
6631
|
+
|
|
6632
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
6633
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
6634
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
6635
|
+
region.
|
|
6636
|
+
|
|
6637
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
6638
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
6639
|
+
segment on only specific dates, you can provide a tuple like
|
|
6640
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
6641
|
+
(i.e., no validation steps will be created for them).
|
|
6642
|
+
|
|
6643
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6644
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
6645
|
+
|
|
6646
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
6647
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
6648
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
6649
|
+
columns
|
|
6650
|
+
|
|
6651
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6652
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
6653
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
6654
|
+
identify issues within specific segments.
|
|
6655
|
+
|
|
6656
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
6657
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
6658
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
6659
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
6660
|
+
|
|
6661
|
+
Thresholds
|
|
6662
|
+
----------
|
|
6663
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6664
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6665
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6666
|
+
|
|
6667
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
6668
|
+
can either be set as a proportion failing of all test units (a value between `0` and `1`),
|
|
6669
|
+
or the absolute number of failing test units (as an integer that's `1` or greater).
|
|
6670
|
+
|
|
6671
|
+
Thresholds can be defined using one of these input schemes:
|
|
6672
|
+
|
|
6673
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6674
|
+
thresholds)
|
|
6675
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6676
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6677
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
6678
|
+
'critical'
|
|
6679
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6680
|
+
for the 'warning' level only
|
|
6681
|
+
|
|
6682
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
6683
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
6684
|
+
set; you're free to set any combination of them.
|
|
6685
|
+
|
|
6686
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
6687
|
+
take for each level of failure (using the `actions=` parameter).
|
|
6688
|
+
|
|
6689
|
+
Examples
|
|
6690
|
+
--------
|
|
6691
|
+
```{python}
|
|
6692
|
+
#| echo: false
|
|
6693
|
+
#| output: false
|
|
6694
|
+
import pointblank as pb
|
|
6695
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
6696
|
+
```
|
|
6697
|
+
For the examples here, we'll use a simple Polars DataFrame with three string columns
|
|
6698
|
+
(`col_1`, `col_2`, and `col_3`). The table is shown below:
|
|
6699
|
+
|
|
6700
|
+
```{python}
|
|
6701
|
+
import pointblank as pb
|
|
6702
|
+
import polars as pl
|
|
6703
|
+
|
|
6704
|
+
tbl = pl.DataFrame(
|
|
6705
|
+
{
|
|
6706
|
+
"col_1": ["a", None, "c", "d"],
|
|
6707
|
+
"col_2": ["a", "a", "c", None],
|
|
6708
|
+
"col_3": ["a", "a", "d", None],
|
|
6709
|
+
}
|
|
6710
|
+
)
|
|
6711
|
+
|
|
6712
|
+
pb.preview(tbl)
|
|
6713
|
+
```
|
|
6714
|
+
|
|
6715
|
+
Let's validate that the rows in the table are complete with `rows_complete()`. We'll
|
|
6716
|
+
determine if this validation had any failing test units (there are four test units, one for
|
|
6717
|
+
each row). A failing test unit means that a given row is not complete (i.e., has at least
|
|
6718
|
+
one missing value).
|
|
6719
|
+
|
|
6720
|
+
```{python}
|
|
6721
|
+
validation = (
|
|
6722
|
+
pb.Validate(data=tbl)
|
|
6723
|
+
.rows_complete()
|
|
6724
|
+
.interrogate()
|
|
6725
|
+
)
|
|
6726
|
+
|
|
6727
|
+
validation
|
|
6728
|
+
```
|
|
6729
|
+
|
|
6730
|
+
From this validation table we see that there are two failing test units. This is because
|
|
6731
|
+
two rows in the table have at least one missing value (the second row and the last row).
|
|
6732
|
+
|
|
6733
|
+
We can also use a subset of columns to determine completeness. Let's specify the subset
|
|
6734
|
+
using columns `col_2` and `col_3` for the next validation.
|
|
6735
|
+
|
|
6736
|
+
```{python}
|
|
6737
|
+
validation = (
|
|
6738
|
+
pb.Validate(data=tbl)
|
|
6739
|
+
.rows_complete(columns_subset=["col_2", "col_3"])
|
|
6740
|
+
.interrogate()
|
|
6741
|
+
)
|
|
6742
|
+
|
|
6743
|
+
validation
|
|
6744
|
+
```
|
|
6745
|
+
|
|
6746
|
+
The validation table reports a single failing test unit. The last row contains missing
|
|
6747
|
+
values in both the `col_2` and `col_3` columns.
|
|
6748
|
+
others.
|
|
6749
|
+
"""
|
|
6750
|
+
|
|
6751
|
+
assertion_type = _get_fn_name()
|
|
6752
|
+
|
|
6753
|
+
_check_pre(pre=pre)
|
|
6754
|
+
# TODO: add check for segments
|
|
6755
|
+
# _check_segments(segments=segments)
|
|
6756
|
+
_check_thresholds(thresholds=thresholds)
|
|
6757
|
+
_check_boolean_input(param=active, param_name="active")
|
|
6758
|
+
|
|
6759
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
6760
|
+
thresholds = (
|
|
6761
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
6762
|
+
)
|
|
6763
|
+
|
|
6764
|
+
if columns_subset is not None and isinstance(columns_subset, str):
|
|
6765
|
+
columns_subset = [columns_subset]
|
|
6766
|
+
|
|
6767
|
+
# TODO: incorporate Column object
|
|
6768
|
+
|
|
6769
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
6770
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
6771
|
+
|
|
6772
|
+
val_info = _ValidationInfo(
|
|
6773
|
+
assertion_type=assertion_type,
|
|
6774
|
+
column=columns_subset,
|
|
6775
|
+
pre=pre,
|
|
6776
|
+
segments=segments,
|
|
5846
6777
|
thresholds=thresholds,
|
|
5847
6778
|
actions=actions,
|
|
5848
6779
|
brief=brief,
|
|
@@ -5903,7 +6834,7 @@ class Validate:
|
|
|
5903
6834
|
substring matches are allowed, so a schema data type of `Int` would match a target table
|
|
5904
6835
|
data type of `Int64`.
|
|
5905
6836
|
pre
|
|
5906
|
-
|
|
6837
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5907
6838
|
interrogation. This function should take a table as input and return a modified table.
|
|
5908
6839
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5909
6840
|
argument.
|
|
@@ -6116,7 +7047,7 @@ class Validate:
|
|
|
6116
7047
|
Should the validation step be inverted? If `True`, then the expectation is that the row
|
|
6117
7048
|
count of the target table should not match the specified `count=` value.
|
|
6118
7049
|
pre
|
|
6119
|
-
|
|
7050
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6120
7051
|
interrogation. This function should take a table as input and return a modified table.
|
|
6121
7052
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6122
7053
|
argument.
|
|
@@ -6326,7 +7257,7 @@ class Validate:
|
|
|
6326
7257
|
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
6327
7258
|
column count of the target table should not match the specified `count=` value.
|
|
6328
7259
|
pre
|
|
6329
|
-
|
|
7260
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6330
7261
|
interrogation. This function should take a table as input and return a modified table.
|
|
6331
7262
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6332
7263
|
argument.
|
|
@@ -6844,10 +7775,14 @@ class Validate:
|
|
|
6844
7775
|
|
|
6845
7776
|
self.time_start = datetime.datetime.now(datetime.timezone.utc)
|
|
6846
7777
|
|
|
6847
|
-
# Expand `validation_info` by evaluating any column expressions in `
|
|
7778
|
+
# Expand `validation_info` by evaluating any column expressions in `columns=`
|
|
6848
7779
|
# (the `_evaluate_column_exprs()` method will eval and expand as needed)
|
|
6849
7780
|
self._evaluate_column_exprs(validation_info=self.validation_info)
|
|
6850
7781
|
|
|
7782
|
+
# Expand `validation_info` by evaluating for any segmentation directives
|
|
7783
|
+
# provided in `segments=` (the `_evaluate_segments()` method will eval and expand as needed)
|
|
7784
|
+
self._evaluate_segments(validation_info=self.validation_info)
|
|
7785
|
+
|
|
6851
7786
|
for validation in self.validation_info:
|
|
6852
7787
|
# Set the `i` value for the validation step (this is 1-indexed)
|
|
6853
7788
|
index_value = self.validation_info.index(validation) + 1
|
|
@@ -6883,6 +7818,10 @@ class Validate:
|
|
|
6883
7818
|
|
|
6884
7819
|
validation.autobrief = autobrief
|
|
6885
7820
|
|
|
7821
|
+
# ------------------------------------------------
|
|
7822
|
+
# Bypassing the validation step if conditions met
|
|
7823
|
+
# ------------------------------------------------
|
|
7824
|
+
|
|
6886
7825
|
# Skip the validation step if it is not active but still record the time of processing
|
|
6887
7826
|
if not validation.active:
|
|
6888
7827
|
end_time = datetime.datetime.now(datetime.timezone.utc)
|
|
@@ -6939,6 +7878,17 @@ class Validate:
|
|
|
6939
7878
|
elif isinstance(validation.pre, Callable):
|
|
6940
7879
|
data_tbl_step = validation.pre(data_tbl_step)
|
|
6941
7880
|
|
|
7881
|
+
# ------------------------------------------------
|
|
7882
|
+
# Segmentation stage
|
|
7883
|
+
# ------------------------------------------------
|
|
7884
|
+
|
|
7885
|
+
# Determine whether any segmentation directives are to be applied to the table
|
|
7886
|
+
|
|
7887
|
+
if validation.segments is not None:
|
|
7888
|
+
data_tbl_step = _apply_segments(
|
|
7889
|
+
data_tbl=data_tbl_step, segments_expr=validation.segments
|
|
7890
|
+
)
|
|
7891
|
+
|
|
6942
7892
|
validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
|
|
6943
7893
|
tbl_type=tbl_type
|
|
6944
7894
|
)
|
|
@@ -7012,6 +7962,14 @@ class Validate:
|
|
|
7012
7962
|
tbl_type=tbl_type,
|
|
7013
7963
|
).get_test_results()
|
|
7014
7964
|
|
|
7965
|
+
if assertion_category == "ROWS_COMPLETE":
|
|
7966
|
+
results_tbl = RowsComplete(
|
|
7967
|
+
data_tbl=data_tbl_step,
|
|
7968
|
+
columns_subset=column,
|
|
7969
|
+
threshold=threshold,
|
|
7970
|
+
tbl_type=tbl_type,
|
|
7971
|
+
).get_test_results()
|
|
7972
|
+
|
|
7015
7973
|
if assertion_category == "COL_EXISTS_HAS_TYPE":
|
|
7016
7974
|
result_bool = ColExistsHasType(
|
|
7017
7975
|
data_tbl=data_tbl_step,
|
|
@@ -7282,7 +8240,8 @@ class Validate:
|
|
|
7282
8240
|
# TODO: Add support for extraction of rows for Ibis backends
|
|
7283
8241
|
if (
|
|
7284
8242
|
collect_extracts
|
|
7285
|
-
and assertion_type
|
|
8243
|
+
and assertion_type
|
|
8244
|
+
in ROW_BASED_VALIDATION_TYPES + ["rows_distinct", "rows_complete"]
|
|
7286
8245
|
and tbl_type not in IBIS_BACKENDS
|
|
7287
8246
|
):
|
|
7288
8247
|
# Add row numbers to the results table
|
|
@@ -8364,19 +9323,134 @@ class Validate:
|
|
|
8364
9323
|
"""
|
|
8365
9324
|
Get a report of the validation results as a JSON-formatted string.
|
|
8366
9325
|
|
|
9326
|
+
The `get_json_report()` method provides a machine-readable report of validation results in
|
|
9327
|
+
JSON format. This is particularly useful for programmatic processing, storing validation
|
|
9328
|
+
results, or integrating with other systems. The report includes detailed information about
|
|
9329
|
+
each validation step, such as assertion type, columns validated, threshold values, test
|
|
9330
|
+
results, and more.
|
|
9331
|
+
|
|
9332
|
+
By default, all available validation information fields are included in the report. However,
|
|
9333
|
+
you can customize the fields to include or exclude using the `use_fields=` and
|
|
9334
|
+
`exclude_fields=` parameters.
|
|
9335
|
+
|
|
8367
9336
|
Parameters
|
|
8368
9337
|
----------
|
|
8369
9338
|
use_fields
|
|
8370
|
-
|
|
9339
|
+
An optional list of specific fields to include in the report. If provided, only these
|
|
9340
|
+
fields will be included in the JSON output. If `None` (the default), all standard
|
|
9341
|
+
validation report fields are included. Have a look at the *Available Report Fields*
|
|
9342
|
+
section below for a list of fields that can be included in the report.
|
|
8371
9343
|
exclude_fields
|
|
8372
|
-
|
|
9344
|
+
An optional list of fields to exclude from the report. If provided, these fields will
|
|
9345
|
+
be omitted from the JSON output. If `None` (the default), no fields are excluded.
|
|
9346
|
+
This parameter cannot be used together with `use_fields=`. The *Available Report Fields*
|
|
9347
|
+
section provides a listing of fields that can be excluded from the report.
|
|
8373
9348
|
|
|
8374
9349
|
Returns
|
|
8375
9350
|
-------
|
|
8376
9351
|
str
|
|
8377
|
-
A JSON-formatted string representing the validation report
|
|
8378
|
-
|
|
9352
|
+
A JSON-formatted string representing the validation report, with each validation step
|
|
9353
|
+
as an object in the report array.
|
|
9354
|
+
|
|
9355
|
+
Available Report Fields
|
|
9356
|
+
-----------------------
|
|
9357
|
+
The JSON report can include any of the standard validation report fields, including:
|
|
9358
|
+
|
|
9359
|
+
- `i`: the step number (1-indexed)
|
|
9360
|
+
- `i_o`: the original step index from the validation plan (pre-expansion)
|
|
9361
|
+
- `assertion_type`: the type of validation assertion (e.g., `"col_vals_gt"`, etc.)
|
|
9362
|
+
- `column`: the column being validated (or columns used in certain validations)
|
|
9363
|
+
- `values`: the comparison values or parameters used in the validation
|
|
9364
|
+
- `inclusive`: whether the comparison is inclusive (for range-based validations)
|
|
9365
|
+
- `na_pass`: whether `NA`/`Null` values are considered passing (for certain validations)
|
|
9366
|
+
- `pre`: preprocessing function applied before validation
|
|
9367
|
+
- `segments`: data segments to which the validation was applied
|
|
9368
|
+
- `thresholds`: threshold level statement that was used for the validation step
|
|
9369
|
+
- `label`: custom label for the validation step
|
|
9370
|
+
- `brief`: a brief description of the validation step
|
|
9371
|
+
- `active`: whether the validation step is active
|
|
9372
|
+
- `all_passed`: whether all test units passed in the step
|
|
9373
|
+
- `n`: total number of test units
|
|
9374
|
+
- `n_passed`, `n_failed`: number of test units that passed and failed
|
|
9375
|
+
- `f_passed`, `f_failed`: fraction of test units that passed and failed
|
|
9376
|
+
- `warning`, `error`, `critical`: whether the namesake threshold level was exceeded (is
|
|
9377
|
+
`null` if threshold not set)
|
|
9378
|
+
- `time_processed`: when the validation step was processed (ISO 8601 format)
|
|
9379
|
+
- `proc_duration_s`: the processing duration in seconds
|
|
9380
|
+
|
|
9381
|
+
Examples
|
|
9382
|
+
--------
|
|
9383
|
+
Let's create a validation plan with a few validation steps and generate a JSON report of the
|
|
9384
|
+
results:
|
|
9385
|
+
|
|
9386
|
+
```{python}
|
|
9387
|
+
import pointblank as pb
|
|
9388
|
+
import polars as pl
|
|
9389
|
+
|
|
9390
|
+
# Create a sample DataFrame
|
|
9391
|
+
tbl = pl.DataFrame({
|
|
9392
|
+
"a": [5, 7, 8, 9],
|
|
9393
|
+
"b": [3, 4, 2, 1]
|
|
9394
|
+
})
|
|
8379
9395
|
|
|
9396
|
+
# Create and execute a validation plan
|
|
9397
|
+
validation = (
|
|
9398
|
+
pb.Validate(data=tbl)
|
|
9399
|
+
.col_vals_gt(columns="a", value=6)
|
|
9400
|
+
.col_vals_lt(columns="b", value=4)
|
|
9401
|
+
.interrogate()
|
|
9402
|
+
)
|
|
9403
|
+
|
|
9404
|
+
# Get the full JSON report
|
|
9405
|
+
json_report = validation.get_json_report()
|
|
9406
|
+
|
|
9407
|
+
print(json_report)
|
|
9408
|
+
```
|
|
9409
|
+
|
|
9410
|
+
You can also customize which fields to include:
|
|
9411
|
+
|
|
9412
|
+
```{python}
|
|
9413
|
+
json_report = validation.get_json_report(
|
|
9414
|
+
use_fields=["i", "assertion_type", "column", "n_passed", "n_failed"]
|
|
9415
|
+
)
|
|
9416
|
+
|
|
9417
|
+
print(json_report)
|
|
9418
|
+
```
|
|
9419
|
+
|
|
9420
|
+
Or which fields to exclude:
|
|
9421
|
+
|
|
9422
|
+
```{python}
|
|
9423
|
+
json_report = validation.get_json_report(
|
|
9424
|
+
exclude_fields=[
|
|
9425
|
+
"i_o", "thresholds", "pre", "segments", "values",
|
|
9426
|
+
"na_pass", "inclusive", "label", "brief", "active",
|
|
9427
|
+
"time_processed", "proc_duration_s"
|
|
9428
|
+
]
|
|
9429
|
+
)
|
|
9430
|
+
|
|
9431
|
+
print(json_report)
|
|
9432
|
+
```
|
|
9433
|
+
|
|
9434
|
+
The JSON output can be further processed or analyzed programmatically:
|
|
9435
|
+
|
|
9436
|
+
```{python}
|
|
9437
|
+
import json
|
|
9438
|
+
|
|
9439
|
+
# Parse the JSON report
|
|
9440
|
+
report_data = json.loads(validation.get_json_report())
|
|
9441
|
+
|
|
9442
|
+
# Extract and analyze validation results
|
|
9443
|
+
failing_steps = [step for step in report_data if step["n_failed"] > 0]
|
|
9444
|
+
print(f"Number of failing validation steps: {len(failing_steps)}")
|
|
9445
|
+
```
|
|
9446
|
+
|
|
9447
|
+
See Also
|
|
9448
|
+
--------
|
|
9449
|
+
- [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`): Get a formatted HTML
|
|
9450
|
+
report as a GT table
|
|
9451
|
+
- [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`): Get rows that
|
|
9452
|
+
failed validation
|
|
9453
|
+
"""
|
|
8380
9454
|
if use_fields is not None and exclude_fields is not None:
|
|
8381
9455
|
raise ValueError("Cannot specify both `use_fields=` and `exclude_fields=`.")
|
|
8382
9456
|
|
|
@@ -8840,6 +9914,13 @@ class Validate:
|
|
|
8840
9914
|
# will be made blank if the validation has not been performed
|
|
8841
9915
|
interrogation_performed = validation_info_dict.get("proc_duration_s", [None])[0] is not None
|
|
8842
9916
|
|
|
9917
|
+
# Determine which steps are those using segmented data
|
|
9918
|
+
segmented_steps = [
|
|
9919
|
+
i + 1
|
|
9920
|
+
for i, segment in enumerate(validation_info_dict["segments"])
|
|
9921
|
+
if segment is not None
|
|
9922
|
+
]
|
|
9923
|
+
|
|
8843
9924
|
# ------------------------------------------------
|
|
8844
9925
|
# Process the `type_upd` entry
|
|
8845
9926
|
# ------------------------------------------------
|
|
@@ -8849,6 +9930,7 @@ class Validate:
|
|
|
8849
9930
|
assertion_str=validation_info_dict["assertion_type"],
|
|
8850
9931
|
brief_str=validation_info_dict["brief"],
|
|
8851
9932
|
autobrief_str=validation_info_dict["autobrief"],
|
|
9933
|
+
segmentation_str=validation_info_dict["segments"],
|
|
8852
9934
|
lang=lang,
|
|
8853
9935
|
)
|
|
8854
9936
|
|
|
@@ -8877,7 +9959,7 @@ class Validate:
|
|
|
8877
9959
|
"col_vals_expr",
|
|
8878
9960
|
]:
|
|
8879
9961
|
columns_upd.append("—")
|
|
8880
|
-
elif assertion_type[i] in ["rows_distinct"]:
|
|
9962
|
+
elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
|
|
8881
9963
|
if not column:
|
|
8882
9964
|
# If there is no column subset, then all columns are used
|
|
8883
9965
|
columns_upd.append("ALL COLUMNS")
|
|
@@ -8940,6 +10022,7 @@ class Validate:
|
|
|
8940
10022
|
"col_vals_not_null",
|
|
8941
10023
|
"col_exists",
|
|
8942
10024
|
"rows_distinct",
|
|
10025
|
+
"rows_complete",
|
|
8943
10026
|
]:
|
|
8944
10027
|
values_upd.append("—")
|
|
8945
10028
|
|
|
@@ -8980,11 +10063,14 @@ class Validate:
|
|
|
8980
10063
|
# Add the `tbl` entry
|
|
8981
10064
|
# ------------------------------------------------
|
|
8982
10065
|
|
|
8983
|
-
# Depending on if there was some preprocessing done, get the appropriate icon
|
|
8984
|
-
#
|
|
10066
|
+
# Depending on if there was some preprocessing done, get the appropriate icon for
|
|
10067
|
+
# the table processing status to be displayed in the report under the `tbl` column
|
|
10068
|
+
# TODO: add the icon for the segmented data option when the step is segmented
|
|
8985
10069
|
|
|
8986
10070
|
validation_info_dict["tbl"] = _transform_tbl_preprocessed(
|
|
8987
|
-
pre=validation_info_dict["pre"],
|
|
10071
|
+
pre=validation_info_dict["pre"],
|
|
10072
|
+
seg=validation_info_dict["segments"],
|
|
10073
|
+
interrogation_performed=interrogation_performed,
|
|
8988
10074
|
)
|
|
8989
10075
|
|
|
8990
10076
|
# ------------------------------------------------
|
|
@@ -9019,8 +10105,9 @@ class Validate:
|
|
|
9019
10105
|
# Process `pass` and `fail` entries
|
|
9020
10106
|
# ------------------------------------------------
|
|
9021
10107
|
|
|
9022
|
-
# Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
|
|
9023
|
-
# of the `pass` entry should be equal to the length of the
|
|
10108
|
+
# Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
|
|
10109
|
+
# (the length of the `pass` entry should be equal to the length of the
|
|
10110
|
+
# `n_passed` and `n_failed` entries)
|
|
9024
10111
|
|
|
9025
10112
|
validation_info_dict["pass"] = _transform_passed_failed(
|
|
9026
10113
|
n_passed_failed=validation_info_dict["n_passed"],
|
|
@@ -9173,6 +10260,9 @@ class Validate:
|
|
|
9173
10260
|
# Remove the `pre` entry from the dictionary
|
|
9174
10261
|
validation_info_dict.pop("pre")
|
|
9175
10262
|
|
|
10263
|
+
# Remove the `segments` entry from the dictionary
|
|
10264
|
+
validation_info_dict.pop("segments")
|
|
10265
|
+
|
|
9176
10266
|
# Remove the `proc_duration_s` entry from the dictionary
|
|
9177
10267
|
validation_info_dict.pop("proc_duration_s")
|
|
9178
10268
|
|
|
@@ -9255,6 +10345,10 @@ class Validate:
|
|
|
9255
10345
|
columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"]
|
|
9256
10346
|
),
|
|
9257
10347
|
)
|
|
10348
|
+
.tab_style(
|
|
10349
|
+
style=style.css("overflow-x: visible; white-space: nowrap;"),
|
|
10350
|
+
locations=loc.body(columns="type_upd", rows=segmented_steps),
|
|
10351
|
+
)
|
|
9258
10352
|
.tab_style(
|
|
9259
10353
|
style=style.fill(color="#FCFCFC" if interrogation_performed else "white"),
|
|
9260
10354
|
locations=loc.body(columns=["w_upd", "e_upd", "c_upd"]),
|
|
@@ -9429,8 +10523,8 @@ class Validate:
|
|
|
9429
10523
|
table object, which can be displayed in a notebook or exported to an HTML file.
|
|
9430
10524
|
|
|
9431
10525
|
:::{.callout-warning}
|
|
9432
|
-
The `get_step_report()` is still experimental. Please report any issues you encounter
|
|
9433
|
-
[Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
10526
|
+
The `get_step_report()` method is still experimental. Please report any issues you encounter
|
|
10527
|
+
in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
9434
10528
|
:::
|
|
9435
10529
|
|
|
9436
10530
|
Parameters
|
|
@@ -9463,6 +10557,36 @@ class Validate:
|
|
|
9463
10557
|
GT
|
|
9464
10558
|
A GT table object that represents the detailed report for the validation step.
|
|
9465
10559
|
|
|
10560
|
+
Types of Step Reports
|
|
10561
|
+
---------------------
|
|
10562
|
+
The `get_step_report()` method produces a report based on the *type* of validation step.
|
|
10563
|
+
The following row-based validation methods will produce a report that shows the rows of the
|
|
10564
|
+
data that failed because of failing test units within one or more columns failed:
|
|
10565
|
+
|
|
10566
|
+
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
10567
|
+
- [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
|
|
10568
|
+
- [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
|
|
10569
|
+
- [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
|
|
10570
|
+
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
10571
|
+
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
10572
|
+
- [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
|
|
10573
|
+
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
10574
|
+
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
10575
|
+
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
10576
|
+
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
10577
|
+
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
10578
|
+
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
10579
|
+
- [`rows_complete()`](`pointblank.Validate.rows_complete`)
|
|
10580
|
+
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
10581
|
+
|
|
10582
|
+
The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
|
|
10583
|
+
report that shows duplicate rows (or duplicate values in one or a set of columns as defined
|
|
10584
|
+
in that method's `columns_subset=` parameter.
|
|
10585
|
+
|
|
10586
|
+
The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
|
|
10587
|
+
produce a report that shows the schema of the data table and the schema of the validation
|
|
10588
|
+
step. The report will indicate whether the schemas match or not.
|
|
10589
|
+
|
|
9466
10590
|
Examples
|
|
9467
10591
|
--------
|
|
9468
10592
|
```{python}
|
|
@@ -9488,7 +10612,7 @@ class Validate:
|
|
|
9488
10612
|
.col_vals_lt(columns="d", value=3500)
|
|
9489
10613
|
.col_vals_between(columns="c", left=1, right=8)
|
|
9490
10614
|
.col_vals_gt(columns="a", value=3)
|
|
9491
|
-
.col_vals_regex(columns="b", pattern=r"
|
|
10615
|
+
.col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
|
|
9492
10616
|
.interrogate()
|
|
9493
10617
|
)
|
|
9494
10618
|
|
|
@@ -9612,7 +10736,7 @@ class Validate:
|
|
|
9612
10736
|
# if get_row_count(extract) == 0:
|
|
9613
10737
|
# return "No rows were extracted."
|
|
9614
10738
|
|
|
9615
|
-
if assertion_type in ROW_BASED_VALIDATION_TYPES:
|
|
10739
|
+
if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete"]:
|
|
9616
10740
|
# Get the extracted data for the step
|
|
9617
10741
|
extract = self.get_data_extracts(i=i, frame=True)
|
|
9618
10742
|
|
|
@@ -9776,6 +10900,95 @@ class Validate:
|
|
|
9776
10900
|
|
|
9777
10901
|
return self
|
|
9778
10902
|
|
|
10903
|
+
def _evaluate_segments(self, validation_info):
|
|
10904
|
+
"""
|
|
10905
|
+
Evaluate any segmentation expressions stored in the `segments` attribute and expand each
|
|
10906
|
+
validation step with such directives into multiple. This is done by evaluating the
|
|
10907
|
+
segmentation expression and creating a new validation step for each segment. Errors in
|
|
10908
|
+
evaluation (such as no segments matched) will be caught and recorded in the `eval_error`
|
|
10909
|
+
attribute.
|
|
10910
|
+
|
|
10911
|
+
Parameters
|
|
10912
|
+
----------
|
|
10913
|
+
validation_info
|
|
10914
|
+
Information about the validation to evaluate and expand.
|
|
10915
|
+
"""
|
|
10916
|
+
|
|
10917
|
+
# Create a list to store the expanded validation steps
|
|
10918
|
+
expanded_validation_info = []
|
|
10919
|
+
|
|
10920
|
+
# Iterate over the validation steps
|
|
10921
|
+
for i, validation in enumerate(validation_info):
|
|
10922
|
+
# Get the segments expression
|
|
10923
|
+
segments_expr = validation.segments
|
|
10924
|
+
|
|
10925
|
+
# If the value is None, then skip the evaluation and append the validation step to the
|
|
10926
|
+
# list of expanded validation steps
|
|
10927
|
+
if segments_expr is None:
|
|
10928
|
+
expanded_validation_info.append(validation)
|
|
10929
|
+
continue
|
|
10930
|
+
|
|
10931
|
+
# Evaluate the segments expression
|
|
10932
|
+
try:
|
|
10933
|
+
# Get the table for this step, it can either be:
|
|
10934
|
+
# 1. the target table itself
|
|
10935
|
+
# 2. the target table modified by a `pre` attribute
|
|
10936
|
+
|
|
10937
|
+
if validation.pre is None:
|
|
10938
|
+
table = self.data
|
|
10939
|
+
else:
|
|
10940
|
+
table = validation.pre(self.data)
|
|
10941
|
+
|
|
10942
|
+
# If the `segments` expression is a string, that string is taken as a column name
|
|
10943
|
+
# for which segmentation should occur across unique values in the column
|
|
10944
|
+
if isinstance(segments_expr, str):
|
|
10945
|
+
seg_tuples = _seg_expr_from_string(data_tbl=table, segments_expr=segments_expr)
|
|
10946
|
+
|
|
10947
|
+
# If the 'segments' expression is a tuple, then normalize it to a list of tuples
|
|
10948
|
+
# - ("col", "value") -> [("col", "value")]
|
|
10949
|
+
# - ("col", ["value1", "value2"]) -> [("col", "value1"), ("col", "value2")]
|
|
10950
|
+
elif isinstance(segments_expr, tuple):
|
|
10951
|
+
seg_tuples = _seg_expr_from_tuple(segments_expr=segments_expr)
|
|
10952
|
+
|
|
10953
|
+
# If the 'segments' expression is a list of strings or tuples (can be mixed) then
|
|
10954
|
+
# normalize it to a list of tuples following the rules above
|
|
10955
|
+
elif isinstance(segments_expr, list):
|
|
10956
|
+
seg_tuples = []
|
|
10957
|
+
for seg in segments_expr:
|
|
10958
|
+
if isinstance(seg, str):
|
|
10959
|
+
# Use the utility function for string items
|
|
10960
|
+
str_seg_tuples = _seg_expr_from_string(
|
|
10961
|
+
data_tbl=table, segments_expr=seg
|
|
10962
|
+
)
|
|
10963
|
+
seg_tuples.extend(str_seg_tuples)
|
|
10964
|
+
elif isinstance(seg, tuple):
|
|
10965
|
+
# Use the utility function for tuple items
|
|
10966
|
+
tuple_seg_tuples = _seg_expr_from_tuple(segments_expr=seg)
|
|
10967
|
+
seg_tuples.extend(tuple_seg_tuples)
|
|
10968
|
+
else: # pragma: no cover
|
|
10969
|
+
# Handle invalid segment type
|
|
10970
|
+
raise ValueError(
|
|
10971
|
+
f"Invalid segment expression item type: {type(seg)}. "
|
|
10972
|
+
"Must be either string or tuple."
|
|
10973
|
+
)
|
|
10974
|
+
|
|
10975
|
+
except Exception: # pragma: no cover
|
|
10976
|
+
validation.eval_error = True
|
|
10977
|
+
|
|
10978
|
+
# For each segmentation resolved, create a new validation step and add it to the list of
|
|
10979
|
+
# expanded validation steps
|
|
10980
|
+
for seg in seg_tuples:
|
|
10981
|
+
new_validation = copy.deepcopy(validation)
|
|
10982
|
+
|
|
10983
|
+
new_validation.segments = seg
|
|
10984
|
+
|
|
10985
|
+
expanded_validation_info.append(new_validation)
|
|
10986
|
+
|
|
10987
|
+
# Replace the `validation_info` attribute with the expanded version
|
|
10988
|
+
self.validation_info = expanded_validation_info
|
|
10989
|
+
|
|
10990
|
+
return self
|
|
10991
|
+
|
|
9779
10992
|
def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]:
|
|
9780
10993
|
"""
|
|
9781
10994
|
Utility function to get a dictionary of validation attributes for each validation step.
|
|
@@ -10233,6 +11446,13 @@ def _create_autobrief_or_failure_text(
|
|
|
10233
11446
|
for_failure=for_failure,
|
|
10234
11447
|
)
|
|
10235
11448
|
|
|
11449
|
+
if assertion_type == "rows_complete":
|
|
11450
|
+
return _create_text_rows_complete(
|
|
11451
|
+
lang=lang,
|
|
11452
|
+
columns_subset=column,
|
|
11453
|
+
for_failure=for_failure,
|
|
11454
|
+
)
|
|
11455
|
+
|
|
10236
11456
|
if assertion_type == "row_count_match":
|
|
10237
11457
|
return _create_text_row_count_match(
|
|
10238
11458
|
lang=lang,
|
|
@@ -10408,6 +11628,24 @@ def _create_text_rows_distinct(
|
|
|
10408
11628
|
return text
|
|
10409
11629
|
|
|
10410
11630
|
|
|
11631
|
+
def _create_text_rows_complete(
    lang: str, columns_subset: list[str] | None, for_failure: bool = False
) -> str:
    """
    Build the expectation (or failure) text for a `rows_complete()` validation step.

    Parameters
    ----------
    lang
        The language key used to look up the text in `EXPECT_FAIL_TEXT`.
    columns_subset
        The columns checked for completeness. `None` or an empty list means that all columns
        are used, which selects the "all rows complete" wording.
    for_failure
        Passed to `_expect_failure_type()` to choose between the expectation and the failure
        variant of the text.

    Returns
    -------
    str
        The localized text for the step.
    """
    type_ = _expect_failure_type(for_failure=for_failure)

    # Treat an empty subset like a missing one; the tabular report does the same when it
    # labels such steps as using all columns
    if not columns_subset:
        return EXPECT_FAIL_TEXT[f"all_row_complete_{type_}_text"][lang]

    column_text = _prep_values_text(values=columns_subset, lang=lang, limit=3)

    return EXPECT_FAIL_TEXT[f"across_row_complete_{type_}_text"][lang].format(
        column_text=column_text
    )
|
|
11647
|
+
|
|
11648
|
+
|
|
10411
11649
|
def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str:
|
|
10412
11650
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
10413
11651
|
|
|
@@ -10493,6 +11731,143 @@ def _prep_values_text(
|
|
|
10493
11731
|
return values_str
|
|
10494
11732
|
|
|
10495
11733
|
|
|
11734
|
+
def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> list[tuple[str, str]]:
    """
    Obtain the segmentation categories from a table column.

    The `segments_expr` value will have been checked to be a string, so there's no need to check for
    that here. The function will return a list of tuples representing pairings of a column name and
    a value. The task is to obtain the unique values in the column (handling different table types)
    and produce a normalized list of tuples of the form: `(column, value)`.

    This function is used to create a list of segments for the validation step. And since there will
    usually be more than one segment, the validation step will be expanded into multiple during
    interrogation (where this function is called).

    Parameters
    ----------
    data_tbl
        The table from which to obtain the segmentation categories.
    segments_expr
        The column name for which segmentation should occur across unique values in the column.

    Returns
    -------
    list[tuple[str, str]]
        A list of tuples representing pairings of a column name and a value in the column. The
        tuples are ordered by value, with a `None` value (if the column has one) placed last.

    Raises
    ------
    ValueError
        If the table type reported by `_get_tbl_type()` is not supported.
    """
    # NOTE(review): the `any` annotation on `data_tbl` is the builtin function, not
    # `typing.Any`; it is left untouched here because the file's imports are not visible.

    # Determine if the table is a DataFrame or a DB table
    tbl_type = _get_tbl_type(data=data_tbl)

    # Obtain the segmentation categories from the table column given as `segments_expr`
    if tbl_type == "polars":
        seg_categories = data_tbl[segments_expr].unique().to_list()
    elif tbl_type == "pandas":
        seg_categories = data_tbl[segments_expr].unique().tolist()
    elif tbl_type in IBIS_BACKENDS:
        distinct_col_vals = data_tbl.select(segments_expr).distinct()
        seg_categories = distinct_col_vals[segments_expr].to_list()
    else:  # pragma: no cover
        raise ValueError(f"Unsupported table type: {tbl_type}")

    # Ensure that the categories are sorted; the key keeps `None` out of value comparisons
    # (a plain sort raises `TypeError` when `None` is mixed with other values) and puts it last
    seg_categories = sorted(seg_categories, key=lambda value: (value is None, value))

    # Place each category and each value in a list of tuples as: `(column, value)`
    return [(segments_expr, category) for category in seg_categories]
|
|
11780
|
+
|
|
11781
|
+
|
|
11782
|
+
def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
|
|
11783
|
+
"""
|
|
11784
|
+
Normalize the segments expression to a list of tuples, given a single tuple.
|
|
11785
|
+
|
|
11786
|
+
The `segments_expr` value will have been checked to be a tuple, so there's no need to check for
|
|
11787
|
+
that here. The function will return a list of tuples representing pairings of a column name and
|
|
11788
|
+
a value. The task is to normalize the tuple into a list of tuples of the form:
|
|
11789
|
+
`(column, value)`.
|
|
11790
|
+
|
|
11791
|
+
The following examples show how this normalzation works:
|
|
11792
|
+
- `("col", "value")` -> `[("col", "value")]` (single tuple, upgraded to a list of tuples)
|
|
11793
|
+
- `("col", ["value1", "value2"])` -> `[("col", "value1"), ("col", "value2")]` (tuple with a list
|
|
11794
|
+
of values, expanded into multiple tuples within a list)
|
|
11795
|
+
|
|
11796
|
+
This function is used to create a list of segments for the validation step. And since there will
|
|
11797
|
+
usually be more than one segment, the validation step will be expanded into multiple during
|
|
11798
|
+
interrogation (where this function is called).
|
|
11799
|
+
|
|
11800
|
+
Parameters
|
|
11801
|
+
----------
|
|
11802
|
+
segments_expr
|
|
11803
|
+
The segments expression to normalize. It can be a tuple of the form
|
|
11804
|
+
`(column, value)` or `(column, [value1, value2])`.
|
|
11805
|
+
|
|
11806
|
+
Returns
|
|
11807
|
+
-------
|
|
11808
|
+
list[tuple[str, str]]
|
|
11809
|
+
A list of tuples representing pairings of a column name and a value in the column.
|
|
11810
|
+
"""
|
|
11811
|
+
# Check if the first element is a string
|
|
11812
|
+
if isinstance(segments_expr[0], str):
|
|
11813
|
+
# If the second element is a list, create a list of tuples
|
|
11814
|
+
if isinstance(segments_expr[1], list):
|
|
11815
|
+
seg_tuples = [(segments_expr[0], value) for value in segments_expr[1]]
|
|
11816
|
+
# If the second element is not a list, create a single tuple
|
|
11817
|
+
else:
|
|
11818
|
+
seg_tuples = [(segments_expr[0], segments_expr[1])]
|
|
11819
|
+
# If the first element is not a string, raise an error
|
|
11820
|
+
else: # pragma: no cover
|
|
11821
|
+
raise ValueError("The first element of the segments expression must be a string.")
|
|
11822
|
+
|
|
11823
|
+
return seg_tuples
|
|
11824
|
+
|
|
11825
|
+
|
|
11826
|
+
def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any:
    """
    Restrict the data table to the rows that belong to a single segment.

    The `segments_expr=` value is a `(column, value)` pair: only rows where `column` equals `value`
    are kept.

    Parameters
    ----------
    data_tbl
        The data table to filter. It can be a Pandas DataFrame, Polars DataFrame, or an Ibis
        backend table.
    segments_expr
        The segments expression to apply. It is a tuple of the form `(column, value)`.

    Returns
    -------
    any
        The filtered data table, of the same type as the input table. A table whose detected type
        is neither Pandas/Polars nor one of the Ibis backends is returned as is (no filtering).
    """
    # Identify which kind of table we were handed
    backend = _get_tbl_type(data=data_tbl)

    # Pandas and Polars DataFrames: round-trip through Narwhals so that a single filter
    # expression serves both libraries, then hand back the native object
    if backend in ("pandas", "polars"):
        nw_frame = nw.from_native(data_tbl)
        return nw_frame.filter(nw.col(segments_expr[0]) == segments_expr[1]).to_native()

    # Ibis backend tables: filter directly with a boolean column expression
    if backend in IBIS_BACKENDS:
        in_segment = data_tbl[segments_expr[0]] == segments_expr[1]
        return data_tbl[in_segment]

    # No matching branch: the table is passed back untouched
    return data_tbl
|
|
11869
|
+
|
|
11870
|
+
|
|
10496
11871
|
def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
|
|
10497
11872
|
"""
|
|
10498
11873
|
Convert a `_ValidationInfo` object to a dictionary.
|
|
@@ -10517,6 +11892,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
|
|
|
10517
11892
|
"inclusive",
|
|
10518
11893
|
"na_pass",
|
|
10519
11894
|
"pre",
|
|
11895
|
+
"segments",
|
|
10520
11896
|
"label",
|
|
10521
11897
|
"brief",
|
|
10522
11898
|
"autobrief",
|
|
@@ -10631,7 +12007,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
|
|
|
10631
12007
|
return title_text
|
|
10632
12008
|
|
|
10633
12009
|
|
|
10634
|
-
def _transform_tbl_preprocessed(pre:
|
|
12010
|
+
def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
|
|
10635
12011
|
# If no interrogation was performed, return a list of empty strings
|
|
10636
12012
|
if not interrogation_performed:
|
|
10637
12013
|
return ["" for _ in range(len(pre))]
|
|
@@ -10640,11 +12016,13 @@ def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list
|
|
|
10640
12016
|
# (either 'unchanged' (None) or 'modified' (not None))
|
|
10641
12017
|
status_list = []
|
|
10642
12018
|
|
|
10643
|
-
for
|
|
10644
|
-
if
|
|
10645
|
-
status_list.append("
|
|
10646
|
-
|
|
12019
|
+
for i in range(len(pre)):
|
|
12020
|
+
if seg[i] is not None:
|
|
12021
|
+
status_list.append("segmented")
|
|
12022
|
+
elif pre[i] is not None:
|
|
10647
12023
|
status_list.append("modified")
|
|
12024
|
+
else:
|
|
12025
|
+
status_list.append("unchanged")
|
|
10648
12026
|
|
|
10649
12027
|
return _get_preprocessed_table_icon(icon=status_list)
|
|
10650
12028
|
|
|
@@ -10752,7 +12130,11 @@ def _transform_w_e_c(values, color, interrogation_performed):
|
|
|
10752
12130
|
|
|
10753
12131
|
|
|
10754
12132
|
def _transform_assertion_str(
|
|
10755
|
-
assertion_str: list[str],
|
|
12133
|
+
assertion_str: list[str],
|
|
12134
|
+
brief_str: list[str | None],
|
|
12135
|
+
autobrief_str: list[str],
|
|
12136
|
+
segmentation_str: list[tuple | None],
|
|
12137
|
+
lang: str,
|
|
10756
12138
|
) -> list[str]:
|
|
10757
12139
|
# Get the SVG icons for the assertion types
|
|
10758
12140
|
svg_icon = _get_assertion_icon(icon=assertion_str)
|
|
@@ -10813,6 +12195,26 @@ def _transform_assertion_str(
|
|
|
10813
12195
|
for assertion, svg, size, brief_div in zip(assertion_str, svg_icon, text_size, brief_divs)
|
|
10814
12196
|
]
|
|
10815
12197
|
|
|
12198
|
+
# If the `segments` list is not empty, prepend a segmentation div to the `type_upd` strings
|
|
12199
|
+
if segmentation_str:
|
|
12200
|
+
for i in range(len(type_upd)):
|
|
12201
|
+
if segmentation_str[i] is not None:
|
|
12202
|
+
# Get the column name and value from the segmentation expression
|
|
12203
|
+
column_name = segmentation_str[i][0]
|
|
12204
|
+
column_value = segmentation_str[i][1]
|
|
12205
|
+
# Create the segmentation div
|
|
12206
|
+
segmentation_div = (
|
|
12207
|
+
"<div style='margin-top: 0px; margin-bottom: 0px; "
|
|
12208
|
+
"white-space: pre; font-size: 8px; color: darkblue; padding-bottom: 4px; "
|
|
12209
|
+
"'>"
|
|
12210
|
+
"<strong><span style='font-family: Helvetica, arial, sans-serif;'>"
|
|
12211
|
+
f"SEGMENT </span></strong><span>{column_name} / {column_value}"
|
|
12212
|
+
"</span>"
|
|
12213
|
+
"</div>"
|
|
12214
|
+
)
|
|
12215
|
+
# Prepend the segmentation div to the type_upd string
|
|
12216
|
+
type_upd[i] = f"{segmentation_div} {type_upd[i]}"
|
|
12217
|
+
|
|
10816
12218
|
return type_upd
|
|
10817
12219
|
|
|
10818
12220
|
|
|
@@ -11044,6 +12446,11 @@ def _step_report_row_based(
|
|
|
11044
12446
|
text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
|
|
11045
12447
|
elif assertion_type == "col_vals_not_null":
|
|
11046
12448
|
text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
|
|
12449
|
+
elif assertion_type == "rows_complete":
|
|
12450
|
+
if column is None:
|
|
12451
|
+
text = STEP_REPORT_TEXT["rows_complete_all"][lang]
|
|
12452
|
+
else:
|
|
12453
|
+
text = STEP_REPORT_TEXT["rows_complete_subset"][lang]
|
|
11047
12454
|
|
|
11048
12455
|
# Wrap assertion text in a <code> tag
|
|
11049
12456
|
text = (
|