pointblank 0.8.6__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +11 -10
- pointblank/_interrogation.py +10 -4
- pointblank/_typing.py +19 -3
- pointblank/data/api-docs.txt +716 -49
- pointblank/datascan.py +4 -4
- pointblank/draft.py +1 -1
- pointblank/thresholds.py +10 -0
- pointblank/validate.py +1071 -50
- {pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/METADATA +19 -4
- {pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/RECORD +13 -13
- {pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/WHEEL +1 -1
- {pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
|
@@ -7,6 +7,7 @@ import datetime
|
|
|
7
7
|
import inspect
|
|
8
8
|
import json
|
|
9
9
|
import re
|
|
10
|
+
import tempfile
|
|
10
11
|
import threading
|
|
11
12
|
from dataclasses import dataclass
|
|
12
13
|
from importlib.metadata import version
|
|
@@ -57,6 +58,7 @@ from pointblank._interrogation import (
|
|
|
57
58
|
RowCountMatch,
|
|
58
59
|
RowsDistinct,
|
|
59
60
|
)
|
|
61
|
+
from pointblank._typing import SegmentSpec
|
|
60
62
|
from pointblank._utils import (
|
|
61
63
|
_check_any_df_lib,
|
|
62
64
|
_check_invalid_fields,
|
|
@@ -87,6 +89,8 @@ from pointblank.thresholds import (
|
|
|
87
89
|
)
|
|
88
90
|
|
|
89
91
|
if TYPE_CHECKING:
|
|
92
|
+
from collections.abc import Collection
|
|
93
|
+
|
|
90
94
|
from pointblank._typing import AbsoluteBounds, Tolerance
|
|
91
95
|
|
|
92
96
|
__all__ = [
|
|
@@ -117,16 +121,18 @@ def _action_context_manager(metadata):
|
|
|
117
121
|
delattr(_action_context, "metadata")
|
|
118
122
|
|
|
119
123
|
|
|
120
|
-
def get_action_metadata():
|
|
124
|
+
def get_action_metadata() -> dict | None:
|
|
121
125
|
"""Access step-level metadata when authoring custom actions.
|
|
122
126
|
|
|
123
127
|
Get the metadata for the validation step where an action was triggered. This can be called by
|
|
124
|
-
user functions to get the metadata for the current action.
|
|
128
|
+
user functions to get the metadata for the current action. This function can only be used within
|
|
129
|
+
callables crafted for the [`Actions`](`pointblank.Actions`) class.
|
|
125
130
|
|
|
126
131
|
Returns
|
|
127
132
|
-------
|
|
128
|
-
dict
|
|
129
|
-
A dictionary containing the metadata for the current step.
|
|
133
|
+
dict | None
|
|
134
|
+
A dictionary containing the metadata for the current step. If called outside of an action
|
|
135
|
+
(i.e., when no action is being executed), this function will return `None`.
|
|
130
136
|
|
|
131
137
|
Description of the Metadata Fields
|
|
132
138
|
----------------------------------
|
|
@@ -161,7 +167,7 @@ def get_action_metadata():
|
|
|
161
167
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
162
168
|
actions=pb.Actions(warning=log_issue),
|
|
163
169
|
)
|
|
164
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
170
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
165
171
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
166
172
|
.col_vals_gt(
|
|
167
173
|
columns="session_duration",
|
|
@@ -179,6 +185,11 @@ def get_action_metadata():
|
|
|
179
185
|
- the `metadata` is a dictionary that is used to craft the log message
|
|
180
186
|
- the action is passed as a bare function to the `Actions` object within the `Validate` object
|
|
181
187
|
(placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
|
|
188
|
+
|
|
189
|
+
See Also
|
|
190
|
+
--------
|
|
191
|
+
Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
|
|
192
|
+
actions for validation steps that exceed a set threshold value.
|
|
182
193
|
"""
|
|
183
194
|
if hasattr(_action_context, "metadata"): # pragma: no cover
|
|
184
195
|
return _action_context.metadata # pragma: no cover
|
|
@@ -202,17 +213,19 @@ def _final_action_context_manager(summary):
|
|
|
202
213
|
delattr(_final_action_context, "summary")
|
|
203
214
|
|
|
204
215
|
|
|
205
|
-
def get_validation_summary():
|
|
216
|
+
def get_validation_summary() -> dict | None:
|
|
206
217
|
"""Access validation summary information when authoring final actions.
|
|
207
218
|
|
|
208
219
|
This function provides a convenient way to access summary information about the validation
|
|
209
220
|
process within a final action. It returns a dictionary with key metrics from the validation
|
|
210
|
-
process.
|
|
221
|
+
process. This function can only be used within callables crafted for the
|
|
222
|
+
[`FinalActions`](`pointblank.FinalActions`) class.
|
|
211
223
|
|
|
212
224
|
Returns
|
|
213
225
|
-------
|
|
214
226
|
dict | None
|
|
215
|
-
A dictionary containing validation metrics
|
|
227
|
+
A dictionary containing validation metrics. If called outside of a final action context,
|
|
228
|
+
this function will return `None`.
|
|
216
229
|
|
|
217
230
|
Description of the Summary Fields
|
|
218
231
|
--------------------------------
|
|
@@ -302,6 +315,11 @@ def get_validation_summary():
|
|
|
302
315
|
|
|
303
316
|
Final actions work well with both simple logging and more complex notification systems, allowing
|
|
304
317
|
you to integrate validation results into your broader data quality workflows.
|
|
318
|
+
|
|
319
|
+
See Also
|
|
320
|
+
--------
|
|
321
|
+
Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
|
|
322
|
+
custom actions that are executed after all validation steps have been completed.
|
|
305
323
|
"""
|
|
306
324
|
if hasattr(_final_action_context, "summary"):
|
|
307
325
|
return _final_action_context.summary
|
|
@@ -514,10 +532,10 @@ def load_dataset(
|
|
|
514
532
|
data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
|
|
515
533
|
|
|
516
534
|
# Unzip the DuckDB dataset to a temporary directory
|
|
517
|
-
with ZipFile(data_path, "r") as z:
|
|
518
|
-
z.extractall(path=
|
|
535
|
+
with tempfile.TemporaryDirectory() as tmp, ZipFile(data_path, "r") as z:
|
|
536
|
+
z.extractall(path=tmp)
|
|
519
537
|
|
|
520
|
-
data_path = f"
|
|
538
|
+
data_path = f"{tmp}/{dataset}.ddb"
|
|
521
539
|
|
|
522
540
|
dataset = ibis.connect(f"duckdb://{data_path}").table(dataset)
|
|
523
541
|
|
|
@@ -1781,14 +1799,15 @@ class _ValidationInfo:
|
|
|
1781
1799
|
assertion_type
|
|
1782
1800
|
The type of assertion. This is the method name of the validation (e.g., `"col_vals_gt"`).
|
|
1783
1801
|
column
|
|
1784
|
-
The column to validate.
|
|
1785
|
-
multiple columns).
|
|
1802
|
+
The column(s) to validate.
|
|
1786
1803
|
values
|
|
1787
1804
|
The value or values to compare against.
|
|
1788
1805
|
na_pass
|
|
1789
1806
|
Whether to pass test units that hold missing values.
|
|
1790
1807
|
pre
|
|
1791
1808
|
A preprocessing function or lambda to apply to the data table for the validation step.
|
|
1809
|
+
segments
|
|
1810
|
+
The segments to use for the validation step.
|
|
1792
1811
|
thresholds
|
|
1793
1812
|
The threshold values for the validation.
|
|
1794
1813
|
actions
|
|
@@ -1839,11 +1858,12 @@ class _ValidationInfo:
|
|
|
1839
1858
|
step_id: str | None = None
|
|
1840
1859
|
sha1: str | None = None
|
|
1841
1860
|
assertion_type: str | None = None
|
|
1842
|
-
column:
|
|
1861
|
+
column: any | None = None
|
|
1843
1862
|
values: any | list[any] | tuple | None = None
|
|
1844
1863
|
inclusive: tuple[bool, bool] | None = None
|
|
1845
1864
|
na_pass: bool | None = None
|
|
1846
1865
|
pre: Callable | None = None
|
|
1866
|
+
segments: any | None = None
|
|
1847
1867
|
thresholds: Thresholds | None = None
|
|
1848
1868
|
actions: Actions | None = None
|
|
1849
1869
|
label: str | None = None
|
|
@@ -1907,7 +1927,7 @@ class Validate:
|
|
|
1907
1927
|
The table to validate, which could be a DataFrame object or an Ibis table object. Read the
|
|
1908
1928
|
*Supported Input Table Types* section for details on the supported table types.
|
|
1909
1929
|
tbl_name
|
|
1910
|
-
|
|
1930
|
+
An optional name to assign to the input table object. If no value is provided, a name will
|
|
1911
1931
|
be generated based on whatever information is available. This table name will be displayed
|
|
1912
1932
|
in the header area of the tabular report.
|
|
1913
1933
|
label
|
|
@@ -2321,6 +2341,7 @@ class Validate:
|
|
|
2321
2341
|
value: float | int | Column,
|
|
2322
2342
|
na_pass: bool = False,
|
|
2323
2343
|
pre: Callable | None = None,
|
|
2344
|
+
segments: SegmentSpec | None = None,
|
|
2324
2345
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2325
2346
|
actions: Actions | None = None,
|
|
2326
2347
|
brief: str | bool | None = None,
|
|
@@ -2352,10 +2373,15 @@ class Validate:
|
|
|
2352
2373
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2353
2374
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2354
2375
|
pre
|
|
2355
|
-
|
|
2376
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2356
2377
|
interrogation. This function should take a table as input and return a modified table.
|
|
2357
2378
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2358
2379
|
argument.
|
|
2380
|
+
segments
|
|
2381
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2382
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2383
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2384
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2359
2385
|
thresholds
|
|
2360
2386
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2361
2387
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2418,6 +2444,42 @@ class Validate:
|
|
|
2418
2444
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2419
2445
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2420
2446
|
|
|
2447
|
+
Segmentation
|
|
2448
|
+
------------
|
|
2449
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2450
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2451
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2452
|
+
column.
|
|
2453
|
+
|
|
2454
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2455
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2456
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2457
|
+
region.
|
|
2458
|
+
|
|
2459
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2460
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2461
|
+
segment on only specific dates, you can provide a tuple like
|
|
2462
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2463
|
+
(i.e., no validation steps will be created for them).
|
|
2464
|
+
|
|
2465
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2466
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2467
|
+
|
|
2468
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2469
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2470
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2471
|
+
columns
|
|
2472
|
+
|
|
2473
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2474
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2475
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2476
|
+
identify issues within specific segments.
|
|
2477
|
+
|
|
2478
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2479
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2480
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2481
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2482
|
+
|
|
2421
2483
|
Thresholds
|
|
2422
2484
|
----------
|
|
2423
2485
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2516,6 +2578,8 @@ class Validate:
|
|
|
2516
2578
|
_check_column(column=columns)
|
|
2517
2579
|
# _check_value_float_int(value=value)
|
|
2518
2580
|
_check_pre(pre=pre)
|
|
2581
|
+
# TODO: add check for segments
|
|
2582
|
+
# _check_segments(segments=segments)
|
|
2519
2583
|
_check_thresholds(thresholds=thresholds)
|
|
2520
2584
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
2521
2585
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -2548,6 +2612,7 @@ class Validate:
|
|
|
2548
2612
|
values=value,
|
|
2549
2613
|
na_pass=na_pass,
|
|
2550
2614
|
pre=pre,
|
|
2615
|
+
segments=segments,
|
|
2551
2616
|
thresholds=thresholds,
|
|
2552
2617
|
actions=actions,
|
|
2553
2618
|
brief=brief,
|
|
@@ -2564,6 +2629,7 @@ class Validate:
|
|
|
2564
2629
|
value: float | int | Column,
|
|
2565
2630
|
na_pass: bool = False,
|
|
2566
2631
|
pre: Callable | None = None,
|
|
2632
|
+
segments: SegmentSpec | None = None,
|
|
2567
2633
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2568
2634
|
actions: Actions | None = None,
|
|
2569
2635
|
brief: str | bool | None = None,
|
|
@@ -2595,10 +2661,15 @@ class Validate:
|
|
|
2595
2661
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2596
2662
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2597
2663
|
pre
|
|
2598
|
-
|
|
2664
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2599
2665
|
interrogation. This function should take a table as input and return a modified table.
|
|
2600
2666
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2601
2667
|
argument.
|
|
2668
|
+
segments
|
|
2669
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2670
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2671
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2672
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2602
2673
|
thresholds
|
|
2603
2674
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2604
2675
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2661,6 +2732,42 @@ class Validate:
|
|
|
2661
2732
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2662
2733
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2663
2734
|
|
|
2735
|
+
Segmentation
|
|
2736
|
+
------------
|
|
2737
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2738
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2739
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2740
|
+
column.
|
|
2741
|
+
|
|
2742
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2743
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2744
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2745
|
+
region.
|
|
2746
|
+
|
|
2747
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2748
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2749
|
+
segment on only specific dates, you can provide a tuple like
|
|
2750
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2751
|
+
(i.e., no validation steps will be created for them).
|
|
2752
|
+
|
|
2753
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2754
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2755
|
+
|
|
2756
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2757
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2758
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2759
|
+
columns
|
|
2760
|
+
|
|
2761
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2762
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2763
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2764
|
+
identify issues within specific segments.
|
|
2765
|
+
|
|
2766
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2767
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2768
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2769
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2770
|
+
|
|
2664
2771
|
Thresholds
|
|
2665
2772
|
----------
|
|
2666
2773
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2758,6 +2865,8 @@ class Validate:
|
|
|
2758
2865
|
_check_column(column=columns)
|
|
2759
2866
|
# _check_value_float_int(value=value)
|
|
2760
2867
|
_check_pre(pre=pre)
|
|
2868
|
+
# TODO: add check for segments
|
|
2869
|
+
# _check_segments(segments=segments)
|
|
2761
2870
|
_check_thresholds(thresholds=thresholds)
|
|
2762
2871
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
2763
2872
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -2790,6 +2899,7 @@ class Validate:
|
|
|
2790
2899
|
values=value,
|
|
2791
2900
|
na_pass=na_pass,
|
|
2792
2901
|
pre=pre,
|
|
2902
|
+
segments=segments,
|
|
2793
2903
|
thresholds=thresholds,
|
|
2794
2904
|
actions=actions,
|
|
2795
2905
|
brief=brief,
|
|
@@ -2806,6 +2916,7 @@ class Validate:
|
|
|
2806
2916
|
value: float | int | Column,
|
|
2807
2917
|
na_pass: bool = False,
|
|
2808
2918
|
pre: Callable | None = None,
|
|
2919
|
+
segments: SegmentSpec | None = None,
|
|
2809
2920
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2810
2921
|
actions: Actions | None = None,
|
|
2811
2922
|
brief: str | bool | None = None,
|
|
@@ -2837,10 +2948,15 @@ class Validate:
|
|
|
2837
2948
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2838
2949
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2839
2950
|
pre
|
|
2840
|
-
|
|
2951
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2841
2952
|
interrogation. This function should take a table as input and return a modified table.
|
|
2842
2953
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2843
2954
|
argument.
|
|
2955
|
+
segments
|
|
2956
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2957
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2958
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2959
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2844
2960
|
thresholds
|
|
2845
2961
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2846
2962
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2903,6 +3019,42 @@ class Validate:
|
|
|
2903
3019
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2904
3020
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2905
3021
|
|
|
3022
|
+
Segmentation
|
|
3023
|
+
------------
|
|
3024
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3025
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3026
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3027
|
+
column.
|
|
3028
|
+
|
|
3029
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3030
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3031
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3032
|
+
region.
|
|
3033
|
+
|
|
3034
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3035
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3036
|
+
segment on only specific dates, you can provide a tuple like
|
|
3037
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3038
|
+
(i.e., no validation steps will be created for them).
|
|
3039
|
+
|
|
3040
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3041
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3042
|
+
|
|
3043
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3044
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3045
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3046
|
+
columns
|
|
3047
|
+
|
|
3048
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3049
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3050
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3051
|
+
identify issues within specific segments.
|
|
3052
|
+
|
|
3053
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3054
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3055
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3056
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3057
|
+
|
|
2906
3058
|
Thresholds
|
|
2907
3059
|
----------
|
|
2908
3060
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2999,6 +3151,8 @@ class Validate:
|
|
|
2999
3151
|
_check_column(column=columns)
|
|
3000
3152
|
# _check_value_float_int(value=value)
|
|
3001
3153
|
_check_pre(pre=pre)
|
|
3154
|
+
# TODO: add check for segments
|
|
3155
|
+
# _check_segments(segments=segments)
|
|
3002
3156
|
_check_thresholds(thresholds=thresholds)
|
|
3003
3157
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3004
3158
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3031,6 +3185,7 @@ class Validate:
|
|
|
3031
3185
|
values=value,
|
|
3032
3186
|
na_pass=na_pass,
|
|
3033
3187
|
pre=pre,
|
|
3188
|
+
segments=segments,
|
|
3034
3189
|
thresholds=thresholds,
|
|
3035
3190
|
actions=actions,
|
|
3036
3191
|
brief=brief,
|
|
@@ -3047,6 +3202,7 @@ class Validate:
|
|
|
3047
3202
|
value: float | int | Column,
|
|
3048
3203
|
na_pass: bool = False,
|
|
3049
3204
|
pre: Callable | None = None,
|
|
3205
|
+
segments: SegmentSpec | None = None,
|
|
3050
3206
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3051
3207
|
actions: Actions | None = None,
|
|
3052
3208
|
brief: str | bool | None = None,
|
|
@@ -3078,10 +3234,15 @@ class Validate:
|
|
|
3078
3234
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3079
3235
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3080
3236
|
pre
|
|
3081
|
-
|
|
3237
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3082
3238
|
interrogation. This function should take a table as input and return a modified table.
|
|
3083
3239
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3084
3240
|
argument.
|
|
3241
|
+
segments
|
|
3242
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3243
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3244
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3245
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3085
3246
|
thresholds
|
|
3086
3247
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3087
3248
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3144,6 +3305,42 @@ class Validate:
|
|
|
3144
3305
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3145
3306
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3146
3307
|
|
|
3308
|
+
Segmentation
|
|
3309
|
+
------------
|
|
3310
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3311
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3312
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3313
|
+
column.
|
|
3314
|
+
|
|
3315
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3316
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3317
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3318
|
+
region.
|
|
3319
|
+
|
|
3320
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3321
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3322
|
+
segment on only specific dates, you can provide a tuple like
|
|
3323
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3324
|
+
(i.e., no validation steps will be created for them).
|
|
3325
|
+
|
|
3326
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3327
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3328
|
+
|
|
3329
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3330
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3331
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3332
|
+
columns
|
|
3333
|
+
|
|
3334
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3335
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3336
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3337
|
+
identify issues within specific segments.
|
|
3338
|
+
|
|
3339
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3340
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3341
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3342
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3343
|
+
|
|
3147
3344
|
Thresholds
|
|
3148
3345
|
----------
|
|
3149
3346
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3238,6 +3435,8 @@ class Validate:
|
|
|
3238
3435
|
_check_column(column=columns)
|
|
3239
3436
|
# _check_value_float_int(value=value)
|
|
3240
3437
|
_check_pre(pre=pre)
|
|
3438
|
+
# TODO: add check for segments
|
|
3439
|
+
# _check_segments(segments=segments)
|
|
3241
3440
|
_check_thresholds(thresholds=thresholds)
|
|
3242
3441
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3243
3442
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3270,6 +3469,7 @@ class Validate:
|
|
|
3270
3469
|
values=value,
|
|
3271
3470
|
na_pass=na_pass,
|
|
3272
3471
|
pre=pre,
|
|
3472
|
+
segments=segments,
|
|
3273
3473
|
thresholds=thresholds,
|
|
3274
3474
|
actions=actions,
|
|
3275
3475
|
brief=brief,
|
|
@@ -3286,6 +3486,7 @@ class Validate:
|
|
|
3286
3486
|
value: float | int | Column,
|
|
3287
3487
|
na_pass: bool = False,
|
|
3288
3488
|
pre: Callable | None = None,
|
|
3489
|
+
segments: SegmentSpec | None = None,
|
|
3289
3490
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3290
3491
|
actions: Actions | None = None,
|
|
3291
3492
|
brief: str | bool | None = None,
|
|
@@ -3317,10 +3518,15 @@ class Validate:
|
|
|
3317
3518
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3318
3519
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3319
3520
|
pre
|
|
3320
|
-
|
|
3521
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3321
3522
|
interrogation. This function should take a table as input and return a modified table.
|
|
3322
3523
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3323
3524
|
argument.
|
|
3525
|
+
segments
|
|
3526
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3527
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3528
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3529
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3324
3530
|
thresholds
|
|
3325
3531
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3326
3532
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3383,6 +3589,42 @@ class Validate:
|
|
|
3383
3589
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3384
3590
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3385
3591
|
|
|
3592
|
+
Segmentation
|
|
3593
|
+
------------
|
|
3594
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3595
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3596
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3597
|
+
column.
|
|
3598
|
+
|
|
3599
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3600
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3601
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3602
|
+
region.
|
|
3603
|
+
|
|
3604
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3605
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3606
|
+
segment on only specific dates, you can provide a tuple like
|
|
3607
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3608
|
+
(i.e., no validation steps will be created for them).
|
|
3609
|
+
|
|
3610
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3611
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3612
|
+
|
|
3613
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3614
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3615
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3616
|
+
columns
|
|
3617
|
+
|
|
3618
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3619
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3620
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3621
|
+
identify issues within specific segments.
|
|
3622
|
+
|
|
3623
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3624
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3625
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3626
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3627
|
+
|
|
3386
3628
|
Thresholds
|
|
3387
3629
|
----------
|
|
3388
3630
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3481,6 +3723,8 @@ class Validate:
|
|
|
3481
3723
|
_check_column(column=columns)
|
|
3482
3724
|
# _check_value_float_int(value=value)
|
|
3483
3725
|
_check_pre(pre=pre)
|
|
3726
|
+
# TODO: add check for segments
|
|
3727
|
+
# _check_segments(segments=segments)
|
|
3484
3728
|
_check_thresholds(thresholds=thresholds)
|
|
3485
3729
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3486
3730
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3513,6 +3757,7 @@ class Validate:
|
|
|
3513
3757
|
values=value,
|
|
3514
3758
|
na_pass=na_pass,
|
|
3515
3759
|
pre=pre,
|
|
3760
|
+
segments=segments,
|
|
3516
3761
|
thresholds=thresholds,
|
|
3517
3762
|
actions=actions,
|
|
3518
3763
|
brief=brief,
|
|
@@ -3529,6 +3774,7 @@ class Validate:
|
|
|
3529
3774
|
value: float | int | Column,
|
|
3530
3775
|
na_pass: bool = False,
|
|
3531
3776
|
pre: Callable | None = None,
|
|
3777
|
+
segments: SegmentSpec | None = None,
|
|
3532
3778
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3533
3779
|
actions: Actions | None = None,
|
|
3534
3780
|
brief: str | bool | None = None,
|
|
@@ -3560,10 +3806,15 @@ class Validate:
|
|
|
3560
3806
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3561
3807
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3562
3808
|
pre
|
|
3563
|
-
|
|
3809
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3564
3810
|
interrogation. This function should take a table as input and return a modified table.
|
|
3565
3811
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3566
3812
|
argument.
|
|
3813
|
+
segments
|
|
3814
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3815
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3816
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3817
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3567
3818
|
thresholds
|
|
3568
3819
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3569
3820
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3626,6 +3877,42 @@ class Validate:
|
|
|
3626
3877
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3627
3878
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3628
3879
|
|
|
3880
|
+
Segmentation
|
|
3881
|
+
------------
|
|
3882
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3883
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3884
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3885
|
+
column.
|
|
3886
|
+
|
|
3887
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3888
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3889
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3890
|
+
region.
|
|
3891
|
+
|
|
3892
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3893
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3894
|
+
segment on only specific dates, you can provide a tuple like
|
|
3895
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3896
|
+
(i.e., no validation steps will be created for them).
|
|
3897
|
+
|
|
3898
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3899
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3900
|
+
|
|
3901
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3902
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3903
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3904
|
+
columns
|
|
3905
|
+
|
|
3906
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3907
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3908
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3909
|
+
identify issues within specific segments.
|
|
3910
|
+
|
|
3911
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3912
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3913
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3914
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3915
|
+
|
|
3629
3916
|
Thresholds
|
|
3630
3917
|
----------
|
|
3631
3918
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3724,6 +4011,8 @@ class Validate:
|
|
|
3724
4011
|
_check_column(column=columns)
|
|
3725
4012
|
# _check_value_float_int(value=value)
|
|
3726
4013
|
_check_pre(pre=pre)
|
|
4014
|
+
# TODO: add check for segments
|
|
4015
|
+
# _check_segments(segments=segments)
|
|
3727
4016
|
_check_thresholds(thresholds=thresholds)
|
|
3728
4017
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3729
4018
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3756,6 +4045,7 @@ class Validate:
|
|
|
3756
4045
|
values=value,
|
|
3757
4046
|
na_pass=na_pass,
|
|
3758
4047
|
pre=pre,
|
|
4048
|
+
segments=segments,
|
|
3759
4049
|
thresholds=thresholds,
|
|
3760
4050
|
actions=actions,
|
|
3761
4051
|
brief=brief,
|
|
@@ -3774,6 +4064,7 @@ class Validate:
|
|
|
3774
4064
|
inclusive: tuple[bool, bool] = (True, True),
|
|
3775
4065
|
na_pass: bool = False,
|
|
3776
4066
|
pre: Callable | None = None,
|
|
4067
|
+
segments: SegmentSpec | None = None,
|
|
3777
4068
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3778
4069
|
actions: Actions | None = None,
|
|
3779
4070
|
brief: str | bool | None = None,
|
|
@@ -3815,10 +4106,15 @@ class Validate:
|
|
|
3815
4106
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3816
4107
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3817
4108
|
pre
|
|
3818
|
-
|
|
4109
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3819
4110
|
interrogation. This function should take a table as input and return a modified table.
|
|
3820
4111
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3821
4112
|
argument.
|
|
4113
|
+
segments
|
|
4114
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4115
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4116
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4117
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3822
4118
|
thresholds
|
|
3823
4119
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3824
4120
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3883,6 +4179,42 @@ class Validate:
|
|
|
3883
4179
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3884
4180
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3885
4181
|
|
|
4182
|
+
Segmentation
|
|
4183
|
+
------------
|
|
4184
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4185
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4186
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4187
|
+
column.
|
|
4188
|
+
|
|
4189
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4190
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4191
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4192
|
+
region.
|
|
4193
|
+
|
|
4194
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4195
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4196
|
+
segment on only specific dates, you can provide a tuple like
|
|
4197
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4198
|
+
(i.e., no validation steps will be created for them).
|
|
4199
|
+
|
|
4200
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4201
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4202
|
+
|
|
4203
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4204
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4205
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4206
|
+
columns
|
|
4207
|
+
|
|
4208
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4209
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4210
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4211
|
+
identify issues within specific segments.
|
|
4212
|
+
|
|
4213
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4214
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4215
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4216
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4217
|
+
|
|
3886
4218
|
Thresholds
|
|
3887
4219
|
----------
|
|
3888
4220
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3990,6 +4322,8 @@ class Validate:
|
|
|
3990
4322
|
# _check_value_float_int(value=left)
|
|
3991
4323
|
# _check_value_float_int(value=right)
|
|
3992
4324
|
_check_pre(pre=pre)
|
|
4325
|
+
# TODO: add check for segments
|
|
4326
|
+
# _check_segments(segments=segments)
|
|
3993
4327
|
_check_thresholds(thresholds=thresholds)
|
|
3994
4328
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3995
4329
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -4027,6 +4361,7 @@ class Validate:
|
|
|
4027
4361
|
inclusive=inclusive,
|
|
4028
4362
|
na_pass=na_pass,
|
|
4029
4363
|
pre=pre,
|
|
4364
|
+
segments=segments,
|
|
4030
4365
|
thresholds=thresholds,
|
|
4031
4366
|
actions=actions,
|
|
4032
4367
|
brief=brief,
|
|
@@ -4045,6 +4380,7 @@ class Validate:
|
|
|
4045
4380
|
inclusive: tuple[bool, bool] = (True, True),
|
|
4046
4381
|
na_pass: bool = False,
|
|
4047
4382
|
pre: Callable | None = None,
|
|
4383
|
+
segments: SegmentSpec | None = None,
|
|
4048
4384
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4049
4385
|
actions: Actions | None = None,
|
|
4050
4386
|
brief: str | bool | None = None,
|
|
@@ -4086,10 +4422,15 @@ class Validate:
|
|
|
4086
4422
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
4087
4423
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
4088
4424
|
pre
|
|
4089
|
-
|
|
4425
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4090
4426
|
interrogation. This function should take a table as input and return a modified table.
|
|
4091
4427
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4092
4428
|
argument.
|
|
4429
|
+
segments
|
|
4430
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4431
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4432
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4433
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4093
4434
|
thresholds
|
|
4094
4435
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4095
4436
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4154,6 +4495,42 @@ class Validate:
|
|
|
4154
4495
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
4155
4496
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
4156
4497
|
|
|
4498
|
+
Segmentation
|
|
4499
|
+
------------
|
|
4500
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4501
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4502
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4503
|
+
column.
|
|
4504
|
+
|
|
4505
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4506
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4507
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4508
|
+
region.
|
|
4509
|
+
|
|
4510
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4511
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4512
|
+
segment on only specific dates, you can provide a tuple like
|
|
4513
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4514
|
+
(i.e., no validation steps will be created for them).
|
|
4515
|
+
|
|
4516
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4517
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4518
|
+
|
|
4519
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4520
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4521
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4522
|
+
columns
|
|
4523
|
+
|
|
4524
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4525
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4526
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4527
|
+
identify issues within specific segments.
|
|
4528
|
+
|
|
4529
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4530
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4531
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4532
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4533
|
+
|
|
4157
4534
|
Thresholds
|
|
4158
4535
|
----------
|
|
4159
4536
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4261,6 +4638,8 @@ class Validate:
|
|
|
4261
4638
|
# _check_value_float_int(value=left)
|
|
4262
4639
|
# _check_value_float_int(value=right)
|
|
4263
4640
|
_check_pre(pre=pre)
|
|
4641
|
+
# TODO: add check for segments
|
|
4642
|
+
# _check_segments(segments=segments)
|
|
4264
4643
|
_check_thresholds(thresholds=thresholds)
|
|
4265
4644
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
4266
4645
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -4298,6 +4677,7 @@ class Validate:
|
|
|
4298
4677
|
inclusive=inclusive,
|
|
4299
4678
|
na_pass=na_pass,
|
|
4300
4679
|
pre=pre,
|
|
4680
|
+
segments=segments,
|
|
4301
4681
|
thresholds=thresholds,
|
|
4302
4682
|
actions=actions,
|
|
4303
4683
|
brief=brief,
|
|
@@ -4311,8 +4691,9 @@ class Validate:
|
|
|
4311
4691
|
def col_vals_in_set(
|
|
4312
4692
|
self,
|
|
4313
4693
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4314
|
-
set:
|
|
4694
|
+
set: Collection[Any],
|
|
4315
4695
|
pre: Callable | None = None,
|
|
4696
|
+
segments: SegmentSpec | None = None,
|
|
4316
4697
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4317
4698
|
actions: Actions | None = None,
|
|
4318
4699
|
brief: str | bool | None = None,
|
|
@@ -4336,10 +4717,15 @@ class Validate:
|
|
|
4336
4717
|
set
|
|
4337
4718
|
A list of values to compare against.
|
|
4338
4719
|
pre
|
|
4339
|
-
|
|
4720
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4340
4721
|
interrogation. This function should take a table as input and return a modified table.
|
|
4341
4722
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4342
4723
|
argument.
|
|
4724
|
+
segments
|
|
4725
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4726
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4727
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4728
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4343
4729
|
thresholds
|
|
4344
4730
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4345
4731
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4381,6 +4767,42 @@ class Validate:
|
|
|
4381
4767
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4382
4768
|
subsequent validation steps.
|
|
4383
4769
|
|
|
4770
|
+
Segmentation
|
|
4771
|
+
------------
|
|
4772
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4773
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4774
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4775
|
+
column.
|
|
4776
|
+
|
|
4777
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4778
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4779
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4780
|
+
region.
|
|
4781
|
+
|
|
4782
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4783
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4784
|
+
segment on only specific dates, you can provide a tuple like
|
|
4785
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4786
|
+
(i.e., no validation steps will be created for them).
|
|
4787
|
+
|
|
4788
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4789
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4790
|
+
|
|
4791
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4792
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4793
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4794
|
+
columns
|
|
4795
|
+
|
|
4796
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4797
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4798
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4799
|
+
identify issues within specific segments.
|
|
4800
|
+
|
|
4801
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4802
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4803
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4804
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4805
|
+
|
|
4384
4806
|
Thresholds
|
|
4385
4807
|
----------
|
|
4386
4808
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4471,8 +4893,16 @@ class Validate:
|
|
|
4471
4893
|
assertion_type = _get_fn_name()
|
|
4472
4894
|
|
|
4473
4895
|
_check_column(column=columns)
|
|
4474
|
-
|
|
4896
|
+
|
|
4897
|
+
for val in set:
|
|
4898
|
+
if val is None:
|
|
4899
|
+
continue
|
|
4900
|
+
if not isinstance(val, (float, int, str)):
|
|
4901
|
+
raise ValueError("`set=` must be a list of floats, integers, or strings.")
|
|
4902
|
+
|
|
4475
4903
|
_check_pre(pre=pre)
|
|
4904
|
+
# TODO: add check for segments
|
|
4905
|
+
# _check_segments(segments=segments)
|
|
4476
4906
|
_check_thresholds(thresholds=thresholds)
|
|
4477
4907
|
_check_boolean_input(param=active, param_name="active")
|
|
4478
4908
|
|
|
@@ -4500,6 +4930,7 @@ class Validate:
|
|
|
4500
4930
|
column=column,
|
|
4501
4931
|
values=set,
|
|
4502
4932
|
pre=pre,
|
|
4933
|
+
segments=segments,
|
|
4503
4934
|
thresholds=thresholds,
|
|
4504
4935
|
actions=actions,
|
|
4505
4936
|
brief=brief,
|
|
@@ -4515,6 +4946,7 @@ class Validate:
|
|
|
4515
4946
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4516
4947
|
set: list[float | int],
|
|
4517
4948
|
pre: Callable | None = None,
|
|
4949
|
+
segments: SegmentSpec | None = None,
|
|
4518
4950
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4519
4951
|
actions: Actions | None = None,
|
|
4520
4952
|
brief: str | bool | None = None,
|
|
@@ -4538,10 +4970,15 @@ class Validate:
|
|
|
4538
4970
|
set
|
|
4539
4971
|
A list of values to compare against.
|
|
4540
4972
|
pre
|
|
4541
|
-
|
|
4973
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4542
4974
|
interrogation. This function should take a table as input and return a modified table.
|
|
4543
4975
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4544
4976
|
argument.
|
|
4977
|
+
segments
|
|
4978
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4979
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
4980
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4981
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4545
4982
|
thresholds
|
|
4546
4983
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4547
4984
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4583,6 +5020,42 @@ class Validate:
|
|
|
4583
5020
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4584
5021
|
subsequent validation steps.
|
|
4585
5022
|
|
|
5023
|
+
Segmentation
|
|
5024
|
+
------------
|
|
5025
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5026
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5027
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5028
|
+
column.
|
|
5029
|
+
|
|
5030
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5031
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5032
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5033
|
+
region.
|
|
5034
|
+
|
|
5035
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5036
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5037
|
+
segment on only specific dates, you can provide a tuple like
|
|
5038
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5039
|
+
(i.e., no validation steps will be created for them).
|
|
5040
|
+
|
|
5041
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5042
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5043
|
+
|
|
5044
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5045
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5046
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5047
|
+
columns
|
|
5048
|
+
|
|
5049
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5050
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5051
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5052
|
+
identify issues within specific segments.
|
|
5053
|
+
|
|
5054
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5055
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5056
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5057
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5058
|
+
|
|
4586
5059
|
Thresholds
|
|
4587
5060
|
----------
|
|
4588
5061
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4676,6 +5149,8 @@ class Validate:
|
|
|
4676
5149
|
_check_column(column=columns)
|
|
4677
5150
|
_check_set_types(set=set)
|
|
4678
5151
|
_check_pre(pre=pre)
|
|
5152
|
+
# TODO: add check for segments
|
|
5153
|
+
# _check_segments(segments=segments)
|
|
4679
5154
|
_check_thresholds(thresholds=thresholds)
|
|
4680
5155
|
_check_boolean_input(param=active, param_name="active")
|
|
4681
5156
|
|
|
@@ -4703,6 +5178,7 @@ class Validate:
|
|
|
4703
5178
|
column=column,
|
|
4704
5179
|
values=set,
|
|
4705
5180
|
pre=pre,
|
|
5181
|
+
segments=segments,
|
|
4706
5182
|
thresholds=thresholds,
|
|
4707
5183
|
actions=actions,
|
|
4708
5184
|
brief=brief,
|
|
@@ -4717,6 +5193,7 @@ class Validate:
|
|
|
4717
5193
|
self,
|
|
4718
5194
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4719
5195
|
pre: Callable | None = None,
|
|
5196
|
+
segments: SegmentSpec | None = None,
|
|
4720
5197
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4721
5198
|
actions: Actions | None = None,
|
|
4722
5199
|
brief: str | bool | None = None,
|
|
@@ -4737,10 +5214,15 @@ class Validate:
|
|
|
4737
5214
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
4738
5215
|
generated for each column.
|
|
4739
5216
|
pre
|
|
4740
|
-
|
|
5217
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4741
5218
|
interrogation. This function should take a table as input and return a modified table.
|
|
4742
5219
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4743
5220
|
argument.
|
|
5221
|
+
segments
|
|
5222
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5223
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5224
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5225
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4744
5226
|
thresholds
|
|
4745
5227
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4746
5228
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4782,6 +5264,42 @@ class Validate:
|
|
|
4782
5264
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4783
5265
|
subsequent validation steps.
|
|
4784
5266
|
|
|
5267
|
+
Segmentation
|
|
5268
|
+
------------
|
|
5269
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5270
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5271
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5272
|
+
column.
|
|
5273
|
+
|
|
5274
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5275
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5276
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5277
|
+
region.
|
|
5278
|
+
|
|
5279
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5280
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5281
|
+
segment on only specific dates, you can provide a tuple like
|
|
5282
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5283
|
+
(i.e., no validation steps will be created for them).
|
|
5284
|
+
|
|
5285
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5286
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5287
|
+
|
|
5288
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5289
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5290
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5291
|
+
columns
|
|
5292
|
+
|
|
5293
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5294
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5295
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5296
|
+
identify issues within specific segments.
|
|
5297
|
+
|
|
5298
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5299
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5300
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5301
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5302
|
+
|
|
4785
5303
|
Thresholds
|
|
4786
5304
|
----------
|
|
4787
5305
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4871,6 +5389,8 @@ class Validate:
|
|
|
4871
5389
|
|
|
4872
5390
|
_check_column(column=columns)
|
|
4873
5391
|
_check_pre(pre=pre)
|
|
5392
|
+
# TODO: add check for segments
|
|
5393
|
+
# _check_segments(segments=segments)
|
|
4874
5394
|
_check_thresholds(thresholds=thresholds)
|
|
4875
5395
|
_check_boolean_input(param=active, param_name="active")
|
|
4876
5396
|
|
|
@@ -4897,6 +5417,7 @@ class Validate:
|
|
|
4897
5417
|
assertion_type=assertion_type,
|
|
4898
5418
|
column=column,
|
|
4899
5419
|
pre=pre,
|
|
5420
|
+
segments=segments,
|
|
4900
5421
|
thresholds=thresholds,
|
|
4901
5422
|
actions=actions,
|
|
4902
5423
|
brief=brief,
|
|
@@ -4911,6 +5432,7 @@ class Validate:
|
|
|
4911
5432
|
self,
|
|
4912
5433
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4913
5434
|
pre: Callable | None = None,
|
|
5435
|
+
segments: SegmentSpec | None = None,
|
|
4914
5436
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4915
5437
|
actions: Actions | None = None,
|
|
4916
5438
|
brief: str | bool | None = None,
|
|
@@ -4931,10 +5453,15 @@ class Validate:
|
|
|
4931
5453
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
4932
5454
|
generated for each column.
|
|
4933
5455
|
pre
|
|
4934
|
-
|
|
5456
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4935
5457
|
interrogation. This function should take a table as input and return a modified table.
|
|
4936
5458
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4937
5459
|
argument.
|
|
5460
|
+
segments
|
|
5461
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5462
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5463
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5464
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4938
5465
|
thresholds
|
|
4939
5466
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4940
5467
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4976,6 +5503,42 @@ class Validate:
|
|
|
4976
5503
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4977
5504
|
subsequent validation steps.
|
|
4978
5505
|
|
|
5506
|
+
Segmentation
|
|
5507
|
+
------------
|
|
5508
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5509
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5510
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5511
|
+
column.
|
|
5512
|
+
|
|
5513
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5514
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5515
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5516
|
+
region.
|
|
5517
|
+
|
|
5518
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5519
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5520
|
+
segment on only specific dates, you can provide a tuple like
|
|
5521
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5522
|
+
(i.e., no validation steps will be created for them).
|
|
5523
|
+
|
|
5524
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5525
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5526
|
+
|
|
5527
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5528
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5529
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5530
|
+
columns
|
|
5531
|
+
|
|
5532
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5533
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5534
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5535
|
+
identify issues within specific segments.
|
|
5536
|
+
|
|
5537
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5538
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5539
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5540
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5541
|
+
|
|
4979
5542
|
Thresholds
|
|
4980
5543
|
----------
|
|
4981
5544
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5065,6 +5628,8 @@ class Validate:
|
|
|
5065
5628
|
|
|
5066
5629
|
_check_column(column=columns)
|
|
5067
5630
|
_check_pre(pre=pre)
|
|
5631
|
+
# TODO: add check for segments
|
|
5632
|
+
# _check_segments(segments=segments)
|
|
5068
5633
|
_check_thresholds(thresholds=thresholds)
|
|
5069
5634
|
_check_boolean_input(param=active, param_name="active")
|
|
5070
5635
|
|
|
@@ -5091,6 +5656,7 @@ class Validate:
|
|
|
5091
5656
|
assertion_type=assertion_type,
|
|
5092
5657
|
column=column,
|
|
5093
5658
|
pre=pre,
|
|
5659
|
+
segments=segments,
|
|
5094
5660
|
thresholds=thresholds,
|
|
5095
5661
|
actions=actions,
|
|
5096
5662
|
brief=brief,
|
|
@@ -5107,6 +5673,7 @@ class Validate:
|
|
|
5107
5673
|
pattern: str,
|
|
5108
5674
|
na_pass: bool = False,
|
|
5109
5675
|
pre: Callable | None = None,
|
|
5676
|
+
segments: SegmentSpec | None = None,
|
|
5110
5677
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5111
5678
|
actions: Actions | None = None,
|
|
5112
5679
|
brief: str | bool | None = None,
|
|
@@ -5133,10 +5700,15 @@ class Validate:
|
|
|
5133
5700
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
5134
5701
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
5135
5702
|
pre
|
|
5136
|
-
|
|
5703
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5137
5704
|
interrogation. This function should take a table as input and return a modified table.
|
|
5138
5705
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5139
5706
|
argument.
|
|
5707
|
+
segments
|
|
5708
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5709
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5710
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5711
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5140
5712
|
thresholds
|
|
5141
5713
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5142
5714
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5178,6 +5750,42 @@ class Validate:
|
|
|
5178
5750
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
5179
5751
|
subsequent validation steps.
|
|
5180
5752
|
|
|
5753
|
+
Segmentation
|
|
5754
|
+
------------
|
|
5755
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5756
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5757
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5758
|
+
column.
|
|
5759
|
+
|
|
5760
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5761
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5762
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5763
|
+
region.
|
|
5764
|
+
|
|
5765
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5766
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5767
|
+
segment on only specific dates, you can provide a tuple like
|
|
5768
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5769
|
+
(i.e., no validation steps will be created for them).
|
|
5770
|
+
|
|
5771
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5772
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5773
|
+
|
|
5774
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5775
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5776
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5777
|
+
columns
|
|
5778
|
+
|
|
5779
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5780
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5781
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5782
|
+
identify issues within specific segments.
|
|
5783
|
+
|
|
5784
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5785
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5786
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5787
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5788
|
+
|
|
5181
5789
|
Thresholds
|
|
5182
5790
|
----------
|
|
5183
5791
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5269,6 +5877,8 @@ class Validate:
|
|
|
5269
5877
|
|
|
5270
5878
|
_check_column(column=columns)
|
|
5271
5879
|
_check_pre(pre=pre)
|
|
5880
|
+
# TODO: add check for segments
|
|
5881
|
+
# _check_segments(segments=segments)
|
|
5272
5882
|
_check_thresholds(thresholds=thresholds)
|
|
5273
5883
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
5274
5884
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -5298,6 +5908,7 @@ class Validate:
|
|
|
5298
5908
|
values=pattern,
|
|
5299
5909
|
na_pass=na_pass,
|
|
5300
5910
|
pre=pre,
|
|
5911
|
+
segments=segments,
|
|
5301
5912
|
thresholds=thresholds,
|
|
5302
5913
|
actions=actions,
|
|
5303
5914
|
brief=brief,
|
|
@@ -5312,6 +5923,7 @@ class Validate:
|
|
|
5312
5923
|
self,
|
|
5313
5924
|
expr: any,
|
|
5314
5925
|
pre: Callable | None = None,
|
|
5926
|
+
segments: SegmentSpec | None = None,
|
|
5315
5927
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5316
5928
|
actions: Actions | None = None,
|
|
5317
5929
|
brief: str | bool | None = None,
|
|
@@ -5333,10 +5945,15 @@ class Validate:
|
|
|
5333
5945
|
be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
|
|
5334
5946
|
should either be a lambda expression or a Narwhals column expression.
|
|
5335
5947
|
pre
|
|
5336
|
-
|
|
5948
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5337
5949
|
interrogation. This function should take a table as input and return a modified table.
|
|
5338
5950
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5339
5951
|
argument.
|
|
5952
|
+
segments
|
|
5953
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5954
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5955
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5956
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5340
5957
|
thresholds
|
|
5341
5958
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5342
5959
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5376,6 +5993,42 @@ class Validate:
|
|
|
5376
5993
|
transformed table, it only exists during the validation step and is not stored in the
|
|
5377
5994
|
`Validate` object or used in subsequent validation steps.
|
|
5378
5995
|
|
|
5996
|
+
Segmentation
|
|
5997
|
+
------------
|
|
5998
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5999
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
6000
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
6001
|
+
column.
|
|
6002
|
+
|
|
6003
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
6004
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
6005
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
6006
|
+
region.
|
|
6007
|
+
|
|
6008
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
6009
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
6010
|
+
segment on only specific dates, you can provide a tuple like
|
|
6011
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
6012
|
+
(i.e., no validation steps will be created for them).
|
|
6013
|
+
|
|
6014
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6015
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
6016
|
+
|
|
6017
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
6018
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
6019
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
6020
|
+
columns
|
|
6021
|
+
|
|
6022
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6023
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
6024
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
6025
|
+
identify issues within specific segments.
|
|
6026
|
+
|
|
6027
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
6028
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
6029
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
6030
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
6031
|
+
|
|
5379
6032
|
Thresholds
|
|
5380
6033
|
----------
|
|
5381
6034
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5453,6 +6106,8 @@ class Validate:
|
|
|
5453
6106
|
# TODO: Add a check for the expression to ensure it's a valid expression object
|
|
5454
6107
|
# _check_expr(expr=expr)
|
|
5455
6108
|
_check_pre(pre=pre)
|
|
6109
|
+
# TODO: add check for segments
|
|
6110
|
+
# _check_segments(segments=segments)
|
|
5456
6111
|
_check_thresholds(thresholds=thresholds)
|
|
5457
6112
|
_check_boolean_input(param=active, param_name="active")
|
|
5458
6113
|
|
|
@@ -5469,6 +6124,7 @@ class Validate:
|
|
|
5469
6124
|
column=None,
|
|
5470
6125
|
values=expr,
|
|
5471
6126
|
pre=pre,
|
|
6127
|
+
segments=segments,
|
|
5472
6128
|
thresholds=thresholds,
|
|
5473
6129
|
actions=actions,
|
|
5474
6130
|
brief=brief,
|
|
@@ -5657,6 +6313,7 @@ class Validate:
|
|
|
5657
6313
|
self,
|
|
5658
6314
|
columns_subset: str | list[str] | None = None,
|
|
5659
6315
|
pre: Callable | None = None,
|
|
6316
|
+
segments: SegmentSpec | None = None,
|
|
5660
6317
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5661
6318
|
actions: Actions | None = None,
|
|
5662
6319
|
brief: str | bool | None = None,
|
|
@@ -5677,10 +6334,15 @@ class Validate:
|
|
|
5677
6334
|
columns are supplied, the distinct comparison will be made over the combination of
|
|
5678
6335
|
values in those columns.
|
|
5679
6336
|
pre
|
|
5680
|
-
|
|
6337
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5681
6338
|
interrogation. This function should take a table as input and return a modified table.
|
|
5682
6339
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5683
6340
|
argument.
|
|
6341
|
+
segments
|
|
6342
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
6343
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
6344
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
6345
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5684
6346
|
thresholds
|
|
5685
6347
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5686
6348
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5722,6 +6384,42 @@ class Validate:
|
|
|
5722
6384
|
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
5723
6385
|
or used in subsequent validation steps.
|
|
5724
6386
|
|
|
6387
|
+
Segmentation
|
|
6388
|
+
------------
|
|
6389
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
6390
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
6391
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
6392
|
+
column.
|
|
6393
|
+
|
|
6394
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
6395
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
6396
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
6397
|
+
region.
|
|
6398
|
+
|
|
6399
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
6400
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
6401
|
+
segment on only specific dates, you can provide a tuple like
|
|
6402
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
6403
|
+
(i.e., no validation steps will be created for them).
|
|
6404
|
+
|
|
6405
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6406
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
6407
|
+
|
|
6408
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
6409
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
6410
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
6411
|
+
columns
|
|
6412
|
+
|
|
6413
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6414
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
6415
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
6416
|
+
identify issues within specific segments.
|
|
6417
|
+
|
|
6418
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
6419
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
6420
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
6421
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
6422
|
+
|
|
5725
6423
|
Thresholds
|
|
5726
6424
|
----------
|
|
5727
6425
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5815,6 +6513,8 @@ class Validate:
|
|
|
5815
6513
|
assertion_type = _get_fn_name()
|
|
5816
6514
|
|
|
5817
6515
|
_check_pre(pre=pre)
|
|
6516
|
+
# TODO: add check for segments
|
|
6517
|
+
# _check_segments(segments=segments)
|
|
5818
6518
|
_check_thresholds(thresholds=thresholds)
|
|
5819
6519
|
_check_boolean_input(param=active, param_name="active")
|
|
5820
6520
|
|
|
@@ -5835,6 +6535,7 @@ class Validate:
|
|
|
5835
6535
|
assertion_type=assertion_type,
|
|
5836
6536
|
column=columns_subset,
|
|
5837
6537
|
pre=pre,
|
|
6538
|
+
segments=segments,
|
|
5838
6539
|
thresholds=thresholds,
|
|
5839
6540
|
actions=actions,
|
|
5840
6541
|
brief=brief,
|
|
@@ -5895,7 +6596,7 @@ class Validate:
|
|
|
5895
6596
|
substring matches are allowed, so a schema data type of `Int` would match a target table
|
|
5896
6597
|
data type of `Int64`.
|
|
5897
6598
|
pre
|
|
5898
|
-
|
|
6599
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5899
6600
|
interrogation. This function should take a table as input and return a modified table.
|
|
5900
6601
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5901
6602
|
argument.
|
|
@@ -6108,7 +6809,7 @@ class Validate:
|
|
|
6108
6809
|
Should the validation step be inverted? If `True`, then the expectation is that the row
|
|
6109
6810
|
count of the target table should not match the specified `count=` value.
|
|
6110
6811
|
pre
|
|
6111
|
-
|
|
6812
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6112
6813
|
interrogation. This function should take a table as input and return a modified table.
|
|
6113
6814
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6114
6815
|
argument.
|
|
@@ -6318,7 +7019,7 @@ class Validate:
|
|
|
6318
7019
|
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
6319
7020
|
column count of the target table should not match the specified `count=` value.
|
|
6320
7021
|
pre
|
|
6321
|
-
|
|
7022
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6322
7023
|
interrogation. This function should take a table as input and return a modified table.
|
|
6323
7024
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6324
7025
|
argument.
|
|
@@ -6836,10 +7537,14 @@ class Validate:
|
|
|
6836
7537
|
|
|
6837
7538
|
self.time_start = datetime.datetime.now(datetime.timezone.utc)
|
|
6838
7539
|
|
|
6839
|
-
# Expand `validation_info` by evaluating any column expressions in `
|
|
7540
|
+
# Expand `validation_info` by evaluating any column expressions in `columns=`
|
|
6840
7541
|
# (the `_evaluate_column_exprs()` method will eval and expand as needed)
|
|
6841
7542
|
self._evaluate_column_exprs(validation_info=self.validation_info)
|
|
6842
7543
|
|
|
7544
|
+
# Expand `validation_info` by evaluating any segmentation directives
|
|
7545
|
+
# provided in `segments=` (the `_evaluate_segments()` method will eval and expand as needed)
|
|
7546
|
+
self._evaluate_segments(validation_info=self.validation_info)
|
|
7547
|
+
|
|
6843
7548
|
for validation in self.validation_info:
|
|
6844
7549
|
# Set the `i` value for the validation step (this is 1-indexed)
|
|
6845
7550
|
index_value = self.validation_info.index(validation) + 1
|
|
@@ -6875,6 +7580,10 @@ class Validate:
|
|
|
6875
7580
|
|
|
6876
7581
|
validation.autobrief = autobrief
|
|
6877
7582
|
|
|
7583
|
+
# ------------------------------------------------
|
|
7584
|
+
# Bypassing the validation step if conditions met
|
|
7585
|
+
# ------------------------------------------------
|
|
7586
|
+
|
|
6878
7587
|
# Skip the validation step if it is not active but still record the time of processing
|
|
6879
7588
|
if not validation.active:
|
|
6880
7589
|
end_time = datetime.datetime.now(datetime.timezone.utc)
|
|
@@ -6931,6 +7640,17 @@ class Validate:
|
|
|
6931
7640
|
elif isinstance(validation.pre, Callable):
|
|
6932
7641
|
data_tbl_step = validation.pre(data_tbl_step)
|
|
6933
7642
|
|
|
7643
|
+
# ------------------------------------------------
|
|
7644
|
+
# Segmentation stage
|
|
7645
|
+
# ------------------------------------------------
|
|
7646
|
+
|
|
7647
|
+
# Determine whether any segmentation directives are to be applied to the table
|
|
7648
|
+
|
|
7649
|
+
if validation.segments is not None:
|
|
7650
|
+
data_tbl_step = _apply_segments(
|
|
7651
|
+
data_tbl=data_tbl_step, segments_expr=validation.segments
|
|
7652
|
+
)
|
|
7653
|
+
|
|
6934
7654
|
validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
|
|
6935
7655
|
tbl_type=tbl_type
|
|
6936
7656
|
)
|
|
@@ -8832,6 +9552,13 @@ class Validate:
|
|
|
8832
9552
|
# will be made blank if the validation has not been performed
|
|
8833
9553
|
interrogation_performed = validation_info_dict.get("proc_duration_s", [None])[0] is not None
|
|
8834
9554
|
|
|
9555
|
+
# Determine which steps are those using segmented data
|
|
9556
|
+
segmented_steps = [
|
|
9557
|
+
i + 1
|
|
9558
|
+
for i, segment in enumerate(validation_info_dict["segments"])
|
|
9559
|
+
if segment is not None
|
|
9560
|
+
]
|
|
9561
|
+
|
|
8835
9562
|
# ------------------------------------------------
|
|
8836
9563
|
# Process the `type_upd` entry
|
|
8837
9564
|
# ------------------------------------------------
|
|
@@ -8841,6 +9568,7 @@ class Validate:
|
|
|
8841
9568
|
assertion_str=validation_info_dict["assertion_type"],
|
|
8842
9569
|
brief_str=validation_info_dict["brief"],
|
|
8843
9570
|
autobrief_str=validation_info_dict["autobrief"],
|
|
9571
|
+
segmentation_str=validation_info_dict["segments"],
|
|
8844
9572
|
lang=lang,
|
|
8845
9573
|
)
|
|
8846
9574
|
|
|
@@ -8972,11 +9700,14 @@ class Validate:
|
|
|
8972
9700
|
# Add the `tbl` entry
|
|
8973
9701
|
# ------------------------------------------------
|
|
8974
9702
|
|
|
8975
|
-
# Depending on if there was some preprocessing done, get the appropriate icon
|
|
8976
|
-
#
|
|
9703
|
+
# Depending on if there was some preprocessing done, get the appropriate icon for
|
|
9704
|
+
# the table processing status to be displayed in the report under the `tbl` column
|
|
9705
|
+
# TODO: add the icon for the segmented data option when the step is segmented
|
|
8977
9706
|
|
|
8978
9707
|
validation_info_dict["tbl"] = _transform_tbl_preprocessed(
|
|
8979
|
-
pre=validation_info_dict["pre"],
|
|
9708
|
+
pre=validation_info_dict["pre"],
|
|
9709
|
+
seg=validation_info_dict["segments"],
|
|
9710
|
+
interrogation_performed=interrogation_performed,
|
|
8980
9711
|
)
|
|
8981
9712
|
|
|
8982
9713
|
# ------------------------------------------------
|
|
@@ -9011,8 +9742,9 @@ class Validate:
|
|
|
9011
9742
|
# Process `pass` and `fail` entries
|
|
9012
9743
|
# ------------------------------------------------
|
|
9013
9744
|
|
|
9014
|
-
# Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
|
|
9015
|
-
# of the `pass` entry should be equal to the length of the
|
|
9745
|
+
# Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
|
|
9746
|
+
# (the length of the `pass` entry should be equal to the length of the
|
|
9747
|
+
# `n_passed` and `n_failed` entries)
|
|
9016
9748
|
|
|
9017
9749
|
validation_info_dict["pass"] = _transform_passed_failed(
|
|
9018
9750
|
n_passed_failed=validation_info_dict["n_passed"],
|
|
@@ -9165,6 +9897,9 @@ class Validate:
|
|
|
9165
9897
|
# Remove the `pre` entry from the dictionary
|
|
9166
9898
|
validation_info_dict.pop("pre")
|
|
9167
9899
|
|
|
9900
|
+
# Remove the `segments` entry from the dictionary
|
|
9901
|
+
validation_info_dict.pop("segments")
|
|
9902
|
+
|
|
9168
9903
|
# Remove the `proc_duration_s` entry from the dictionary
|
|
9169
9904
|
validation_info_dict.pop("proc_duration_s")
|
|
9170
9905
|
|
|
@@ -9247,6 +9982,10 @@ class Validate:
|
|
|
9247
9982
|
columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"]
|
|
9248
9983
|
),
|
|
9249
9984
|
)
|
|
9985
|
+
.tab_style(
|
|
9986
|
+
style=style.css("overflow-x: visible; white-space: nowrap;"),
|
|
9987
|
+
locations=loc.body(columns="type_upd", rows=segmented_steps),
|
|
9988
|
+
)
|
|
9250
9989
|
.tab_style(
|
|
9251
9990
|
style=style.fill(color="#FCFCFC" if interrogation_performed else "white"),
|
|
9252
9991
|
locations=loc.body(columns=["w_upd", "e_upd", "c_upd"]),
|
|
@@ -9421,8 +10160,8 @@ class Validate:
|
|
|
9421
10160
|
table object, which can be displayed in a notebook or exported to an HTML file.
|
|
9422
10161
|
|
|
9423
10162
|
:::{.callout-warning}
|
|
9424
|
-
The `get_step_report()` is still experimental. Please report any issues you encounter
|
|
9425
|
-
[Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
10163
|
+
The `get_step_report()` method is still experimental. Please report any issues you encounter
|
|
10164
|
+
in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
9426
10165
|
:::
|
|
9427
10166
|
|
|
9428
10167
|
Parameters
|
|
@@ -9455,6 +10194,35 @@ class Validate:
|
|
|
9455
10194
|
GT
|
|
9456
10195
|
A GT table object that represents the detailed report for the validation step.
|
|
9457
10196
|
|
|
10197
|
+
Types of Step Reports
|
|
10198
|
+
---------------------
|
|
10199
|
+
The `get_step_report()` method produces a report based on the *type* of validation step.
|
|
10200
|
+
The following row-based validation methods will produce a report that shows the rows of the
|
|
10201
|
+
data that failed because of failing test units within one or more columns:
|
|
10202
|
+
|
|
10203
|
+
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
10204
|
+
- [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
|
|
10205
|
+
- [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
|
|
10206
|
+
- [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
|
|
10207
|
+
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
10208
|
+
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
10209
|
+
- [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
|
|
10210
|
+
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
10211
|
+
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
10212
|
+
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
10213
|
+
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
10214
|
+
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
10215
|
+
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
10216
|
+
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
10217
|
+
|
|
10218
|
+
The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
|
|
10219
|
+
report that shows duplicate rows (or duplicate values in one or a set of columns as defined
|
|
10220
|
+
in that method's `columns_subset=` parameter).
|
|
10221
|
+
|
|
10222
|
+
The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
|
|
10223
|
+
produce a report that shows the schema of the data table and the schema of the validation
|
|
10224
|
+
step. The report will indicate whether the schemas match or not.
|
|
10225
|
+
|
|
9458
10226
|
Examples
|
|
9459
10227
|
--------
|
|
9460
10228
|
```{python}
|
|
@@ -9480,7 +10248,7 @@ class Validate:
|
|
|
9480
10248
|
.col_vals_lt(columns="d", value=3500)
|
|
9481
10249
|
.col_vals_between(columns="c", left=1, right=8)
|
|
9482
10250
|
.col_vals_gt(columns="a", value=3)
|
|
9483
|
-
.col_vals_regex(columns="b", pattern=r"
|
|
10251
|
+
.col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
|
|
9484
10252
|
.interrogate()
|
|
9485
10253
|
)
|
|
9486
10254
|
|
|
@@ -9768,6 +10536,95 @@ class Validate:
|
|
|
9768
10536
|
|
|
9769
10537
|
return self
|
|
9770
10538
|
|
|
10539
|
+
def _evaluate_segments(self, validation_info):
|
|
10540
|
+
"""
|
|
10541
|
+
Evaluate any segmentation expressions stored in the `segments` attribute and expand each
|
|
10542
|
+
validation step with such directives into multiple. This is done by evaluating the
|
|
10543
|
+
segmentation expression and creating a new validation step for each segment. Errors in
|
|
10544
|
+
evaluation (such as no segments matched) will be caught and recorded in the `eval_error`
|
|
10545
|
+
attribute.
|
|
10546
|
+
|
|
10547
|
+
Parameters
|
|
10548
|
+
----------
|
|
10549
|
+
validation_info
|
|
10550
|
+
Information about the validation to evaluate and expand.
|
|
10551
|
+
"""
|
|
10552
|
+
|
|
10553
|
+
# Create a list to store the expanded validation steps
|
|
10554
|
+
expanded_validation_info = []
|
|
10555
|
+
|
|
10556
|
+
# Iterate over the validation steps
|
|
10557
|
+
for i, validation in enumerate(validation_info):
|
|
10558
|
+
# Get the segments expression
|
|
10559
|
+
segments_expr = validation.segments
|
|
10560
|
+
|
|
10561
|
+
# If the value is None, then skip the evaluation and append the validation step to the
|
|
10562
|
+
# list of expanded validation steps
|
|
10563
|
+
if segments_expr is None:
|
|
10564
|
+
expanded_validation_info.append(validation)
|
|
10565
|
+
continue
|
|
10566
|
+
|
|
10567
|
+
# Evaluate the segments expression
|
|
10568
|
+
try:
|
|
10569
|
+
# Get the table for this step, it can either be:
|
|
10570
|
+
# 1. the target table itself
|
|
10571
|
+
# 2. the target table modified by a `pre` attribute
|
|
10572
|
+
|
|
10573
|
+
if validation.pre is None:
|
|
10574
|
+
table = self.data
|
|
10575
|
+
else:
|
|
10576
|
+
table = validation.pre(self.data)
|
|
10577
|
+
|
|
10578
|
+
# If the `segments` expression is a string, that string is taken as a column name
|
|
10579
|
+
# for which segmentation should occur across unique values in the column
|
|
10580
|
+
if isinstance(segments_expr, str):
|
|
10581
|
+
seg_tuples = _seg_expr_from_string(data_tbl=table, segments_expr=segments_expr)
|
|
10582
|
+
|
|
10583
|
+
# If the 'segments' expression is a tuple, then normalize it to a list of tuples
|
|
10584
|
+
# - ("col", "value") -> [("col", "value")]
|
|
10585
|
+
# - ("col", ["value1", "value2"]) -> [("col", "value1"), ("col", "value2")]
|
|
10586
|
+
elif isinstance(segments_expr, tuple):
|
|
10587
|
+
seg_tuples = _seg_expr_from_tuple(segments_expr=segments_expr)
|
|
10588
|
+
|
|
10589
|
+
# If the 'segments' expression is a list of strings or tuples (can be mixed) then
|
|
10590
|
+
# normalize it to a list of tuples following the rules above
|
|
10591
|
+
elif isinstance(segments_expr, list):
|
|
10592
|
+
seg_tuples = []
|
|
10593
|
+
for seg in segments_expr:
|
|
10594
|
+
if isinstance(seg, str):
|
|
10595
|
+
# Use the utility function for string items
|
|
10596
|
+
str_seg_tuples = _seg_expr_from_string(
|
|
10597
|
+
data_tbl=table, segments_expr=seg
|
|
10598
|
+
)
|
|
10599
|
+
seg_tuples.extend(str_seg_tuples)
|
|
10600
|
+
elif isinstance(seg, tuple):
|
|
10601
|
+
# Use the utility function for tuple items
|
|
10602
|
+
tuple_seg_tuples = _seg_expr_from_tuple(segments_expr=seg)
|
|
10603
|
+
seg_tuples.extend(tuple_seg_tuples)
|
|
10604
|
+
else: # pragma: no cover
|
|
10605
|
+
# Handle invalid segment type
|
|
10606
|
+
raise ValueError(
|
|
10607
|
+
f"Invalid segment expression item type: {type(seg)}. "
|
|
10608
|
+
"Must be either string or tuple."
|
|
10609
|
+
)
|
|
10610
|
+
|
|
10611
|
+
except Exception: # pragma: no cover
|
|
10612
|
+
validation.eval_error = True
|
|
10613
|
+
|
|
10614
|
+
# For each segmentation resolved, create a new validation step and add it to the list of
|
|
10615
|
+
# expanded validation steps
|
|
10616
|
+
for seg in seg_tuples:
|
|
10617
|
+
new_validation = copy.deepcopy(validation)
|
|
10618
|
+
|
|
10619
|
+
new_validation.segments = seg
|
|
10620
|
+
|
|
10621
|
+
expanded_validation_info.append(new_validation)
|
|
10622
|
+
|
|
10623
|
+
# Replace the `validation_info` attribute with the expanded version
|
|
10624
|
+
self.validation_info = expanded_validation_info
|
|
10625
|
+
|
|
10626
|
+
return self
|
|
10627
|
+
|
|
9771
10628
|
def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]:
|
|
9772
10629
|
"""
|
|
9773
10630
|
Utility function to get a dictionary of validation attributes for each validation step.
|
|
@@ -10485,6 +11342,143 @@ def _prep_values_text(
|
|
|
10485
11342
|
return values_str
|
|
10486
11343
|
|
|
10487
11344
|
|
|
11345
|
+
def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> list[tuple[str, str]]:
    """
    Obtain the segmentation categories from a table column.

    The `segments_expr` value will have been checked to be a string, so there's no need to check for
    that here. The unique values of the column named by `segments_expr` are collected (using the
    access pattern appropriate for the table type), sorted, and returned as a normalized list of
    tuples of the form: `(column, value)`.

    This function is used to create a list of segments for the validation step. And since there will
    usually be more than one segment, the validation step will be expanded into multiple steps
    during interrogation (where this function is called).

    Parameters
    ----------
    data_tbl
        The table from which to obtain the segmentation categories.
    segments_expr
        The column name for which segmentation should occur across unique values in the column.

    Returns
    -------
    list[tuple[str, str]]
        A list of tuples representing pairings of a column name and a value in the column.

    Raises
    ------
    ValueError
        If the table type is not one of the handled types.
    """
    # Determine if the table is a DataFrame or a DB table
    tbl_type = _get_tbl_type(data=data_tbl)

    # Obtain the segmentation categories from the table column given as `segments_expr`
    if tbl_type == "polars":
        seg_categories = data_tbl[segments_expr].unique().to_list()
    elif tbl_type == "pandas":
        seg_categories = data_tbl[segments_expr].unique().tolist()
    elif tbl_type in IBIS_BACKENDS:
        distinct_col_vals = data_tbl.select(segments_expr).distinct()
        seg_categories = distinct_col_vals[segments_expr].to_list()
    else:  # pragma: no cover
        raise ValueError(f"Unsupported table type: {tbl_type}")

    # Sort the categories so that the expanded steps come out in a stable order
    # NOTE(review): if the column contains missing values they are presumably included among the
    # unique values, and sorting them together with strings would raise `TypeError` -- confirm how
    # nulls in a segmentation column should be handled
    return [(segments_expr, category) for category in sorted(seg_categories)]
|
|
11391
|
+
|
|
11392
|
+
|
|
11393
|
+
def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
|
|
11394
|
+
"""
|
|
11395
|
+
Normalize the segments expression to a list of tuples, given a single tuple.
|
|
11396
|
+
|
|
11397
|
+
The `segments_expr` value will have been checked to be a tuple, so there's no need to check for
|
|
11398
|
+
that here. The function will return a list of tuples representing pairings of a column name and
|
|
11399
|
+
a value. The task is to normalize the tuple into a list of tuples of the form:
|
|
11400
|
+
`(column, value)`.
|
|
11401
|
+
|
|
11402
|
+
The following examples show how this normalzation works:
|
|
11403
|
+
- `("col", "value")` -> `[("col", "value")]` (single tuple, upgraded to a list of tuples)
|
|
11404
|
+
- `("col", ["value1", "value2"])` -> `[("col", "value1"), ("col", "value2")]` (tuple with a list
|
|
11405
|
+
of values, expanded into multiple tuples within a list)
|
|
11406
|
+
|
|
11407
|
+
This function is used to create a list of segments for the validation step. And since there will
|
|
11408
|
+
usually be more than one segment, the validation step will be expanded into multiple during
|
|
11409
|
+
interrogation (where this function is called).
|
|
11410
|
+
|
|
11411
|
+
Parameters
|
|
11412
|
+
----------
|
|
11413
|
+
segments_expr
|
|
11414
|
+
The segments expression to normalize. It can be a tuple of the form
|
|
11415
|
+
`(column, value)` or `(column, [value1, value2])`.
|
|
11416
|
+
|
|
11417
|
+
Returns
|
|
11418
|
+
-------
|
|
11419
|
+
list[tuple[str, str]]
|
|
11420
|
+
A list of tuples representing pairings of a column name and a value in the column.
|
|
11421
|
+
"""
|
|
11422
|
+
# Check if the first element is a string
|
|
11423
|
+
if isinstance(segments_expr[0], str):
|
|
11424
|
+
# If the second element is a list, create a list of tuples
|
|
11425
|
+
if isinstance(segments_expr[1], list):
|
|
11426
|
+
seg_tuples = [(segments_expr[0], value) for value in segments_expr[1]]
|
|
11427
|
+
# If the second element is not a list, create a single tuple
|
|
11428
|
+
else:
|
|
11429
|
+
seg_tuples = [(segments_expr[0], segments_expr[1])]
|
|
11430
|
+
# If the first element is not a string, raise an error
|
|
11431
|
+
else: # pragma: no cover
|
|
11432
|
+
raise ValueError("The first element of the segments expression must be a string.")
|
|
11433
|
+
|
|
11434
|
+
return seg_tuples
|
|
11435
|
+
|
|
11436
|
+
|
|
11437
|
+
def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any:
    """
    Restrict a data table to the rows belonging to a single segment.

    The `segments_expr=` value is a `(column, value)` pair: the rows kept are those where the
    named column is equal to the given value.

    Parameters
    ----------
    data_tbl
        The data table to filter. It can be a Pandas DataFrame, Polars DataFrame, or an Ibis
        backend table.
    segments_expr
        The segments expression to apply. It is a tuple of the form `(column, value)`.

    Returns
    -------
    any
        The filtered data table. It will be of the same type as the input table.
    """
    kind = _get_tbl_type(data=data_tbl)

    # Pandas and Polars DataFrames: wrap as a Narwhals table, filter there, and unwrap back to
    # the original table type
    if kind in ["pandas", "polars"]:
        wrapped = nw.from_native(data_tbl)
        keep_rows = nw.col(segments_expr[0]) == segments_expr[1]
        return wrapped.filter(keep_rows).to_native()

    # Ibis backend tables: filter directly on the table with a boolean column expression
    if kind in IBIS_BACKENDS:
        keep_rows = data_tbl[segments_expr[0]] == segments_expr[1]
        return data_tbl[keep_rows]

    # Any other table type is handed back without filtering
    return data_tbl
|
|
11480
|
+
|
|
11481
|
+
|
|
10488
11482
|
def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
|
|
10489
11483
|
"""
|
|
10490
11484
|
Convert a `_ValidationInfo` object to a dictionary.
|
|
@@ -10509,6 +11503,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
|
|
|
10509
11503
|
"inclusive",
|
|
10510
11504
|
"na_pass",
|
|
10511
11505
|
"pre",
|
|
11506
|
+
"segments",
|
|
10512
11507
|
"label",
|
|
10513
11508
|
"brief",
|
|
10514
11509
|
"autobrief",
|
|
@@ -10623,7 +11618,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
|
|
|
10623
11618
|
return title_text
|
|
10624
11619
|
|
|
10625
11620
|
|
|
10626
|
-
def _transform_tbl_preprocessed(pre:
|
|
11621
|
+
def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
|
|
10627
11622
|
# If no interrogation was performed, return a list of empty strings
|
|
10628
11623
|
if not interrogation_performed:
|
|
10629
11624
|
return ["" for _ in range(len(pre))]
|
|
@@ -10632,11 +11627,13 @@ def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list
|
|
|
10632
11627
|
# (either 'unchanged' (None) or 'modified' (not None))
|
|
10633
11628
|
status_list = []
|
|
10634
11629
|
|
|
10635
|
-
for
|
|
10636
|
-
if
|
|
10637
|
-
status_list.append("
|
|
10638
|
-
|
|
11630
|
+
for i in range(len(pre)):
|
|
11631
|
+
if seg[i] is not None:
|
|
11632
|
+
status_list.append("segmented")
|
|
11633
|
+
elif pre[i] is not None:
|
|
10639
11634
|
status_list.append("modified")
|
|
11635
|
+
else:
|
|
11636
|
+
status_list.append("unchanged")
|
|
10640
11637
|
|
|
10641
11638
|
return _get_preprocessed_table_icon(icon=status_list)
|
|
10642
11639
|
|
|
@@ -10744,7 +11741,11 @@ def _transform_w_e_c(values, color, interrogation_performed):
|
|
|
10744
11741
|
|
|
10745
11742
|
|
|
10746
11743
|
def _transform_assertion_str(
|
|
10747
|
-
assertion_str: list[str],
|
|
11744
|
+
assertion_str: list[str],
|
|
11745
|
+
brief_str: list[str | None],
|
|
11746
|
+
autobrief_str: list[str],
|
|
11747
|
+
segmentation_str: list[tuple | None],
|
|
11748
|
+
lang: str,
|
|
10748
11749
|
) -> list[str]:
|
|
10749
11750
|
# Get the SVG icons for the assertion types
|
|
10750
11751
|
svg_icon = _get_assertion_icon(icon=assertion_str)
|
|
@@ -10805,6 +11806,26 @@ def _transform_assertion_str(
|
|
|
10805
11806
|
for assertion, svg, size, brief_div in zip(assertion_str, svg_icon, text_size, brief_divs)
|
|
10806
11807
|
]
|
|
10807
11808
|
|
|
11809
|
+
# If the `segments` list is not empty, prepend a segmentation div to the `type_upd` strings
|
|
11810
|
+
if segmentation_str:
|
|
11811
|
+
for i in range(len(type_upd)):
|
|
11812
|
+
if segmentation_str[i] is not None:
|
|
11813
|
+
# Get the column name and value from the segmentation expression
|
|
11814
|
+
column_name = segmentation_str[i][0]
|
|
11815
|
+
column_value = segmentation_str[i][1]
|
|
11816
|
+
# Create the segmentation div
|
|
11817
|
+
segmentation_div = (
|
|
11818
|
+
"<div style='margin-top: 0px; margin-bottom: 0px; "
|
|
11819
|
+
"white-space: pre; font-size: 8px; color: darkblue; padding-bottom: 4px; "
|
|
11820
|
+
"'>"
|
|
11821
|
+
"<strong><span style='font-family: Helvetica, arial, sans-serif;'>"
|
|
11822
|
+
f"SEGMENT </span></strong><span>{column_name} / {column_value}"
|
|
11823
|
+
"</span>"
|
|
11824
|
+
"</div>"
|
|
11825
|
+
)
|
|
11826
|
+
# Prepend the segmentation div to the type_upd string
|
|
11827
|
+
type_upd[i] = f"{segmentation_div} {type_upd[i]}"
|
|
11828
|
+
|
|
10808
11829
|
return type_upd
|
|
10809
11830
|
|
|
10810
11831
|
|