pointblank 0.8.7__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +11 -10
- pointblank/_typing.py +19 -3
- pointblank/data/api-docs.txt +716 -49
- pointblank/datascan.py +4 -4
- pointblank/draft.py +1 -1
- pointblank/thresholds.py +10 -0
- pointblank/validate.py +1061 -48
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/METADATA +6 -2
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/RECORD +12 -12
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/WHEEL +1 -1
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.8.7.dist-info → pointblank-0.9.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
|
@@ -7,6 +7,7 @@ import datetime
|
|
|
7
7
|
import inspect
|
|
8
8
|
import json
|
|
9
9
|
import re
|
|
10
|
+
import tempfile
|
|
10
11
|
import threading
|
|
11
12
|
from dataclasses import dataclass
|
|
12
13
|
from importlib.metadata import version
|
|
@@ -57,6 +58,7 @@ from pointblank._interrogation import (
|
|
|
57
58
|
RowCountMatch,
|
|
58
59
|
RowsDistinct,
|
|
59
60
|
)
|
|
61
|
+
from pointblank._typing import SegmentSpec
|
|
60
62
|
from pointblank._utils import (
|
|
61
63
|
_check_any_df_lib,
|
|
62
64
|
_check_invalid_fields,
|
|
@@ -119,16 +121,18 @@ def _action_context_manager(metadata):
|
|
|
119
121
|
delattr(_action_context, "metadata")
|
|
120
122
|
|
|
121
123
|
|
|
122
|
-
def get_action_metadata():
|
|
124
|
+
def get_action_metadata() -> dict | None:
|
|
123
125
|
"""Access step-level metadata when authoring custom actions.
|
|
124
126
|
|
|
125
127
|
Get the metadata for the validation step where an action was triggered. This can be called by
|
|
126
|
-
user functions to get the metadata for the current action.
|
|
128
|
+
user functions to get the metadata for the current action. This function can only be used within
|
|
129
|
+
callables crafted for the [`Actions`](`pointblank.Actions`) class.
|
|
127
130
|
|
|
128
131
|
Returns
|
|
129
132
|
-------
|
|
130
|
-
dict
|
|
131
|
-
A dictionary containing the metadata for the current step.
|
|
133
|
+
dict | None
|
|
134
|
+
A dictionary containing the metadata for the current step. If called outside of an action
|
|
135
|
+
(i.e., when no action is being executed), this function will return `None`.
|
|
132
136
|
|
|
133
137
|
Description of the Metadata Fields
|
|
134
138
|
----------------------------------
|
|
@@ -163,7 +167,7 @@ def get_action_metadata():
|
|
|
163
167
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
164
168
|
actions=pb.Actions(warning=log_issue),
|
|
165
169
|
)
|
|
166
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
|
|
170
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
167
171
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
168
172
|
.col_vals_gt(
|
|
169
173
|
columns="session_duration",
|
|
@@ -181,6 +185,11 @@ def get_action_metadata():
|
|
|
181
185
|
- the `metadata` is a dictionary that is used to craft the log message
|
|
182
186
|
- the action is passed as a bare function to the `Actions` object within the `Validate` object
|
|
183
187
|
(placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
|
|
188
|
+
|
|
189
|
+
See Also
|
|
190
|
+
--------
|
|
191
|
+
Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
|
|
192
|
+
actions for validation steps that exceed a set threshold value.
|
|
184
193
|
"""
|
|
185
194
|
if hasattr(_action_context, "metadata"): # pragma: no cover
|
|
186
195
|
return _action_context.metadata # pragma: no cover
|
|
@@ -204,17 +213,19 @@ def _final_action_context_manager(summary):
|
|
|
204
213
|
delattr(_final_action_context, "summary")
|
|
205
214
|
|
|
206
215
|
|
|
207
|
-
def get_validation_summary():
|
|
216
|
+
def get_validation_summary() -> dict | None:
|
|
208
217
|
"""Access validation summary information when authoring final actions.
|
|
209
218
|
|
|
210
219
|
This function provides a convenient way to access summary information about the validation
|
|
211
220
|
process within a final action. It returns a dictionary with key metrics from the validation
|
|
212
|
-
process.
|
|
221
|
+
process. This function can only be used within callables crafted for the
|
|
222
|
+
[`FinalActions`](`pointblank.FinalActions`) class.
|
|
213
223
|
|
|
214
224
|
Returns
|
|
215
225
|
-------
|
|
216
226
|
dict | None
|
|
217
|
-
A dictionary containing validation metrics
|
|
227
|
+
A dictionary containing validation metrics. If called outside of a final action context,
|
|
228
|
+
this function will return `None`.
|
|
218
229
|
|
|
219
230
|
Description of the Summary Fields
|
|
220
231
|
--------------------------------
|
|
@@ -304,6 +315,11 @@ def get_validation_summary():
|
|
|
304
315
|
|
|
305
316
|
Final actions work well with both simple logging and more complex notification systems, allowing
|
|
306
317
|
you to integrate validation results into your broader data quality workflows.
|
|
318
|
+
|
|
319
|
+
See Also
|
|
320
|
+
--------
|
|
321
|
+
Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
|
|
322
|
+
custom actions that are executed after all validation steps have been completed.
|
|
307
323
|
"""
|
|
308
324
|
if hasattr(_final_action_context, "summary"):
|
|
309
325
|
return _final_action_context.summary
|
|
@@ -516,10 +532,10 @@ def load_dataset(
|
|
|
516
532
|
data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
|
|
517
533
|
|
|
518
534
|
# Unzip the DuckDB dataset to a temporary directory
|
|
519
|
-
with ZipFile(data_path, "r") as z:
|
|
520
|
-
z.extractall(path=
|
|
535
|
+
with tempfile.TemporaryDirectory() as tmp, ZipFile(data_path, "r") as z:
|
|
536
|
+
z.extractall(path=tmp)
|
|
521
537
|
|
|
522
|
-
data_path = f"
|
|
538
|
+
data_path = f"{tmp}/{dataset}.ddb"
|
|
523
539
|
|
|
524
540
|
dataset = ibis.connect(f"duckdb://{data_path}").table(dataset)
|
|
525
541
|
|
|
@@ -1783,14 +1799,15 @@ class _ValidationInfo:
|
|
|
1783
1799
|
assertion_type
|
|
1784
1800
|
The type of assertion. This is the method name of the validation (e.g., `"col_vals_gt"`).
|
|
1785
1801
|
column
|
|
1786
|
-
The column to validate.
|
|
1787
|
-
multiple columns).
|
|
1802
|
+
The column(s) to validate.
|
|
1788
1803
|
values
|
|
1789
1804
|
The value or values to compare against.
|
|
1790
1805
|
na_pass
|
|
1791
1806
|
Whether to pass test units that hold missing values.
|
|
1792
1807
|
pre
|
|
1793
1808
|
A preprocessing function or lambda to apply to the data table for the validation step.
|
|
1809
|
+
segments
|
|
1810
|
+
The segments to use for the validation step.
|
|
1794
1811
|
thresholds
|
|
1795
1812
|
The threshold values for the validation.
|
|
1796
1813
|
actions
|
|
@@ -1841,11 +1858,12 @@ class _ValidationInfo:
|
|
|
1841
1858
|
step_id: str | None = None
|
|
1842
1859
|
sha1: str | None = None
|
|
1843
1860
|
assertion_type: str | None = None
|
|
1844
|
-
column:
|
|
1861
|
+
column: any | None = None
|
|
1845
1862
|
values: any | list[any] | tuple | None = None
|
|
1846
1863
|
inclusive: tuple[bool, bool] | None = None
|
|
1847
1864
|
na_pass: bool | None = None
|
|
1848
1865
|
pre: Callable | None = None
|
|
1866
|
+
segments: any | None = None
|
|
1849
1867
|
thresholds: Thresholds | None = None
|
|
1850
1868
|
actions: Actions | None = None
|
|
1851
1869
|
label: str | None = None
|
|
@@ -1909,7 +1927,7 @@ class Validate:
|
|
|
1909
1927
|
The table to validate, which could be a DataFrame object or an Ibis table object. Read the
|
|
1910
1928
|
*Supported Input Table Types* section for details on the supported table types.
|
|
1911
1929
|
tbl_name
|
|
1912
|
-
|
|
1930
|
+
An optional name to assign to the input table object. If no value is provided, a name will
|
|
1913
1931
|
be generated based on whatever information is available. This table name will be displayed
|
|
1914
1932
|
in the header area of the tabular report.
|
|
1915
1933
|
label
|
|
@@ -2323,6 +2341,7 @@ class Validate:
|
|
|
2323
2341
|
value: float | int | Column,
|
|
2324
2342
|
na_pass: bool = False,
|
|
2325
2343
|
pre: Callable | None = None,
|
|
2344
|
+
segments: SegmentSpec | None = None,
|
|
2326
2345
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2327
2346
|
actions: Actions | None = None,
|
|
2328
2347
|
brief: str | bool | None = None,
|
|
@@ -2354,10 +2373,15 @@ class Validate:
|
|
|
2354
2373
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2355
2374
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2356
2375
|
pre
|
|
2357
|
-
|
|
2376
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2358
2377
|
interrogation. This function should take a table as input and return a modified table.
|
|
2359
2378
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2360
2379
|
argument.
|
|
2380
|
+
segments
|
|
2381
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2382
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2383
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2384
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2361
2385
|
thresholds
|
|
2362
2386
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2363
2387
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2420,6 +2444,42 @@ class Validate:
|
|
|
2420
2444
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2421
2445
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2422
2446
|
|
|
2447
|
+
Segmentation
|
|
2448
|
+
------------
|
|
2449
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2450
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2451
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2452
|
+
column.
|
|
2453
|
+
|
|
2454
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2455
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2456
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2457
|
+
region.
|
|
2458
|
+
|
|
2459
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2460
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2461
|
+
segment on only specific dates, you can provide a tuple like
|
|
2462
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2463
|
+
(i.e., no validation steps will be created for them).
|
|
2464
|
+
|
|
2465
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2466
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2467
|
+
|
|
2468
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2469
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2470
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2471
|
+
columns
|
|
2472
|
+
|
|
2473
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2474
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2475
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2476
|
+
identify issues within specific segments.
|
|
2477
|
+
|
|
2478
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2479
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2480
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2481
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2482
|
+
|
|
2423
2483
|
Thresholds
|
|
2424
2484
|
----------
|
|
2425
2485
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2518,6 +2578,8 @@ class Validate:
|
|
|
2518
2578
|
_check_column(column=columns)
|
|
2519
2579
|
# _check_value_float_int(value=value)
|
|
2520
2580
|
_check_pre(pre=pre)
|
|
2581
|
+
# TODO: add check for segments
|
|
2582
|
+
# _check_segments(segments=segments)
|
|
2521
2583
|
_check_thresholds(thresholds=thresholds)
|
|
2522
2584
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
2523
2585
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -2550,6 +2612,7 @@ class Validate:
|
|
|
2550
2612
|
values=value,
|
|
2551
2613
|
na_pass=na_pass,
|
|
2552
2614
|
pre=pre,
|
|
2615
|
+
segments=segments,
|
|
2553
2616
|
thresholds=thresholds,
|
|
2554
2617
|
actions=actions,
|
|
2555
2618
|
brief=brief,
|
|
@@ -2566,6 +2629,7 @@ class Validate:
|
|
|
2566
2629
|
value: float | int | Column,
|
|
2567
2630
|
na_pass: bool = False,
|
|
2568
2631
|
pre: Callable | None = None,
|
|
2632
|
+
segments: SegmentSpec | None = None,
|
|
2569
2633
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2570
2634
|
actions: Actions | None = None,
|
|
2571
2635
|
brief: str | bool | None = None,
|
|
@@ -2597,10 +2661,15 @@ class Validate:
|
|
|
2597
2661
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2598
2662
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2599
2663
|
pre
|
|
2600
|
-
|
|
2664
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2601
2665
|
interrogation. This function should take a table as input and return a modified table.
|
|
2602
2666
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2603
2667
|
argument.
|
|
2668
|
+
segments
|
|
2669
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2670
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2671
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2672
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2604
2673
|
thresholds
|
|
2605
2674
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2606
2675
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2663,6 +2732,42 @@ class Validate:
|
|
|
2663
2732
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2664
2733
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2665
2734
|
|
|
2735
|
+
Segmentation
|
|
2736
|
+
------------
|
|
2737
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
2738
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
2739
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
2740
|
+
column.
|
|
2741
|
+
|
|
2742
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
2743
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
2744
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
2745
|
+
region.
|
|
2746
|
+
|
|
2747
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
2748
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
2749
|
+
segment on only specific dates, you can provide a tuple like
|
|
2750
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
2751
|
+
(i.e., no validation steps will be created for them).
|
|
2752
|
+
|
|
2753
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2754
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
2755
|
+
|
|
2756
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
2757
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
2758
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
2759
|
+
columns
|
|
2760
|
+
|
|
2761
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2762
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
2763
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
2764
|
+
identify issues within specific segments.
|
|
2765
|
+
|
|
2766
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
2767
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
2768
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
2769
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
2770
|
+
|
|
2666
2771
|
Thresholds
|
|
2667
2772
|
----------
|
|
2668
2773
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -2760,6 +2865,8 @@ class Validate:
|
|
|
2760
2865
|
_check_column(column=columns)
|
|
2761
2866
|
# _check_value_float_int(value=value)
|
|
2762
2867
|
_check_pre(pre=pre)
|
|
2868
|
+
# TODO: add check for segments
|
|
2869
|
+
# _check_segments(segments=segments)
|
|
2763
2870
|
_check_thresholds(thresholds=thresholds)
|
|
2764
2871
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
2765
2872
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -2792,6 +2899,7 @@ class Validate:
|
|
|
2792
2899
|
values=value,
|
|
2793
2900
|
na_pass=na_pass,
|
|
2794
2901
|
pre=pre,
|
|
2902
|
+
segments=segments,
|
|
2795
2903
|
thresholds=thresholds,
|
|
2796
2904
|
actions=actions,
|
|
2797
2905
|
brief=brief,
|
|
@@ -2808,6 +2916,7 @@ class Validate:
|
|
|
2808
2916
|
value: float | int | Column,
|
|
2809
2917
|
na_pass: bool = False,
|
|
2810
2918
|
pre: Callable | None = None,
|
|
2919
|
+
segments: SegmentSpec | None = None,
|
|
2811
2920
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
2812
2921
|
actions: Actions | None = None,
|
|
2813
2922
|
brief: str | bool | None = None,
|
|
@@ -2839,10 +2948,15 @@ class Validate:
|
|
|
2839
2948
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
2840
2949
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
2841
2950
|
pre
|
|
2842
|
-
|
|
2951
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
2843
2952
|
interrogation. This function should take a table as input and return a modified table.
|
|
2844
2953
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
2845
2954
|
argument.
|
|
2955
|
+
segments
|
|
2956
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
2957
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
2958
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
2959
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
2846
2960
|
thresholds
|
|
2847
2961
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
2848
2962
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -2905,6 +3019,42 @@ class Validate:
|
|
|
2905
3019
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
2906
3020
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
2907
3021
|
|
|
3022
|
+
Segmentation
|
|
3023
|
+
------------
|
|
3024
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3025
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3026
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3027
|
+
column.
|
|
3028
|
+
|
|
3029
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3030
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3031
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3032
|
+
region.
|
|
3033
|
+
|
|
3034
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3035
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3036
|
+
segment on only specific dates, you can provide a tuple like
|
|
3037
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3038
|
+
(i.e., no validation steps will be created for them).
|
|
3039
|
+
|
|
3040
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3041
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3042
|
+
|
|
3043
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3044
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3045
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3046
|
+
columns
|
|
3047
|
+
|
|
3048
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3049
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3050
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3051
|
+
identify issues within specific segments.
|
|
3052
|
+
|
|
3053
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3054
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3055
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3056
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3057
|
+
|
|
2908
3058
|
Thresholds
|
|
2909
3059
|
----------
|
|
2910
3060
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3001,6 +3151,8 @@ class Validate:
|
|
|
3001
3151
|
_check_column(column=columns)
|
|
3002
3152
|
# _check_value_float_int(value=value)
|
|
3003
3153
|
_check_pre(pre=pre)
|
|
3154
|
+
# TODO: add check for segments
|
|
3155
|
+
# _check_segments(segments=segments)
|
|
3004
3156
|
_check_thresholds(thresholds=thresholds)
|
|
3005
3157
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3006
3158
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3033,6 +3185,7 @@ class Validate:
|
|
|
3033
3185
|
values=value,
|
|
3034
3186
|
na_pass=na_pass,
|
|
3035
3187
|
pre=pre,
|
|
3188
|
+
segments=segments,
|
|
3036
3189
|
thresholds=thresholds,
|
|
3037
3190
|
actions=actions,
|
|
3038
3191
|
brief=brief,
|
|
@@ -3049,6 +3202,7 @@ class Validate:
|
|
|
3049
3202
|
value: float | int | Column,
|
|
3050
3203
|
na_pass: bool = False,
|
|
3051
3204
|
pre: Callable | None = None,
|
|
3205
|
+
segments: SegmentSpec | None = None,
|
|
3052
3206
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3053
3207
|
actions: Actions | None = None,
|
|
3054
3208
|
brief: str | bool | None = None,
|
|
@@ -3080,10 +3234,15 @@ class Validate:
|
|
|
3080
3234
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3081
3235
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3082
3236
|
pre
|
|
3083
|
-
|
|
3237
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3084
3238
|
interrogation. This function should take a table as input and return a modified table.
|
|
3085
3239
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3086
3240
|
argument.
|
|
3241
|
+
segments
|
|
3242
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3243
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3244
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3245
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3087
3246
|
thresholds
|
|
3088
3247
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3089
3248
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3146,6 +3305,42 @@ class Validate:
|
|
|
3146
3305
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3147
3306
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3148
3307
|
|
|
3308
|
+
Segmentation
|
|
3309
|
+
------------
|
|
3310
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3311
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3312
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
3313
|
+
column.
|
|
3314
|
+
|
|
3315
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3316
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3317
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3318
|
+
region.
|
|
3319
|
+
|
|
3320
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3321
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3322
|
+
segment on only specific dates, you can provide a tuple like
|
|
3323
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3324
|
+
(i.e., no validation steps will be created for them).
|
|
3325
|
+
|
|
3326
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3327
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3328
|
+
|
|
3329
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3330
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3331
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3332
|
+
columns
|
|
3333
|
+
|
|
3334
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3335
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3336
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3337
|
+
identify issues within specific segments.
|
|
3338
|
+
|
|
3339
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3340
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3341
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3342
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3343
|
+
|
|
3149
3344
|
Thresholds
|
|
3150
3345
|
----------
|
|
3151
3346
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3240,6 +3435,8 @@ class Validate:
|
|
|
3240
3435
|
_check_column(column=columns)
|
|
3241
3436
|
# _check_value_float_int(value=value)
|
|
3242
3437
|
_check_pre(pre=pre)
|
|
3438
|
+
# TODO: add check for segments
|
|
3439
|
+
# _check_segments(segments=segments)
|
|
3243
3440
|
_check_thresholds(thresholds=thresholds)
|
|
3244
3441
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3245
3442
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3272,6 +3469,7 @@ class Validate:
|
|
|
3272
3469
|
values=value,
|
|
3273
3470
|
na_pass=na_pass,
|
|
3274
3471
|
pre=pre,
|
|
3472
|
+
segments=segments,
|
|
3275
3473
|
thresholds=thresholds,
|
|
3276
3474
|
actions=actions,
|
|
3277
3475
|
brief=brief,
|
|
@@ -3288,6 +3486,7 @@ class Validate:
|
|
|
3288
3486
|
value: float | int | Column,
|
|
3289
3487
|
na_pass: bool = False,
|
|
3290
3488
|
pre: Callable | None = None,
|
|
3489
|
+
segments: SegmentSpec | None = None,
|
|
3291
3490
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3292
3491
|
actions: Actions | None = None,
|
|
3293
3492
|
brief: str | bool | None = None,
|
|
@@ -3319,10 +3518,15 @@ class Validate:
|
|
|
3319
3518
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3320
3519
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3321
3520
|
pre
|
|
3322
|
-
|
|
3521
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3323
3522
|
interrogation. This function should take a table as input and return a modified table.
|
|
3324
3523
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3325
3524
|
argument.
|
|
3525
|
+
segments
|
|
3526
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3527
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
3528
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3529
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3326
3530
|
thresholds
|
|
3327
3531
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3328
3532
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3385,6 +3589,42 @@ class Validate:
|
|
|
3385
3589
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3386
3590
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3387
3591
|
|
|
3592
|
+
Segmentation
|
|
3593
|
+
------------
|
|
3594
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3595
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3596
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3597
|
+
column.
|
|
3598
|
+
|
|
3599
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3600
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3601
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3602
|
+
region.
|
|
3603
|
+
|
|
3604
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3605
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3606
|
+
segment on only specific dates, you can provide a tuple like
|
|
3607
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3608
|
+
(i.e., no validation steps will be created for them).
|
|
3609
|
+
|
|
3610
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3611
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3612
|
+
|
|
3613
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3614
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3615
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3616
|
+
columns
|
|
3617
|
+
|
|
3618
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3619
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3620
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3621
|
+
identify issues within specific segments.
|
|
3622
|
+
|
|
3623
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3624
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3625
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3626
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3627
|
+
|
|
3388
3628
|
Thresholds
|
|
3389
3629
|
----------
|
|
3390
3630
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3483,6 +3723,8 @@ class Validate:
|
|
|
3483
3723
|
_check_column(column=columns)
|
|
3484
3724
|
# _check_value_float_int(value=value)
|
|
3485
3725
|
_check_pre(pre=pre)
|
|
3726
|
+
# TODO: add check for segments
|
|
3727
|
+
# _check_segments(segments=segments)
|
|
3486
3728
|
_check_thresholds(thresholds=thresholds)
|
|
3487
3729
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3488
3730
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3515,6 +3757,7 @@ class Validate:
|
|
|
3515
3757
|
values=value,
|
|
3516
3758
|
na_pass=na_pass,
|
|
3517
3759
|
pre=pre,
|
|
3760
|
+
segments=segments,
|
|
3518
3761
|
thresholds=thresholds,
|
|
3519
3762
|
actions=actions,
|
|
3520
3763
|
brief=brief,
|
|
@@ -3531,6 +3774,7 @@ class Validate:
|
|
|
3531
3774
|
value: float | int | Column,
|
|
3532
3775
|
na_pass: bool = False,
|
|
3533
3776
|
pre: Callable | None = None,
|
|
3777
|
+
segments: SegmentSpec | None = None,
|
|
3534
3778
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3535
3779
|
actions: Actions | None = None,
|
|
3536
3780
|
brief: str | bool | None = None,
|
|
@@ -3562,10 +3806,15 @@ class Validate:
|
|
|
3562
3806
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3563
3807
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3564
3808
|
pre
|
|
3565
|
-
|
|
3809
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3566
3810
|
interrogation. This function should take a table as input and return a modified table.
|
|
3567
3811
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3568
3812
|
argument.
|
|
3813
|
+
segments
|
|
3814
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
3815
|
+
multiple steps (one per segment). Can be a single column name, a tuple that specifies a
|
|
3816
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
3817
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3569
3818
|
thresholds
|
|
3570
3819
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3571
3820
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3628,6 +3877,42 @@ class Validate:
|
|
|
3628
3877
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3629
3878
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3630
3879
|
|
|
3880
|
+
Segmentation
|
|
3881
|
+
------------
|
|
3882
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
3883
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
3884
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
3885
|
+
column.
|
|
3886
|
+
|
|
3887
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
3888
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
3889
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
3890
|
+
region.
|
|
3891
|
+
|
|
3892
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
3893
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
3894
|
+
segment on only specific dates, you can provide a tuple like
|
|
3895
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
3896
|
+
(i.e., no validation steps will be created for them).
|
|
3897
|
+
|
|
3898
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3899
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
3900
|
+
|
|
3901
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
3902
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
3903
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
3904
|
+
columns
|
|
3905
|
+
|
|
3906
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3907
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
3908
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
3909
|
+
identify issues within specific segments.
|
|
3910
|
+
|
|
3911
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
3912
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
3913
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
3914
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
3915
|
+
|
|
3631
3916
|
Thresholds
|
|
3632
3917
|
----------
|
|
3633
3918
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3726,6 +4011,8 @@ class Validate:
|
|
|
3726
4011
|
_check_column(column=columns)
|
|
3727
4012
|
# _check_value_float_int(value=value)
|
|
3728
4013
|
_check_pre(pre=pre)
|
|
4014
|
+
# TODO: add check for segments
|
|
4015
|
+
# _check_segments(segments=segments)
|
|
3729
4016
|
_check_thresholds(thresholds=thresholds)
|
|
3730
4017
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3731
4018
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -3758,6 +4045,7 @@ class Validate:
|
|
|
3758
4045
|
values=value,
|
|
3759
4046
|
na_pass=na_pass,
|
|
3760
4047
|
pre=pre,
|
|
4048
|
+
segments=segments,
|
|
3761
4049
|
thresholds=thresholds,
|
|
3762
4050
|
actions=actions,
|
|
3763
4051
|
brief=brief,
|
|
@@ -3776,6 +4064,7 @@ class Validate:
|
|
|
3776
4064
|
inclusive: tuple[bool, bool] = (True, True),
|
|
3777
4065
|
na_pass: bool = False,
|
|
3778
4066
|
pre: Callable | None = None,
|
|
4067
|
+
segments: SegmentSpec | None = None,
|
|
3779
4068
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
3780
4069
|
actions: Actions | None = None,
|
|
3781
4070
|
brief: str | bool | None = None,
|
|
@@ -3817,10 +4106,15 @@ class Validate:
|
|
|
3817
4106
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
3818
4107
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
3819
4108
|
pre
|
|
3820
|
-
|
|
4109
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
3821
4110
|
interrogation. This function should take a table as input and return a modified table.
|
|
3822
4111
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
3823
4112
|
argument.
|
|
4113
|
+
segments
|
|
4114
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4115
|
+
multiple steps (one per segment). Can be a single column name, a tuple that specifies a
|
|
4116
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4117
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
3824
4118
|
thresholds
|
|
3825
4119
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
3826
4120
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -3885,6 +4179,42 @@ class Validate:
|
|
|
3885
4179
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
3886
4180
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
3887
4181
|
|
|
4182
|
+
Segmentation
|
|
4183
|
+
------------
|
|
4184
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4185
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4186
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4187
|
+
column.
|
|
4188
|
+
|
|
4189
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4190
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4191
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4192
|
+
region.
|
|
4193
|
+
|
|
4194
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4195
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4196
|
+
segment on only specific dates, you can provide a tuple like
|
|
4197
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4198
|
+
(i.e., no validation steps will be created for them).
|
|
4199
|
+
|
|
4200
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4201
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4202
|
+
|
|
4203
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4204
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4205
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4206
|
+
columns
|
|
4207
|
+
|
|
4208
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4209
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4210
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4211
|
+
identify issues within specific segments.
|
|
4212
|
+
|
|
4213
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4214
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4215
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4216
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4217
|
+
|
|
3888
4218
|
Thresholds
|
|
3889
4219
|
----------
|
|
3890
4220
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -3992,6 +4322,8 @@ class Validate:
|
|
|
3992
4322
|
# _check_value_float_int(value=left)
|
|
3993
4323
|
# _check_value_float_int(value=right)
|
|
3994
4324
|
_check_pre(pre=pre)
|
|
4325
|
+
# TODO: add check for segments
|
|
4326
|
+
# _check_segments(segments=segments)
|
|
3995
4327
|
_check_thresholds(thresholds=thresholds)
|
|
3996
4328
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
3997
4329
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -4029,6 +4361,7 @@ class Validate:
|
|
|
4029
4361
|
inclusive=inclusive,
|
|
4030
4362
|
na_pass=na_pass,
|
|
4031
4363
|
pre=pre,
|
|
4364
|
+
segments=segments,
|
|
4032
4365
|
thresholds=thresholds,
|
|
4033
4366
|
actions=actions,
|
|
4034
4367
|
brief=brief,
|
|
@@ -4047,6 +4380,7 @@ class Validate:
|
|
|
4047
4380
|
inclusive: tuple[bool, bool] = (True, True),
|
|
4048
4381
|
na_pass: bool = False,
|
|
4049
4382
|
pre: Callable | None = None,
|
|
4383
|
+
segments: SegmentSpec | None = None,
|
|
4050
4384
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4051
4385
|
actions: Actions | None = None,
|
|
4052
4386
|
brief: str | bool | None = None,
|
|
@@ -4088,10 +4422,15 @@ class Validate:
|
|
|
4088
4422
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
4089
4423
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
4090
4424
|
pre
|
|
4091
|
-
|
|
4425
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4092
4426
|
interrogation. This function should take a table as input and return a modified table.
|
|
4093
4427
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4094
4428
|
argument.
|
|
4429
|
+
segments
|
|
4430
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4431
|
+
multiple steps (one per segment). Can be a single column name, a tuple that specifies a
|
|
4432
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4433
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4095
4434
|
thresholds
|
|
4096
4435
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4097
4436
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4156,6 +4495,42 @@ class Validate:
|
|
|
4156
4495
|
lifetime of the transformed table, it only exists during the validation step and is not
|
|
4157
4496
|
stored in the `Validate` object or used in subsequent validation steps.
|
|
4158
4497
|
|
|
4498
|
+
Segmentation
|
|
4499
|
+
------------
|
|
4500
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4501
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4502
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4503
|
+
column.
|
|
4504
|
+
|
|
4505
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4506
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4507
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4508
|
+
region.
|
|
4509
|
+
|
|
4510
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4511
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4512
|
+
segment on only specific dates, you can provide a tuple like
|
|
4513
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4514
|
+
(i.e., no validation steps will be created for them).
|
|
4515
|
+
|
|
4516
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4517
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4518
|
+
|
|
4519
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4520
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4521
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4522
|
+
columns
|
|
4523
|
+
|
|
4524
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4525
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4526
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4527
|
+
identify issues within specific segments.
|
|
4528
|
+
|
|
4529
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4530
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4531
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4532
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4533
|
+
|
|
4159
4534
|
Thresholds
|
|
4160
4535
|
----------
|
|
4161
4536
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4263,6 +4638,8 @@ class Validate:
|
|
|
4263
4638
|
# _check_value_float_int(value=left)
|
|
4264
4639
|
# _check_value_float_int(value=right)
|
|
4265
4640
|
_check_pre(pre=pre)
|
|
4641
|
+
# TODO: add check for segments
|
|
4642
|
+
# _check_segments(segments=segments)
|
|
4266
4643
|
_check_thresholds(thresholds=thresholds)
|
|
4267
4644
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
4268
4645
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -4300,6 +4677,7 @@ class Validate:
|
|
|
4300
4677
|
inclusive=inclusive,
|
|
4301
4678
|
na_pass=na_pass,
|
|
4302
4679
|
pre=pre,
|
|
4680
|
+
segments=segments,
|
|
4303
4681
|
thresholds=thresholds,
|
|
4304
4682
|
actions=actions,
|
|
4305
4683
|
brief=brief,
|
|
@@ -4315,6 +4693,7 @@ class Validate:
|
|
|
4315
4693
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4316
4694
|
set: Collection[Any],
|
|
4317
4695
|
pre: Callable | None = None,
|
|
4696
|
+
segments: SegmentSpec | None = None,
|
|
4318
4697
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4319
4698
|
actions: Actions | None = None,
|
|
4320
4699
|
brief: str | bool | None = None,
|
|
@@ -4338,10 +4717,15 @@ class Validate:
|
|
|
4338
4717
|
set
|
|
4339
4718
|
A list of values to compare against.
|
|
4340
4719
|
pre
|
|
4341
|
-
|
|
4720
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4342
4721
|
interrogation. This function should take a table as input and return a modified table.
|
|
4343
4722
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4344
4723
|
argument.
|
|
4724
|
+
segments
|
|
4725
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4726
|
+
multiple steps (one per segment). Can be a single column name, a tuple that specifies a
|
|
4727
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4728
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4345
4729
|
thresholds
|
|
4346
4730
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4347
4731
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4383,6 +4767,42 @@ class Validate:
|
|
|
4383
4767
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4384
4768
|
subsequent validation steps.
|
|
4385
4769
|
|
|
4770
|
+
Segmentation
|
|
4771
|
+
------------
|
|
4772
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
4773
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
4774
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
4775
|
+
column.
|
|
4776
|
+
|
|
4777
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
4778
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
4779
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
4780
|
+
region.
|
|
4781
|
+
|
|
4782
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
4783
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
4784
|
+
segment on only specific dates, you can provide a tuple like
|
|
4785
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
4786
|
+
(i.e., no validation steps will be created for them).
|
|
4787
|
+
|
|
4788
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4789
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
4790
|
+
|
|
4791
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
4792
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
4793
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
4794
|
+
columns
|
|
4795
|
+
|
|
4796
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4797
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
4798
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
4799
|
+
identify issues within specific segments.
|
|
4800
|
+
|
|
4801
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
4802
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
4803
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
4804
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
4805
|
+
|
|
4386
4806
|
Thresholds
|
|
4387
4807
|
----------
|
|
4388
4808
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4481,6 +4901,8 @@ class Validate:
|
|
|
4481
4901
|
raise ValueError("`set=` must be a list of floats, integers, or strings.")
|
|
4482
4902
|
|
|
4483
4903
|
_check_pre(pre=pre)
|
|
4904
|
+
# TODO: add check for segments
|
|
4905
|
+
# _check_segments(segments=segments)
|
|
4484
4906
|
_check_thresholds(thresholds=thresholds)
|
|
4485
4907
|
_check_boolean_input(param=active, param_name="active")
|
|
4486
4908
|
|
|
@@ -4508,6 +4930,7 @@ class Validate:
|
|
|
4508
4930
|
column=column,
|
|
4509
4931
|
values=set,
|
|
4510
4932
|
pre=pre,
|
|
4933
|
+
segments=segments,
|
|
4511
4934
|
thresholds=thresholds,
|
|
4512
4935
|
actions=actions,
|
|
4513
4936
|
brief=brief,
|
|
@@ -4523,6 +4946,7 @@ class Validate:
|
|
|
4523
4946
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4524
4947
|
set: list[float | int],
|
|
4525
4948
|
pre: Callable | None = None,
|
|
4949
|
+
segments: SegmentSpec | None = None,
|
|
4526
4950
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4527
4951
|
actions: Actions | None = None,
|
|
4528
4952
|
brief: str | bool | None = None,
|
|
@@ -4546,10 +4970,15 @@ class Validate:
|
|
|
4546
4970
|
set
|
|
4547
4971
|
A list of values to compare against.
|
|
4548
4972
|
pre
|
|
4549
|
-
|
|
4973
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4550
4974
|
interrogation. This function should take a table as input and return a modified table.
|
|
4551
4975
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4552
4976
|
argument.
|
|
4977
|
+
segments
|
|
4978
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
4979
|
+
multiple steps (one per segment). Can be a single column name, a tuple that specifies a
|
|
4980
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
4981
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4553
4982
|
thresholds
|
|
4554
4983
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4555
4984
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4591,6 +5020,42 @@ class Validate:
|
|
|
4591
5020
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4592
5021
|
subsequent validation steps.
|
|
4593
5022
|
|
|
5023
|
+
Segmentation
|
|
5024
|
+
------------
|
|
5025
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5026
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5027
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5028
|
+
column.
|
|
5029
|
+
|
|
5030
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5031
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5032
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5033
|
+
region.
|
|
5034
|
+
|
|
5035
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5036
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5037
|
+
segment on only specific dates, you can provide a tuple like
|
|
5038
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5039
|
+
(i.e., no validation steps will be created for them).
|
|
5040
|
+
|
|
5041
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5042
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5043
|
+
|
|
5044
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5045
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5046
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5047
|
+
columns
|
|
5048
|
+
|
|
5049
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5050
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5051
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5052
|
+
identify issues within specific segments.
|
|
5053
|
+
|
|
5054
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5055
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5056
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5057
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5058
|
+
|
|
4594
5059
|
Thresholds
|
|
4595
5060
|
----------
|
|
4596
5061
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4684,6 +5149,8 @@ class Validate:
|
|
|
4684
5149
|
_check_column(column=columns)
|
|
4685
5150
|
_check_set_types(set=set)
|
|
4686
5151
|
_check_pre(pre=pre)
|
|
5152
|
+
# TODO: add check for segments
|
|
5153
|
+
# _check_segments(segments=segments)
|
|
4687
5154
|
_check_thresholds(thresholds=thresholds)
|
|
4688
5155
|
_check_boolean_input(param=active, param_name="active")
|
|
4689
5156
|
|
|
@@ -4711,6 +5178,7 @@ class Validate:
|
|
|
4711
5178
|
column=column,
|
|
4712
5179
|
values=set,
|
|
4713
5180
|
pre=pre,
|
|
5181
|
+
segments=segments,
|
|
4714
5182
|
thresholds=thresholds,
|
|
4715
5183
|
actions=actions,
|
|
4716
5184
|
brief=brief,
|
|
@@ -4725,6 +5193,7 @@ class Validate:
|
|
|
4725
5193
|
self,
|
|
4726
5194
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4727
5195
|
pre: Callable | None = None,
|
|
5196
|
+
segments: SegmentSpec | None = None,
|
|
4728
5197
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4729
5198
|
actions: Actions | None = None,
|
|
4730
5199
|
brief: str | bool | None = None,
|
|
@@ -4745,10 +5214,15 @@ class Validate:
|
|
|
4745
5214
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
4746
5215
|
generated for each column.
|
|
4747
5216
|
pre
|
|
4748
|
-
|
|
5217
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4749
5218
|
interrogation. This function should take a table as input and return a modified table.
|
|
4750
5219
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4751
5220
|
argument.
|
|
5221
|
+
segments
|
|
5222
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5223
|
+
multiple steps (one per segment). Can be a single column name, a tuple that specifies a
|
|
5224
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5225
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4752
5226
|
thresholds
|
|
4753
5227
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4754
5228
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4790,6 +5264,42 @@ class Validate:
|
|
|
4790
5264
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4791
5265
|
subsequent validation steps.
|
|
4792
5266
|
|
|
5267
|
+
Segmentation
|
|
5268
|
+
------------
|
|
5269
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5270
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5271
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5272
|
+
column.
|
|
5273
|
+
|
|
5274
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5275
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5276
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5277
|
+
region.
|
|
5278
|
+
|
|
5279
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5280
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5281
|
+
segment on only specific dates, you can provide a tuple like
|
|
5282
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5283
|
+
(i.e., no validation steps will be created for them).
|
|
5284
|
+
|
|
5285
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5286
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5287
|
+
|
|
5288
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5289
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5290
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5291
|
+
columns
|
|
5292
|
+
|
|
5293
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5294
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5295
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5296
|
+
identify issues within specific segments.
|
|
5297
|
+
|
|
5298
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5299
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5300
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5301
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5302
|
+
|
|
4793
5303
|
Thresholds
|
|
4794
5304
|
----------
|
|
4795
5305
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -4879,6 +5389,8 @@ class Validate:
|
|
|
4879
5389
|
|
|
4880
5390
|
_check_column(column=columns)
|
|
4881
5391
|
_check_pre(pre=pre)
|
|
5392
|
+
# TODO: add check for segments
|
|
5393
|
+
# _check_segments(segments=segments)
|
|
4882
5394
|
_check_thresholds(thresholds=thresholds)
|
|
4883
5395
|
_check_boolean_input(param=active, param_name="active")
|
|
4884
5396
|
|
|
@@ -4905,6 +5417,7 @@ class Validate:
|
|
|
4905
5417
|
assertion_type=assertion_type,
|
|
4906
5418
|
column=column,
|
|
4907
5419
|
pre=pre,
|
|
5420
|
+
segments=segments,
|
|
4908
5421
|
thresholds=thresholds,
|
|
4909
5422
|
actions=actions,
|
|
4910
5423
|
brief=brief,
|
|
@@ -4919,6 +5432,7 @@ class Validate:
|
|
|
4919
5432
|
self,
|
|
4920
5433
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4921
5434
|
pre: Callable | None = None,
|
|
5435
|
+
segments: SegmentSpec | None = None,
|
|
4922
5436
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4923
5437
|
actions: Actions | None = None,
|
|
4924
5438
|
brief: str | bool | None = None,
|
|
@@ -4939,10 +5453,15 @@ class Validate:
|
|
|
4939
5453
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
4940
5454
|
generated for each column.
|
|
4941
5455
|
pre
|
|
4942
|
-
|
|
5456
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4943
5457
|
interrogation. This function should take a table as input and return a modified table.
|
|
4944
5458
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4945
5459
|
argument.
|
|
5460
|
+
segments
|
|
5461
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5462
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5463
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5464
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
4946
5465
|
thresholds
|
|
4947
5466
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4948
5467
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -4984,6 +5503,42 @@ class Validate:
|
|
|
4984
5503
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
4985
5504
|
subsequent validation steps.
|
|
4986
5505
|
|
|
5506
|
+
Segmentation
|
|
5507
|
+
------------
|
|
5508
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5509
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5510
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5511
|
+
column.
|
|
5512
|
+
|
|
5513
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5514
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5515
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5516
|
+
region.
|
|
5517
|
+
|
|
5518
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5519
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5520
|
+
segment on only specific dates, you can provide a tuple like
|
|
5521
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5522
|
+
(i.e., no validation steps will be created for them).
|
|
5523
|
+
|
|
5524
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5525
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5526
|
+
|
|
5527
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5528
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5529
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5530
|
+
columns
|
|
5531
|
+
|
|
5532
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5533
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5534
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5535
|
+
identify issues within specific segments.
|
|
5536
|
+
|
|
5537
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5538
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5539
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5540
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5541
|
+
|
|
4987
5542
|
Thresholds
|
|
4988
5543
|
----------
|
|
4989
5544
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5073,6 +5628,8 @@ class Validate:
|
|
|
5073
5628
|
|
|
5074
5629
|
_check_column(column=columns)
|
|
5075
5630
|
_check_pre(pre=pre)
|
|
5631
|
+
# TODO: add check for segments
|
|
5632
|
+
# _check_segments(segments=segments)
|
|
5076
5633
|
_check_thresholds(thresholds=thresholds)
|
|
5077
5634
|
_check_boolean_input(param=active, param_name="active")
|
|
5078
5635
|
|
|
@@ -5099,6 +5656,7 @@ class Validate:
|
|
|
5099
5656
|
assertion_type=assertion_type,
|
|
5100
5657
|
column=column,
|
|
5101
5658
|
pre=pre,
|
|
5659
|
+
segments=segments,
|
|
5102
5660
|
thresholds=thresholds,
|
|
5103
5661
|
actions=actions,
|
|
5104
5662
|
brief=brief,
|
|
@@ -5115,6 +5673,7 @@ class Validate:
|
|
|
5115
5673
|
pattern: str,
|
|
5116
5674
|
na_pass: bool = False,
|
|
5117
5675
|
pre: Callable | None = None,
|
|
5676
|
+
segments: SegmentSpec | None = None,
|
|
5118
5677
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5119
5678
|
actions: Actions | None = None,
|
|
5120
5679
|
brief: str | bool | None = None,
|
|
@@ -5141,10 +5700,15 @@ class Validate:
|
|
|
5141
5700
|
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
5142
5701
|
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
5143
5702
|
pre
|
|
5144
|
-
|
|
5703
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5145
5704
|
interrogation. This function should take a table as input and return a modified table.
|
|
5146
5705
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5147
5706
|
argument.
|
|
5707
|
+
segments
|
|
5708
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5709
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5710
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5711
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5148
5712
|
thresholds
|
|
5149
5713
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5150
5714
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5186,6 +5750,42 @@ class Validate:
|
|
|
5186
5750
|
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
5187
5751
|
subsequent validation steps.
|
|
5188
5752
|
|
|
5753
|
+
Segmentation
|
|
5754
|
+
------------
|
|
5755
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5756
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
5757
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
5758
|
+
column.
|
|
5759
|
+
|
|
5760
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
5761
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
5762
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
5763
|
+
region.
|
|
5764
|
+
|
|
5765
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
5766
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
5767
|
+
segment on only specific dates, you can provide a tuple like
|
|
5768
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
5769
|
+
(i.e., no validation steps will be created for them).
|
|
5770
|
+
|
|
5771
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5772
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
5773
|
+
|
|
5774
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
5775
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
5776
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
5777
|
+
columns
|
|
5778
|
+
|
|
5779
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5780
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
5781
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
5782
|
+
identify issues within specific segments.
|
|
5783
|
+
|
|
5784
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
5785
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
5786
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
5787
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
5788
|
+
|
|
5189
5789
|
Thresholds
|
|
5190
5790
|
----------
|
|
5191
5791
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5277,6 +5877,8 @@ class Validate:
|
|
|
5277
5877
|
|
|
5278
5878
|
_check_column(column=columns)
|
|
5279
5879
|
_check_pre(pre=pre)
|
|
5880
|
+
# TODO: add check for segments
|
|
5881
|
+
# _check_segments(segments=segments)
|
|
5280
5882
|
_check_thresholds(thresholds=thresholds)
|
|
5281
5883
|
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
5282
5884
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -5306,6 +5908,7 @@ class Validate:
|
|
|
5306
5908
|
values=pattern,
|
|
5307
5909
|
na_pass=na_pass,
|
|
5308
5910
|
pre=pre,
|
|
5911
|
+
segments=segments,
|
|
5309
5912
|
thresholds=thresholds,
|
|
5310
5913
|
actions=actions,
|
|
5311
5914
|
brief=brief,
|
|
@@ -5320,6 +5923,7 @@ class Validate:
|
|
|
5320
5923
|
self,
|
|
5321
5924
|
expr: any,
|
|
5322
5925
|
pre: Callable | None = None,
|
|
5926
|
+
segments: SegmentSpec | None = None,
|
|
5323
5927
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5324
5928
|
actions: Actions | None = None,
|
|
5325
5929
|
brief: str | bool | None = None,
|
|
@@ -5341,10 +5945,15 @@ class Validate:
|
|
|
5341
5945
|
be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
|
|
5342
5946
|
should either be a lambda expression or a Narwhals column expression.
|
|
5343
5947
|
pre
|
|
5344
|
-
|
|
5948
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5345
5949
|
interrogation. This function should take a table as input and return a modified table.
|
|
5346
5950
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5347
5951
|
argument.
|
|
5952
|
+
segments
|
|
5953
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
5954
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
5955
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
5956
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5348
5957
|
thresholds
|
|
5349
5958
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5350
5959
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5384,6 +5993,42 @@ class Validate:
|
|
|
5384
5993
|
transformed table, it only exists during the validation step and is not stored in the
|
|
5385
5994
|
`Validate` object or used in subsequent validation steps.
|
|
5386
5995
|
|
|
5996
|
+
Segmentation
|
|
5997
|
+
------------
|
|
5998
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
5999
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
6000
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
6001
|
+
column.
|
|
6002
|
+
|
|
6003
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
6004
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
6005
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
6006
|
+
region.
|
|
6007
|
+
|
|
6008
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
6009
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
6010
|
+
segment on only specific dates, you can provide a tuple like
|
|
6011
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
6012
|
+
(i.e., no validation steps will be created for them).
|
|
6013
|
+
|
|
6014
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6015
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
6016
|
+
|
|
6017
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
6018
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
6019
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
6020
|
+
columns
|
|
6021
|
+
|
|
6022
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6023
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
6024
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
6025
|
+
identify issues within specific segments.
|
|
6026
|
+
|
|
6027
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
6028
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
6029
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
6030
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
6031
|
+
|
|
5387
6032
|
Thresholds
|
|
5388
6033
|
----------
|
|
5389
6034
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5461,6 +6106,8 @@ class Validate:
|
|
|
5461
6106
|
# TODO: Add a check for the expression to ensure it's a valid expression object
|
|
5462
6107
|
# _check_expr(expr=expr)
|
|
5463
6108
|
_check_pre(pre=pre)
|
|
6109
|
+
# TODO: add check for segments
|
|
6110
|
+
# _check_segments(segments=segments)
|
|
5464
6111
|
_check_thresholds(thresholds=thresholds)
|
|
5465
6112
|
_check_boolean_input(param=active, param_name="active")
|
|
5466
6113
|
|
|
@@ -5477,6 +6124,7 @@ class Validate:
|
|
|
5477
6124
|
column=None,
|
|
5478
6125
|
values=expr,
|
|
5479
6126
|
pre=pre,
|
|
6127
|
+
segments=segments,
|
|
5480
6128
|
thresholds=thresholds,
|
|
5481
6129
|
actions=actions,
|
|
5482
6130
|
brief=brief,
|
|
@@ -5665,6 +6313,7 @@ class Validate:
|
|
|
5665
6313
|
self,
|
|
5666
6314
|
columns_subset: str | list[str] | None = None,
|
|
5667
6315
|
pre: Callable | None = None,
|
|
6316
|
+
segments: SegmentSpec | None = None,
|
|
5668
6317
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
5669
6318
|
actions: Actions | None = None,
|
|
5670
6319
|
brief: str | bool | None = None,
|
|
@@ -5685,10 +6334,15 @@ class Validate:
|
|
|
5685
6334
|
columns are supplied, the distinct comparison will be made over the combination of
|
|
5686
6335
|
values in those columns.
|
|
5687
6336
|
pre
|
|
5688
|
-
|
|
6337
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5689
6338
|
interrogation. This function should take a table as input and return a modified table.
|
|
5690
6339
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5691
6340
|
argument.
|
|
6341
|
+
segments
|
|
6342
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
6343
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
6344
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
6345
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
5692
6346
|
thresholds
|
|
5693
6347
|
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5694
6348
|
The thresholds are set at the step level and will override any global thresholds set in
|
|
@@ -5730,6 +6384,42 @@ class Validate:
|
|
|
5730
6384
|
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
5731
6385
|
or used in subsequent validation steps.
|
|
5732
6386
|
|
|
6387
|
+
Segmentation
|
|
6388
|
+
------------
|
|
6389
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
6390
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
6391
|
+
data. The segmentation can be done based on a single column or specific values within a
|
|
6392
|
+
column.
|
|
6393
|
+
|
|
6394
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
6395
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
6396
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
6397
|
+
region.
|
|
6398
|
+
|
|
6399
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
6400
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
6401
|
+
segment on only specific dates, you can provide a tuple like
|
|
6402
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
6403
|
+
(i.e., no validation steps will be created for them).
|
|
6404
|
+
|
|
6405
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6406
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
6407
|
+
|
|
6408
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
6409
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
6410
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
6411
|
+
columns
|
|
6412
|
+
|
|
6413
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6414
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
6415
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
6416
|
+
identify issues within specific segments.
|
|
6417
|
+
|
|
6418
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
6419
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
6420
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
6421
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
6422
|
+
|
|
5733
6423
|
Thresholds
|
|
5734
6424
|
----------
|
|
5735
6425
|
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
@@ -5823,6 +6513,8 @@ class Validate:
|
|
|
5823
6513
|
assertion_type = _get_fn_name()
|
|
5824
6514
|
|
|
5825
6515
|
_check_pre(pre=pre)
|
|
6516
|
+
# TODO: add check for segments
|
|
6517
|
+
# _check_segments(segments=segments)
|
|
5826
6518
|
_check_thresholds(thresholds=thresholds)
|
|
5827
6519
|
_check_boolean_input(param=active, param_name="active")
|
|
5828
6520
|
|
|
@@ -5843,6 +6535,7 @@ class Validate:
|
|
|
5843
6535
|
assertion_type=assertion_type,
|
|
5844
6536
|
column=columns_subset,
|
|
5845
6537
|
pre=pre,
|
|
6538
|
+
segments=segments,
|
|
5846
6539
|
thresholds=thresholds,
|
|
5847
6540
|
actions=actions,
|
|
5848
6541
|
brief=brief,
|
|
@@ -5903,7 +6596,7 @@ class Validate:
|
|
|
5903
6596
|
substring matches are allowed, so a schema data type of `Int` would match a target table
|
|
5904
6597
|
data type of `Int64`.
|
|
5905
6598
|
pre
|
|
5906
|
-
|
|
6599
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5907
6600
|
interrogation. This function should take a table as input and return a modified table.
|
|
5908
6601
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5909
6602
|
argument.
|
|
@@ -6116,7 +6809,7 @@ class Validate:
|
|
|
6116
6809
|
Should the validation step be inverted? If `True`, then the expectation is that the row
|
|
6117
6810
|
count of the target table should not match the specified `count=` value.
|
|
6118
6811
|
pre
|
|
6119
|
-
|
|
6812
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6120
6813
|
interrogation. This function should take a table as input and return a modified table.
|
|
6121
6814
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6122
6815
|
argument.
|
|
@@ -6326,7 +7019,7 @@ class Validate:
|
|
|
6326
7019
|
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
6327
7020
|
column count of the target table should not match the specified `count=` value.
|
|
6328
7021
|
pre
|
|
6329
|
-
|
|
7022
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6330
7023
|
interrogation. This function should take a table as input and return a modified table.
|
|
6331
7024
|
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6332
7025
|
argument.
|
|
@@ -6844,10 +7537,14 @@ class Validate:
|
|
|
6844
7537
|
|
|
6845
7538
|
self.time_start = datetime.datetime.now(datetime.timezone.utc)
|
|
6846
7539
|
|
|
6847
|
-
# Expand `validation_info` by evaluating any column expressions in `
|
|
7540
|
+
# Expand `validation_info` by evaluating any column expressions in `columns=`
|
|
6848
7541
|
# (the `_evaluate_column_exprs()` method will eval and expand as needed)
|
|
6849
7542
|
self._evaluate_column_exprs(validation_info=self.validation_info)
|
|
6850
7543
|
|
|
7544
|
+
# Expand `validation_info` by evaluating any segmentation directives
|
|
7545
|
+
# provided in `segments=` (the `_evaluate_segments()` method will eval and expand as needed)
|
|
7546
|
+
self._evaluate_segments(validation_info=self.validation_info)
|
|
7547
|
+
|
|
6851
7548
|
for validation in self.validation_info:
|
|
6852
7549
|
# Set the `i` value for the validation step (this is 1-indexed)
|
|
6853
7550
|
index_value = self.validation_info.index(validation) + 1
|
|
@@ -6883,6 +7580,10 @@ class Validate:
|
|
|
6883
7580
|
|
|
6884
7581
|
validation.autobrief = autobrief
|
|
6885
7582
|
|
|
7583
|
+
# ------------------------------------------------
|
|
7584
|
+
# Bypassing the validation step if conditions are met
|
|
7585
|
+
# ------------------------------------------------
|
|
7586
|
+
|
|
6886
7587
|
# Skip the validation step if it is not active but still record the time of processing
|
|
6887
7588
|
if not validation.active:
|
|
6888
7589
|
end_time = datetime.datetime.now(datetime.timezone.utc)
|
|
@@ -6939,6 +7640,17 @@ class Validate:
|
|
|
6939
7640
|
elif isinstance(validation.pre, Callable):
|
|
6940
7641
|
data_tbl_step = validation.pre(data_tbl_step)
|
|
6941
7642
|
|
|
7643
|
+
# ------------------------------------------------
|
|
7644
|
+
# Segmentation stage
|
|
7645
|
+
# ------------------------------------------------
|
|
7646
|
+
|
|
7647
|
+
# Determine whether any segmentation directives are to be applied to the table
|
|
7648
|
+
|
|
7649
|
+
if validation.segments is not None:
|
|
7650
|
+
data_tbl_step = _apply_segments(
|
|
7651
|
+
data_tbl=data_tbl_step, segments_expr=validation.segments
|
|
7652
|
+
)
|
|
7653
|
+
|
|
6942
7654
|
validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
|
|
6943
7655
|
tbl_type=tbl_type
|
|
6944
7656
|
)
|
|
@@ -8840,6 +9552,13 @@ class Validate:
|
|
|
8840
9552
|
# will be made blank if the validation has not been performed
|
|
8841
9553
|
interrogation_performed = validation_info_dict.get("proc_duration_s", [None])[0] is not None
|
|
8842
9554
|
|
|
9555
|
+
# Determine which steps are those using segmented data
|
|
9556
|
+
segmented_steps = [
|
|
9557
|
+
i + 1
|
|
9558
|
+
for i, segment in enumerate(validation_info_dict["segments"])
|
|
9559
|
+
if segment is not None
|
|
9560
|
+
]
|
|
9561
|
+
|
|
8843
9562
|
# ------------------------------------------------
|
|
8844
9563
|
# Process the `type_upd` entry
|
|
8845
9564
|
# ------------------------------------------------
|
|
@@ -8849,6 +9568,7 @@ class Validate:
|
|
|
8849
9568
|
assertion_str=validation_info_dict["assertion_type"],
|
|
8850
9569
|
brief_str=validation_info_dict["brief"],
|
|
8851
9570
|
autobrief_str=validation_info_dict["autobrief"],
|
|
9571
|
+
segmentation_str=validation_info_dict["segments"],
|
|
8852
9572
|
lang=lang,
|
|
8853
9573
|
)
|
|
8854
9574
|
|
|
@@ -8980,11 +9700,14 @@ class Validate:
|
|
|
8980
9700
|
# Add the `tbl` entry
|
|
8981
9701
|
# ------------------------------------------------
|
|
8982
9702
|
|
|
8983
|
-
# Depending on if there was some preprocessing done, get the appropriate icon
|
|
8984
|
-
#
|
|
9703
|
+
# Depending on if there was some preprocessing done, get the appropriate icon for
|
|
9704
|
+
# the table processing status to be displayed in the report under the `tbl` column
|
|
9705
|
+
# TODO: add the icon for the segmented data option when the step is segmented
|
|
8985
9706
|
|
|
8986
9707
|
validation_info_dict["tbl"] = _transform_tbl_preprocessed(
|
|
8987
|
-
pre=validation_info_dict["pre"],
|
|
9708
|
+
pre=validation_info_dict["pre"],
|
|
9709
|
+
seg=validation_info_dict["segments"],
|
|
9710
|
+
interrogation_performed=interrogation_performed,
|
|
8988
9711
|
)
|
|
8989
9712
|
|
|
8990
9713
|
# ------------------------------------------------
|
|
@@ -9019,8 +9742,9 @@ class Validate:
|
|
|
9019
9742
|
# Process `pass` and `fail` entries
|
|
9020
9743
|
# ------------------------------------------------
|
|
9021
9744
|
|
|
9022
|
-
# Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
|
|
9023
|
-
# of the `pass` entry should be equal to the length of the
|
|
9745
|
+
# Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
|
|
9746
|
+
# (the length of the `pass` entry should be equal to the length of the
|
|
9747
|
+
# `n_passed` and `n_failed` entries)
|
|
9024
9748
|
|
|
9025
9749
|
validation_info_dict["pass"] = _transform_passed_failed(
|
|
9026
9750
|
n_passed_failed=validation_info_dict["n_passed"],
|
|
@@ -9173,6 +9897,9 @@ class Validate:
|
|
|
9173
9897
|
# Remove the `pre` entry from the dictionary
|
|
9174
9898
|
validation_info_dict.pop("pre")
|
|
9175
9899
|
|
|
9900
|
+
# Remove the `segments` entry from the dictionary
|
|
9901
|
+
validation_info_dict.pop("segments")
|
|
9902
|
+
|
|
9176
9903
|
# Remove the `proc_duration_s` entry from the dictionary
|
|
9177
9904
|
validation_info_dict.pop("proc_duration_s")
|
|
9178
9905
|
|
|
@@ -9255,6 +9982,10 @@ class Validate:
|
|
|
9255
9982
|
columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"]
|
|
9256
9983
|
),
|
|
9257
9984
|
)
|
|
9985
|
+
.tab_style(
|
|
9986
|
+
style=style.css("overflow-x: visible; white-space: nowrap;"),
|
|
9987
|
+
locations=loc.body(columns="type_upd", rows=segmented_steps),
|
|
9988
|
+
)
|
|
9258
9989
|
.tab_style(
|
|
9259
9990
|
style=style.fill(color="#FCFCFC" if interrogation_performed else "white"),
|
|
9260
9991
|
locations=loc.body(columns=["w_upd", "e_upd", "c_upd"]),
|
|
@@ -9429,8 +10160,8 @@ class Validate:
|
|
|
9429
10160
|
table object, which can be displayed in a notebook or exported to an HTML file.
|
|
9430
10161
|
|
|
9431
10162
|
:::{.callout-warning}
|
|
9432
|
-
The `get_step_report()` is still experimental. Please report any issues you encounter
|
|
9433
|
-
[Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
10163
|
+
The `get_step_report()` method is still experimental. Please report any issues you encounter
|
|
10164
|
+
in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
9434
10165
|
:::
|
|
9435
10166
|
|
|
9436
10167
|
Parameters
|
|
@@ -9463,6 +10194,35 @@ class Validate:
|
|
|
9463
10194
|
GT
|
|
9464
10195
|
A GT table object that represents the detailed report for the validation step.
|
|
9465
10196
|
|
|
10197
|
+
Types of Step Reports
|
|
10198
|
+
---------------------
|
|
10199
|
+
The `get_step_report()` method produces a report based on the *type* of validation step.
|
|
10200
|
+
The following row-based validation methods will produce a report that shows the rows of the
|
|
10201
|
+
data that failed because of failing test units within one or more columns:
|
|
10202
|
+
|
|
10203
|
+
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
10204
|
+
- [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
|
|
10205
|
+
- [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
|
|
10206
|
+
- [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
|
|
10207
|
+
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
10208
|
+
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
10209
|
+
- [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
|
|
10210
|
+
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
10211
|
+
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
10212
|
+
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
10213
|
+
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
10214
|
+
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
10215
|
+
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
10216
|
+
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
10217
|
+
|
|
10218
|
+
The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
|
|
10219
|
+
report that shows duplicate rows (or duplicate values in one or a set of columns as defined
|
|
10220
|
+
in that method's `columns_subset=` parameter).
|
|
10221
|
+
|
|
10222
|
+
The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
|
|
10223
|
+
produce a report that shows the schema of the data table and the schema of the validation
|
|
10224
|
+
step. The report will indicate whether the schemas match or not.
|
|
10225
|
+
|
|
9466
10226
|
Examples
|
|
9467
10227
|
--------
|
|
9468
10228
|
```{python}
|
|
@@ -9488,7 +10248,7 @@ class Validate:
|
|
|
9488
10248
|
.col_vals_lt(columns="d", value=3500)
|
|
9489
10249
|
.col_vals_between(columns="c", left=1, right=8)
|
|
9490
10250
|
.col_vals_gt(columns="a", value=3)
|
|
9491
|
-
.col_vals_regex(columns="b", pattern=r"
|
|
10251
|
+
.col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
|
|
9492
10252
|
.interrogate()
|
|
9493
10253
|
)
|
|
9494
10254
|
|
|
@@ -9776,6 +10536,95 @@ class Validate:
|
|
|
9776
10536
|
|
|
9777
10537
|
return self
|
|
9778
10538
|
|
|
10539
|
+
def _evaluate_segments(self, validation_info):
|
|
10540
|
+
"""
|
|
10541
|
+
Evaluate any segmentation expressions stored in the `segments` attribute and expand each
|
|
10542
|
+
validation step with such directives into multiple. This is done by evaluating the
|
|
10543
|
+
segmentation expression and creating a new validation step for each segment. Errors in
|
|
10544
|
+
evaluation (such as no segments matched) will be caught and recorded in the `eval_error`
|
|
10545
|
+
attribute.
|
|
10546
|
+
|
|
10547
|
+
Parameters
|
|
10548
|
+
----------
|
|
10549
|
+
validation_info
|
|
10550
|
+
Information about the validation to evaluate and expand.
|
|
10551
|
+
"""
|
|
10552
|
+
|
|
10553
|
+
# Create a list to store the expanded validation steps
|
|
10554
|
+
expanded_validation_info = []
|
|
10555
|
+
|
|
10556
|
+
# Iterate over the validation steps
|
|
10557
|
+
for i, validation in enumerate(validation_info):
|
|
10558
|
+
# Get the segments expression
|
|
10559
|
+
segments_expr = validation.segments
|
|
10560
|
+
|
|
10561
|
+
# If the value is None, then skip the evaluation and append the validation step to the
|
|
10562
|
+
# list of expanded validation steps
|
|
10563
|
+
if segments_expr is None:
|
|
10564
|
+
expanded_validation_info.append(validation)
|
|
10565
|
+
continue
|
|
10566
|
+
|
|
10567
|
+
# Evaluate the segments expression
|
|
10568
|
+
try:
|
|
10569
|
+
# Get the table for this step, it can either be:
|
|
10570
|
+
# 1. the target table itself
|
|
10571
|
+
# 2. the target table modified by a `pre` attribute
|
|
10572
|
+
|
|
10573
|
+
if validation.pre is None:
|
|
10574
|
+
table = self.data
|
|
10575
|
+
else:
|
|
10576
|
+
table = validation.pre(self.data)
|
|
10577
|
+
|
|
10578
|
+
# If the `segments` expression is a string, that string is taken as a column name
|
|
10579
|
+
# for which segmentation should occur across unique values in the column
|
|
10580
|
+
if isinstance(segments_expr, str):
|
|
10581
|
+
seg_tuples = _seg_expr_from_string(data_tbl=table, segments_expr=segments_expr)
|
|
10582
|
+
|
|
10583
|
+
# If the 'segments' expression is a tuple, then normalize it to a list of tuples
|
|
10584
|
+
# - ("col", "value") -> [("col", "value")]
|
|
10585
|
+
# - ("col", ["value1", "value2"]) -> [("col", "value1"), ("col", "value2")]
|
|
10586
|
+
elif isinstance(segments_expr, tuple):
|
|
10587
|
+
seg_tuples = _seg_expr_from_tuple(segments_expr=segments_expr)
|
|
10588
|
+
|
|
10589
|
+
# If the 'segments' expression is a list of strings or tuples (can be mixed) then
|
|
10590
|
+
# normalize it to a list of tuples following the rules above
|
|
10591
|
+
elif isinstance(segments_expr, list):
|
|
10592
|
+
seg_tuples = []
|
|
10593
|
+
for seg in segments_expr:
|
|
10594
|
+
if isinstance(seg, str):
|
|
10595
|
+
# Use the utility function for string items
|
|
10596
|
+
str_seg_tuples = _seg_expr_from_string(
|
|
10597
|
+
data_tbl=table, segments_expr=seg
|
|
10598
|
+
)
|
|
10599
|
+
seg_tuples.extend(str_seg_tuples)
|
|
10600
|
+
elif isinstance(seg, tuple):
|
|
10601
|
+
# Use the utility function for tuple items
|
|
10602
|
+
tuple_seg_tuples = _seg_expr_from_tuple(segments_expr=seg)
|
|
10603
|
+
seg_tuples.extend(tuple_seg_tuples)
|
|
10604
|
+
else: # pragma: no cover
|
|
10605
|
+
# Handle invalid segment type
|
|
10606
|
+
raise ValueError(
|
|
10607
|
+
f"Invalid segment expression item type: {type(seg)}. "
|
|
10608
|
+
"Must be either string or tuple."
|
|
10609
|
+
)
|
|
10610
|
+
|
|
10611
|
+
except Exception: # pragma: no cover
|
|
10612
|
+
validation.eval_error = True
|
|
10613
|
+
|
|
10614
|
+
# For each segmentation resolved, create a new validation step and add it to the list of
|
|
10615
|
+
# expanded validation steps
|
|
10616
|
+
for seg in seg_tuples:
|
|
10617
|
+
new_validation = copy.deepcopy(validation)
|
|
10618
|
+
|
|
10619
|
+
new_validation.segments = seg
|
|
10620
|
+
|
|
10621
|
+
expanded_validation_info.append(new_validation)
|
|
10622
|
+
|
|
10623
|
+
# Replace the `validation_info` attribute with the expanded version
|
|
10624
|
+
self.validation_info = expanded_validation_info
|
|
10625
|
+
|
|
10626
|
+
return self
|
|
10627
|
+
|
|
9779
10628
|
def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]:
|
|
9780
10629
|
"""
|
|
9781
10630
|
Utility function to get a dictionary of validation attributes for each validation step.
|
|
@@ -10493,6 +11342,143 @@ def _prep_values_text(
|
|
|
10493
11342
|
return values_str
|
|
10494
11343
|
|
|
10495
11344
|
|
|
11345
|
+
def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> list[tuple[str, str]]:
    """
    Obtain the segmentation categories from a table column.

    The `segments_expr` value will have been checked to be a string, so there's no need to check
    for that here. The unique values of the column named by `segments_expr` are collected
    (handling the different table types), sorted, and returned as a normalized list of tuples of
    the form `(column, value)`.

    This function is used to create a list of segments for the validation step. Since there will
    usually be more than one segment, the validation step will be expanded into multiple steps
    during interrogation (where this function is called).

    Parameters
    ----------
    data_tbl
        The table from which to obtain the segmentation categories.
    segments_expr
        The column name for which segmentation should occur across unique values in the column.

    Returns
    -------
    list[tuple[str, str]]
        A list of tuples representing pairings of a column name and a value in the column.

    Raises
    ------
    ValueError
        If the table type is not one of the supported types.
    """
    # Determine if the table is a DataFrame or a DB table
    tbl_type = _get_tbl_type(data=data_tbl)

    # Obtain the segmentation categories from the table column given as `segments_expr`
    if tbl_type == "polars":
        seg_categories = data_tbl[segments_expr].unique().to_list()
    elif tbl_type == "pandas":
        seg_categories = data_tbl[segments_expr].unique().tolist()
    elif tbl_type in IBIS_BACKENDS:
        distinct_col_vals = data_tbl.select(segments_expr).distinct()
        seg_categories = distinct_col_vals[segments_expr].to_list()
    else:  # pragma: no cover
        raise ValueError(f"Unsupported table type: {tbl_type}")

    # Ensure that the categories are sorted. A `None` category cannot be ordered against other
    # values, so the sort key places it last instead of letting the comparison raise `TypeError`
    # NOTE(review): only `None` is special-cased; a float NaN mixed with strings would still
    # fail to sort
    seg_categories.sort(key=lambda category: (category is None, category))

    # Place each category and each value in a list of tuples as: `(column, value)`
    return [(segments_expr, category) for category in seg_categories]
|
|
11391
|
+
|
|
11392
|
+
|
|
11393
|
+
def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
|
|
11394
|
+
"""
|
|
11395
|
+
Normalize the segments expression to a list of tuples, given a single tuple.
|
|
11396
|
+
|
|
11397
|
+
The `segments_expr` value will have been checked to be a tuple, so there's no need to check for
|
|
11398
|
+
that here. The function will return a list of tuples representing pairings of a column name and
|
|
11399
|
+
a value. The task is to normalize the tuple into a list of tuples of the form:
|
|
11400
|
+
`(column, value)`.
|
|
11401
|
+
|
|
11402
|
+
The following examples show how this normalzation works:
|
|
11403
|
+
- `("col", "value")` -> `[("col", "value")]` (single tuple, upgraded to a list of tuples)
|
|
11404
|
+
- `("col", ["value1", "value2"])` -> `[("col", "value1"), ("col", "value2")]` (tuple with a list
|
|
11405
|
+
of values, expanded into multiple tuples within a list)
|
|
11406
|
+
|
|
11407
|
+
This function is used to create a list of segments for the validation step. And since there will
|
|
11408
|
+
usually be more than one segment, the validation step will be expanded into multiple during
|
|
11409
|
+
interrogation (where this function is called).
|
|
11410
|
+
|
|
11411
|
+
Parameters
|
|
11412
|
+
----------
|
|
11413
|
+
segments_expr
|
|
11414
|
+
The segments expression to normalize. It can be a tuple of the form
|
|
11415
|
+
`(column, value)` or `(column, [value1, value2])`.
|
|
11416
|
+
|
|
11417
|
+
Returns
|
|
11418
|
+
-------
|
|
11419
|
+
list[tuple[str, str]]
|
|
11420
|
+
A list of tuples representing pairings of a column name and a value in the column.
|
|
11421
|
+
"""
|
|
11422
|
+
# Check if the first element is a string
|
|
11423
|
+
if isinstance(segments_expr[0], str):
|
|
11424
|
+
# If the second element is a list, create a list of tuples
|
|
11425
|
+
if isinstance(segments_expr[1], list):
|
|
11426
|
+
seg_tuples = [(segments_expr[0], value) for value in segments_expr[1]]
|
|
11427
|
+
# If the second element is not a list, create a single tuple
|
|
11428
|
+
else:
|
|
11429
|
+
seg_tuples = [(segments_expr[0], segments_expr[1])]
|
|
11430
|
+
# If the first element is not a string, raise an error
|
|
11431
|
+
else: # pragma: no cover
|
|
11432
|
+
raise ValueError("The first element of the segments expression must be a string.")
|
|
11433
|
+
|
|
11434
|
+
return seg_tuples
|
|
11435
|
+
|
|
11436
|
+
|
|
11437
|
+
def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any:
    """
    Apply the segments expression to the data table.

    Filter the data table based on the `segments_expr=` value, where the first element is the
    column name and the second element is the value to filter by. A value of `None` selects the
    rows where the column is null, since an equality comparison against `None` would not match
    them.

    Parameters
    ----------
    data_tbl
        The data table to filter. It can be a Pandas DataFrame, Polars DataFrame, or an Ibis
        backend table.
    segments_expr
        The segments expression to apply. It is a tuple of the form `(column, value)`.

    Returns
    -------
    any
        The filtered data table. It will be of the same type as the input table. A table whose
        type is neither Pandas, Polars, nor one of the Ibis backends is returned unchanged.
    """
    # Get the table type
    tbl_type = _get_tbl_type(data=data_tbl)

    column = segments_expr[0]
    value = segments_expr[1]

    if tbl_type in ["pandas", "polars"]:
        # DataFrames are filtered through Narwhals and converted back afterwards
        data_tbl_nw = nw.from_native(data_tbl)

        # Build the row filter; nulls need an explicit null test
        if value is None:
            keep_rows = nw.col(column).is_null()
        else:
            keep_rows = nw.col(column) == value

        # Filter and transform back to the original table type
        data_tbl = data_tbl_nw.filter(keep_rows).to_native()

    elif tbl_type in IBIS_BACKENDS:
        # Ibis backend tables are filtered directly
        column_expr = data_tbl[column]

        if value is None:
            keep_rows = column_expr.isnull()
        else:
            keep_rows = column_expr == value

        data_tbl = data_tbl[keep_rows]

    return data_tbl
|
|
11480
|
+
|
|
11481
|
+
|
|
10496
11482
|
def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
|
|
10497
11483
|
"""
|
|
10498
11484
|
Convert a `_ValidationInfo` object to a dictionary.
|
|
@@ -10517,6 +11503,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
|
|
|
10517
11503
|
"inclusive",
|
|
10518
11504
|
"na_pass",
|
|
10519
11505
|
"pre",
|
|
11506
|
+
"segments",
|
|
10520
11507
|
"label",
|
|
10521
11508
|
"brief",
|
|
10522
11509
|
"autobrief",
|
|
@@ -10631,7 +11618,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
|
|
|
10631
11618
|
return title_text
|
|
10632
11619
|
|
|
10633
11620
|
|
|
10634
|
-
def _transform_tbl_preprocessed(pre:
|
|
11621
|
+
def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
|
|
10635
11622
|
# If no interrogation was performed, return a list of empty strings
|
|
10636
11623
|
if not interrogation_performed:
|
|
10637
11624
|
return ["" for _ in range(len(pre))]
|
|
@@ -10640,11 +11627,13 @@ def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list
|
|
|
10640
11627
|
# (either 'unchanged' (None) or 'modified' (not None))
|
|
10641
11628
|
status_list = []
|
|
10642
11629
|
|
|
10643
|
-
for
|
|
10644
|
-
if
|
|
10645
|
-
status_list.append("
|
|
10646
|
-
|
|
11630
|
+
for i in range(len(pre)):
|
|
11631
|
+
if seg[i] is not None:
|
|
11632
|
+
status_list.append("segmented")
|
|
11633
|
+
elif pre[i] is not None:
|
|
10647
11634
|
status_list.append("modified")
|
|
11635
|
+
else:
|
|
11636
|
+
status_list.append("unchanged")
|
|
10648
11637
|
|
|
10649
11638
|
return _get_preprocessed_table_icon(icon=status_list)
|
|
10650
11639
|
|
|
@@ -10752,7 +11741,11 @@ def _transform_w_e_c(values, color, interrogation_performed):
|
|
|
10752
11741
|
|
|
10753
11742
|
|
|
10754
11743
|
def _transform_assertion_str(
|
|
10755
|
-
assertion_str: list[str],
|
|
11744
|
+
assertion_str: list[str],
|
|
11745
|
+
brief_str: list[str | None],
|
|
11746
|
+
autobrief_str: list[str],
|
|
11747
|
+
segmentation_str: list[tuple | None],
|
|
11748
|
+
lang: str,
|
|
10756
11749
|
) -> list[str]:
|
|
10757
11750
|
# Get the SVG icons for the assertion types
|
|
10758
11751
|
svg_icon = _get_assertion_icon(icon=assertion_str)
|
|
@@ -10813,6 +11806,26 @@ def _transform_assertion_str(
|
|
|
10813
11806
|
for assertion, svg, size, brief_div in zip(assertion_str, svg_icon, text_size, brief_divs)
|
|
10814
11807
|
]
|
|
10815
11808
|
|
|
11809
|
+
# If the `segments` list is not empty, prepend a segmentation div to the `type_upd` strings
|
|
11810
|
+
if segmentation_str:
|
|
11811
|
+
for i in range(len(type_upd)):
|
|
11812
|
+
if segmentation_str[i] is not None:
|
|
11813
|
+
# Get the column name and value from the segmentation expression
|
|
11814
|
+
column_name = segmentation_str[i][0]
|
|
11815
|
+
column_value = segmentation_str[i][1]
|
|
11816
|
+
# Create the segmentation div
|
|
11817
|
+
segmentation_div = (
|
|
11818
|
+
"<div style='margin-top: 0px; margin-bottom: 0px; "
|
|
11819
|
+
"white-space: pre; font-size: 8px; color: darkblue; padding-bottom: 4px; "
|
|
11820
|
+
"'>"
|
|
11821
|
+
"<strong><span style='font-family: Helvetica, arial, sans-serif;'>"
|
|
11822
|
+
f"SEGMENT </span></strong><span>{column_name} / {column_value}"
|
|
11823
|
+
"</span>"
|
|
11824
|
+
"</div>"
|
|
11825
|
+
)
|
|
11826
|
+
# Prepend the segmentation div to the type_upd string
|
|
11827
|
+
type_upd[i] = f"{segmentation_div} {type_upd[i]}"
|
|
11828
|
+
|
|
10816
11829
|
return type_upd
|
|
10817
11830
|
|
|
10818
11831
|
|