pointblank 0.8.6__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -7,6 +7,7 @@ import datetime
7
7
  import inspect
8
8
  import json
9
9
  import re
10
+ import tempfile
10
11
  import threading
11
12
  from dataclasses import dataclass
12
13
  from importlib.metadata import version
@@ -57,6 +58,7 @@ from pointblank._interrogation import (
57
58
  RowCountMatch,
58
59
  RowsDistinct,
59
60
  )
61
+ from pointblank._typing import SegmentSpec
60
62
  from pointblank._utils import (
61
63
  _check_any_df_lib,
62
64
  _check_invalid_fields,
@@ -87,6 +89,8 @@ from pointblank.thresholds import (
87
89
  )
88
90
 
89
91
  if TYPE_CHECKING:
92
+ from collections.abc import Collection
93
+
90
94
  from pointblank._typing import AbsoluteBounds, Tolerance
91
95
 
92
96
  __all__ = [
@@ -117,16 +121,18 @@ def _action_context_manager(metadata):
117
121
  delattr(_action_context, "metadata")
118
122
 
119
123
 
120
- def get_action_metadata():
124
+ def get_action_metadata() -> dict | None:
121
125
  """Access step-level metadata when authoring custom actions.
122
126
 
123
127
  Get the metadata for the validation step where an action was triggered. This can be called by
124
- user functions to get the metadata for the current action.
128
+ user functions to get the metadata for the current action. This function can only be used within
129
+ callables crafted for the [`Actions`](`pointblank.Actions`) class.
125
130
 
126
131
  Returns
127
132
  -------
128
- dict
129
- A dictionary containing the metadata for the current step.
133
+ dict | None
134
+ A dictionary containing the metadata for the current step. If called outside of an action
135
+ (i.e., when no action is being executed), this function will return `None`.
130
136
 
131
137
  Description of the Metadata Fields
132
138
  ----------------------------------
@@ -161,7 +167,7 @@ def get_action_metadata():
161
167
  thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
162
168
  actions=pb.Actions(warning=log_issue),
163
169
  )
164
- .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
170
+ .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
165
171
  .col_vals_gt(columns="item_revenue", value=0.05)
166
172
  .col_vals_gt(
167
173
  columns="session_duration",
@@ -179,6 +185,11 @@ def get_action_metadata():
179
185
  - the `metadata` is a dictionary that is used to craft the log message
180
186
  - the action is passed as a bare function to the `Actions` object within the `Validate` object
181
187
  (placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
188
+
189
+ See Also
190
+ --------
191
+ Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
192
+ actions for validation steps that exceed a set threshold value.
182
193
  """
183
194
  if hasattr(_action_context, "metadata"): # pragma: no cover
184
195
  return _action_context.metadata # pragma: no cover
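Editor's note: pieced together from the docstring fragments in the hunks above, here is a minimal sketch of the pattern `get_action_metadata()` supports, i.e., a bare function passed to `Actions` that reads the step metadata when a threshold is exceeded. The `game_revenue` dataset name and the `data=` argument are assumptions for illustration; the columns and regex echo the docstring's own example.

```python
import pointblank as pb

def log_issue():
    # Inside an action callable this returns the step-level metadata dict;
    # outside of an action context it returns None (per the updated docstring).
    metadata = pb.get_action_metadata()
    if metadata is not None:
        print(f"Threshold exceeded; step metadata: {metadata}")

validation = (
    pb.Validate(
        data=pb.load_dataset(dataset="game_revenue"),  # assumed example dataset
        thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
        actions=pb.Actions(warning=log_issue),
    )
    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
    .col_vals_gt(columns="item_revenue", value=0.05)
    .interrogate()
)
```
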
@@ -202,17 +213,19 @@ def _final_action_context_manager(summary):
202
213
  delattr(_final_action_context, "summary")
203
214
 
204
215
 
205
- def get_validation_summary():
216
+ def get_validation_summary() -> dict | None:
206
217
  """Access validation summary information when authoring final actions.
207
218
 
208
219
  This function provides a convenient way to access summary information about the validation
209
220
  process within a final action. It returns a dictionary with key metrics from the validation
210
- process.
221
+ process. This function can only be used within callables crafted for the
222
+ [`FinalActions`](`pointblank.FinalActions`) class.
211
223
 
212
224
  Returns
213
225
  -------
214
226
  dict | None
215
- A dictionary containing validation metrics, or None if called outside a final action.
227
+ A dictionary containing validation metrics. If called outside of a final action context,
228
+ this function will return `None`.
216
229
 
217
230
  Description of the Summary Fields
218
231
  --------------------------------
@@ -302,6 +315,11 @@ def get_validation_summary():
302
315
 
303
316
  Final actions work well with both simple logging and more complex notification systems, allowing
304
317
  you to integrate validation results into your broader data quality workflows.
318
+
319
+ See Also
320
+ --------
321
+ Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
322
+ custom actions that are executed after all validation steps have been completed.
305
323
  """
306
324
  if hasattr(_final_action_context, "summary"):
307
325
  return _final_action_context.summary
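Editor's note: likewise, a minimal sketch (not taken verbatim from the package) of the `get_validation_summary()` pattern the updated docstring describes: a callable registered with `FinalActions` that inspects the summary dict after all steps have run. The `final_actions=` parameter name and the `small_table` dataset are assumptions here.

```python
import pointblank as pb

def send_report():
    # Inside a final action callable this returns the validation summary dict;
    # outside of a final action context it returns None.
    summary = pb.get_validation_summary()
    if summary is not None:
        print(f"Validation finished; summary: {summary}")

validation = (
    pb.Validate(
        data=pb.load_dataset(dataset="small_table"),   # assumed example dataset
        final_actions=pb.FinalActions(send_report),    # assumed parameter name
    )
    .col_vals_gt(columns="d", value=100)
    .interrogate()
)
```
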
@@ -514,10 +532,10 @@ def load_dataset(
514
532
  data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
515
533
 
516
534
  # Unzip the DuckDB dataset to a temporary directory
517
- with ZipFile(data_path, "r") as z:
518
- z.extractall(path="datasets")
535
+ with tempfile.TemporaryDirectory() as tmp, ZipFile(data_path, "r") as z:
536
+ z.extractall(path=tmp)
519
537
 
520
- data_path = f"datasets/{dataset}.ddb"
538
+ data_path = f"{tmp}/{dataset}.ddb"
521
539
 
522
540
  dataset = ibis.connect(f"duckdb://{data_path}").table(dataset)
523
541
 
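Editor's note: for context, a hedged sketch of how this change surfaces to users (the `tbl_type=` value shown is assumed from the public `load_dataset()` documentation). As of 0.9.0 the zipped `.ddb` file is unpacked into a `tempfile`-managed directory instead of a persistent `datasets/` folder in the working directory.

```python
import pointblank as pb

# Request the DuckDB-backed variant of a bundled dataset; the extraction now
# happens in a temporary directory, so no "datasets/" folder is left behind.
tbl = pb.load_dataset(dataset="game_revenue", tbl_type="duckdb")
```
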
@@ -1781,14 +1799,15 @@ class _ValidationInfo:
1781
1799
  assertion_type
1782
1800
  The type of assertion. This is the method name of the validation (e.g., `"col_vals_gt"`).
1783
1801
  column
1784
- The column to validate. Currently we don't allow for column expressions (which may map to
1785
- multiple columns).
1802
+ The column(s) to validate.
1786
1803
  values
1787
1804
  The value or values to compare against.
1788
1805
  na_pass
1789
1806
  Whether to pass test units that hold missing values.
1790
1807
  pre
1791
1808
  A preprocessing function or lambda to apply to the data table for the validation step.
1809
+ segments
1810
+ The segments to use for the validation step.
1792
1811
  thresholds
1793
1812
  The threshold values for the validation.
1794
1813
  actions
@@ -1839,11 +1858,12 @@ class _ValidationInfo:
1839
1858
  step_id: str | None = None
1840
1859
  sha1: str | None = None
1841
1860
  assertion_type: str | None = None
1842
- column: str | None = None
1861
+ column: any | None = None
1843
1862
  values: any | list[any] | tuple | None = None
1844
1863
  inclusive: tuple[bool, bool] | None = None
1845
1864
  na_pass: bool | None = None
1846
1865
  pre: Callable | None = None
1866
+ segments: any | None = None
1847
1867
  thresholds: Thresholds | None = None
1848
1868
  actions: Actions | None = None
1849
1869
  label: str | None = None
@@ -1907,7 +1927,7 @@ class Validate:
1907
1927
  The table to validate, which could be a DataFrame object or an Ibis table object. Read the
1908
1928
  *Supported Input Table Types* section for details on the supported table types.
1909
1929
  tbl_name
1910
- A optional name to assign to the input table object. If no value is provided, a name will
1930
+ An optional name to assign to the input table object. If no value is provided, a name will
1911
1931
  be generated based on whatever information is available. This table name will be displayed
1912
1932
  in the header area of the tabular report.
1913
1933
  label
@@ -2321,6 +2341,7 @@ class Validate:
2321
2341
  value: float | int | Column,
2322
2342
  na_pass: bool = False,
2323
2343
  pre: Callable | None = None,
2344
+ segments: SegmentSpec | None = None,
2324
2345
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2325
2346
  actions: Actions | None = None,
2326
2347
  brief: str | bool | None = None,
@@ -2352,10 +2373,15 @@ class Validate:
2352
2373
  Should any encountered None, NA, or Null values be considered as passing test units? By
2353
2374
  default, this is `False`. Set to `True` to pass test units with missing values.
2354
2375
  pre
2355
- A optional preprocessing function or lambda to apply to the data table during
2376
+ An optional preprocessing function or lambda to apply to the data table during
2356
2377
  interrogation. This function should take a table as input and return a modified table.
2357
2378
  Have a look at the *Preprocessing* section for more information on how to use this
2358
2379
  argument.
2380
+ segments
2381
+ An optional directive on segmentation, which serves to split a validation step into
2382
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2383
+ column name and its corresponding values to segment on, or a combination of both
2384
+ (provided as a list). Read the *Segmentation* section for usage information.
2359
2385
  thresholds
2360
2386
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2361
2387
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2418,6 +2444,42 @@ class Validate:
2418
2444
  lifetime of the transformed table, it only exists during the validation step and is not
2419
2445
  stored in the `Validate` object or used in subsequent validation steps.
2420
2446
 
2447
+ Segmentation
2448
+ ------------
2449
+ The `segments=` argument allows for the segmentation of a validation step into multiple
2450
+ segments. This is useful for applying the same validation step to different subsets of the
2451
+ data. The segmentation can be done based on a single column or specific fields within a
2452
+ column.
2453
+
2454
+ Providing a single column name will result in a separate validation step for each unique
2455
+ value in that column. For example, if you have a column called `"region"` with values
2456
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
2457
+ region.
2458
+
2459
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
2460
+ values to segment on. For example, if you have a column called `"date"` and you want to
2461
+ segment on only specific dates, you can provide a tuple like
2462
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
2463
+ (i.e., no validation steps will be created for them).
2464
+
2465
+ A list with a combination of column names and tuples can be provided as well. This allows
2466
+ for more complex segmentation scenarios. The following inputs are all valid:
2467
+
2468
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2469
+ in the `"region"` column and specific dates in the `"date"` column
2470
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2471
+ columns
2472
+
2473
+ The segmentation is performed during interrogation, and the resulting validation steps will
2474
+ be numbered sequentially. Each segment will have its own validation step, and the results
2475
+ will be reported separately. This allows for a more granular analysis of the data and helps
2476
+ identify issues within specific segments.
2477
+
2478
+ Importantly, the segmentation process will be performed after any preprocessing of the data
2479
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
2480
+ that can be used for segmentation. For example, you could create a new column called
2481
+ `"segment"` through use of `pre=` and then use that column for segmentation.
2482
+
2421
2483
  Thresholds
2422
2484
  ----------
2423
2485
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
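Editor's note: to make the Segmentation section above concrete, here is a hedged sketch of `segments=` on a small Polars table (the column names and values are invented for illustration). A bare column name fans the step out over each unique value, while a `(column, values)` tuple restricts the segments to the listed values.

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "region": ["North", "South", "East", "North", "South"],
        "date": ["2023-01-01", "2023-01-01", "2023-01-02", "2023-01-02", "2023-01-03"],
        "revenue": [12.5, 8.1, 15.3, 9.9, 11.0],
    }
)

validation = (
    pb.Validate(data=tbl)
    # One validation step per unique value in `region`
    .col_vals_gt(columns="revenue", value=0, segments="region")
    # Steps only for the two listed dates; other dates are disregarded
    .col_vals_gt(
        columns="revenue",
        value=5,
        segments=("date", ["2023-01-01", "2023-01-02"]),
    )
    .interrogate()
)
```
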
@@ -2516,6 +2578,8 @@ class Validate:
2516
2578
  _check_column(column=columns)
2517
2579
  # _check_value_float_int(value=value)
2518
2580
  _check_pre(pre=pre)
2581
+ # TODO: add check for segments
2582
+ # _check_segments(segments=segments)
2519
2583
  _check_thresholds(thresholds=thresholds)
2520
2584
  _check_boolean_input(param=na_pass, param_name="na_pass")
2521
2585
  _check_boolean_input(param=active, param_name="active")
@@ -2548,6 +2612,7 @@ class Validate:
2548
2612
  values=value,
2549
2613
  na_pass=na_pass,
2550
2614
  pre=pre,
2615
+ segments=segments,
2551
2616
  thresholds=thresholds,
2552
2617
  actions=actions,
2553
2618
  brief=brief,
@@ -2564,6 +2629,7 @@ class Validate:
2564
2629
  value: float | int | Column,
2565
2630
  na_pass: bool = False,
2566
2631
  pre: Callable | None = None,
2632
+ segments: SegmentSpec | None = None,
2567
2633
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2568
2634
  actions: Actions | None = None,
2569
2635
  brief: str | bool | None = None,
@@ -2595,10 +2661,15 @@ class Validate:
2595
2661
  Should any encountered None, NA, or Null values be considered as passing test units? By
2596
2662
  default, this is `False`. Set to `True` to pass test units with missing values.
2597
2663
  pre
2598
- A optional preprocessing function or lambda to apply to the data table during
2664
+ An optional preprocessing function or lambda to apply to the data table during
2599
2665
  interrogation. This function should take a table as input and return a modified table.
2600
2666
  Have a look at the *Preprocessing* section for more information on how to use this
2601
2667
  argument.
2668
+ segments
2669
+ An optional directive on segmentation, which serves to split a validation step into
2670
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2671
+ column name and its corresponding values to segment on, or a combination of both
2672
+ (provided as a list). Read the *Segmentation* section for usage information.
2602
2673
  thresholds
2603
2674
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2604
2675
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2661,6 +2732,42 @@ class Validate:
2661
2732
  lifetime of the transformed table, it only exists during the validation step and is not
2662
2733
  stored in the `Validate` object or used in subsequent validation steps.
2663
2734
 
2735
+ Segmentation
2736
+ ------------
2737
+ The `segments=` argument allows for the segmentation of a validation step into multiple
2738
+ segments. This is useful for applying the same validation step to different subsets of the
2739
+ data. The segmentation can be done based on a single column or specific fields within a
2740
+ column.
2741
+
2742
+ Providing a single column name will result in a separate validation step for each unique
2743
+ value in that column. For example, if you have a column called `"region"` with values
2744
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
2745
+ region.
2746
+
2747
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
2748
+ values to segment on. For example, if you have a column called `"date"` and you want to
2749
+ segment on only specific dates, you can provide a tuple like
2750
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
2751
+ (i.e., no validation steps will be created for them).
2752
+
2753
+ A list with a combination of column names and tuples can be provided as well. This allows
2754
+ for more complex segmentation scenarios. The following inputs are all valid:
2755
+
2756
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2757
+ in the `"region"` column and specific dates in the `"date"` column
2758
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2759
+ columns
2760
+
2761
+ The segmentation is performed during interrogation, and the resulting validation steps will
2762
+ be numbered sequentially. Each segment will have its own validation step, and the results
2763
+ will be reported separately. This allows for a more granular analysis of the data and helps
2764
+ identify issues within specific segments.
2765
+
2766
+ Importantly, the segmentation process will be performed after any preprocessing of the data
2767
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
2768
+ that can be used for segmentation. For example, you could create a new column called
2769
+ `"segment"` through use of `pre=` and then use that column for segmentation.
2770
+
2664
2771
  Thresholds
2665
2772
  ----------
2666
2773
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2758,6 +2865,8 @@ class Validate:
2758
2865
  _check_column(column=columns)
2759
2866
  # _check_value_float_int(value=value)
2760
2867
  _check_pre(pre=pre)
2868
+ # TODO: add check for segments
2869
+ # _check_segments(segments=segments)
2761
2870
  _check_thresholds(thresholds=thresholds)
2762
2871
  _check_boolean_input(param=na_pass, param_name="na_pass")
2763
2872
  _check_boolean_input(param=active, param_name="active")
@@ -2790,6 +2899,7 @@ class Validate:
2790
2899
  values=value,
2791
2900
  na_pass=na_pass,
2792
2901
  pre=pre,
2902
+ segments=segments,
2793
2903
  thresholds=thresholds,
2794
2904
  actions=actions,
2795
2905
  brief=brief,
@@ -2806,6 +2916,7 @@ class Validate:
2806
2916
  value: float | int | Column,
2807
2917
  na_pass: bool = False,
2808
2918
  pre: Callable | None = None,
2919
+ segments: SegmentSpec | None = None,
2809
2920
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2810
2921
  actions: Actions | None = None,
2811
2922
  brief: str | bool | None = None,
@@ -2837,10 +2948,15 @@ class Validate:
2837
2948
  Should any encountered None, NA, or Null values be considered as passing test units? By
2838
2949
  default, this is `False`. Set to `True` to pass test units with missing values.
2839
2950
  pre
2840
- A optional preprocessing function or lambda to apply to the data table during
2951
+ An optional preprocessing function or lambda to apply to the data table during
2841
2952
  interrogation. This function should take a table as input and return a modified table.
2842
2953
  Have a look at the *Preprocessing* section for more information on how to use this
2843
2954
  argument.
2955
+ segments
2956
+ An optional directive on segmentation, which serves to split a validation step into
2957
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2958
+ column name and its corresponding values to segment on, or a combination of both
2959
+ (provided as a list). Read the *Segmentation* section for usage information.
2844
2960
  thresholds
2845
2961
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2846
2962
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2903,6 +3019,42 @@ class Validate:
2903
3019
  lifetime of the transformed table, it only exists during the validation step and is not
2904
3020
  stored in the `Validate` object or used in subsequent validation steps.
2905
3021
 
3022
+ Segmentation
3023
+ ------------
3024
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3025
+ segments. This is useful for applying the same validation step to different subsets of the
3026
+ data. The segmentation can be done based on a single column or specific fields within a
3027
+ column.
3028
+
3029
+ Providing a single column name will result in a separate validation step for each unique
3030
+ value in that column. For example, if you have a column called `"region"` with values
3031
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3032
+ region.
3033
+
3034
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3035
+ values to segment on. For example, if you have a column called `"date"` and you want to
3036
+ segment on only specific dates, you can provide a tuple like
3037
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3038
+ (i.e., no validation steps will be created for them).
3039
+
3040
+ A list with a combination of column names and tuples can be provided as well. This allows
3041
+ for more complex segmentation scenarios. The following inputs are all valid:
3042
+
3043
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3044
+ in the `"region"` column and specific dates in the `"date"` column
3045
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3046
+ columns
3047
+
3048
+ The segmentation is performed during interrogation, and the resulting validation steps will
3049
+ be numbered sequentially. Each segment will have its own validation step, and the results
3050
+ will be reported separately. This allows for a more granular analysis of the data and helps
3051
+ identify issues within specific segments.
3052
+
3053
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3054
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3055
+ that can be used for segmentation. For example, you could create a new column called
3056
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3057
+
2906
3058
  Thresholds
2907
3059
  ----------
2908
3060
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2999,6 +3151,8 @@ class Validate:
2999
3151
  _check_column(column=columns)
3000
3152
  # _check_value_float_int(value=value)
3001
3153
  _check_pre(pre=pre)
3154
+ # TODO: add check for segments
3155
+ # _check_segments(segments=segments)
3002
3156
  _check_thresholds(thresholds=thresholds)
3003
3157
  _check_boolean_input(param=na_pass, param_name="na_pass")
3004
3158
  _check_boolean_input(param=active, param_name="active")
@@ -3031,6 +3185,7 @@ class Validate:
3031
3185
  values=value,
3032
3186
  na_pass=na_pass,
3033
3187
  pre=pre,
3188
+ segments=segments,
3034
3189
  thresholds=thresholds,
3035
3190
  actions=actions,
3036
3191
  brief=brief,
@@ -3047,6 +3202,7 @@ class Validate:
3047
3202
  value: float | int | Column,
3048
3203
  na_pass: bool = False,
3049
3204
  pre: Callable | None = None,
3205
+ segments: SegmentSpec | None = None,
3050
3206
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3051
3207
  actions: Actions | None = None,
3052
3208
  brief: str | bool | None = None,
@@ -3078,10 +3234,15 @@ class Validate:
3078
3234
  Should any encountered None, NA, or Null values be considered as passing test units? By
3079
3235
  default, this is `False`. Set to `True` to pass test units with missing values.
3080
3236
  pre
3081
- A optional preprocessing function or lambda to apply to the data table during
3237
+ An optional preprocessing function or lambda to apply to the data table during
3082
3238
  interrogation. This function should take a table as input and return a modified table.
3083
3239
  Have a look at the *Preprocessing* section for more information on how to use this
3084
3240
  argument.
3241
+ segments
3242
+ An optional directive on segmentation, which serves to split a validation step into
3243
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3244
+ column name and its corresponding values to segment on, or a combination of both
3245
+ (provided as a list). Read the *Segmentation* section for usage information.
3085
3246
  thresholds
3086
3247
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3087
3248
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3144,6 +3305,42 @@ class Validate:
3144
3305
  lifetime of the transformed table, it only exists during the validation step and is not
3145
3306
  stored in the `Validate` object or used in subsequent validation steps.
3146
3307
 
3308
+ Segmentation
3309
+ ------------
3310
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3311
+ segments. This is useful for applying the same validation step to different subsets of the
3312
+ data. The segmentation can be done based on a single column or specific fields within a
3313
+ column.
3314
+
3315
+ Providing a single column name will result in a separate validation step for each unique
3316
+ value in that column. For example, if you have a column called `"region"` with values
3317
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3318
+ region.
3319
+
3320
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3321
+ values to segment on. For example, if you have a column called `"date"` and you want to
3322
+ segment on only specific dates, you can provide a tuple like
3323
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3324
+ (i.e., no validation steps will be created for them).
3325
+
3326
+ A list with a combination of column names and tuples can be provided as well. This allows
3327
+ for more complex segmentation scenarios. The following inputs are all valid:
3328
+
3329
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3330
+ in the `"region"` column and specific dates in the `"date"` column
3331
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3332
+ columns
3333
+
3334
+ The segmentation is performed during interrogation, and the resulting validation steps will
3335
+ be numbered sequentially. Each segment will have its own validation step, and the results
3336
+ will be reported separately. This allows for a more granular analysis of the data and helps
3337
+ identify issues within specific segments.
3338
+
3339
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3340
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3341
+ that can be used for segmentation. For example, you could create a new column called
3342
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3343
+
3147
3344
  Thresholds
3148
3345
  ----------
3149
3346
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3238,6 +3435,8 @@ class Validate:
3238
3435
  _check_column(column=columns)
3239
3436
  # _check_value_float_int(value=value)
3240
3437
  _check_pre(pre=pre)
3438
+ # TODO: add check for segments
3439
+ # _check_segments(segments=segments)
3241
3440
  _check_thresholds(thresholds=thresholds)
3242
3441
  _check_boolean_input(param=na_pass, param_name="na_pass")
3243
3442
  _check_boolean_input(param=active, param_name="active")
@@ -3270,6 +3469,7 @@ class Validate:
3270
3469
  values=value,
3271
3470
  na_pass=na_pass,
3272
3471
  pre=pre,
3472
+ segments=segments,
3273
3473
  thresholds=thresholds,
3274
3474
  actions=actions,
3275
3475
  brief=brief,
@@ -3286,6 +3486,7 @@ class Validate:
3286
3486
  value: float | int | Column,
3287
3487
  na_pass: bool = False,
3288
3488
  pre: Callable | None = None,
3489
+ segments: SegmentSpec | None = None,
3289
3490
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3290
3491
  actions: Actions | None = None,
3291
3492
  brief: str | bool | None = None,
@@ -3317,10 +3518,15 @@ class Validate:
3317
3518
  Should any encountered None, NA, or Null values be considered as passing test units? By
3318
3519
  default, this is `False`. Set to `True` to pass test units with missing values.
3319
3520
  pre
3320
- A optional preprocessing function or lambda to apply to the data table during
3521
+ An optional preprocessing function or lambda to apply to the data table during
3321
3522
  interrogation. This function should take a table as input and return a modified table.
3322
3523
  Have a look at the *Preprocessing* section for more information on how to use this
3323
3524
  argument.
3525
+ segments
3526
+ An optional directive on segmentation, which serves to split a validation step into
3527
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3528
+ column name and its corresponding values to segment on, or a combination of both
3529
+ (provided as a list). Read the *Segmentation* section for usage information.
3324
3530
  thresholds
3325
3531
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3326
3532
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3383,6 +3589,42 @@ class Validate:
3383
3589
  lifetime of the transformed table, it only exists during the validation step and is not
3384
3590
  stored in the `Validate` object or used in subsequent validation steps.
3385
3591
 
3592
+ Segmentation
3593
+ ------------
3594
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3595
+ segments. This is useful for applying the same validation step to different subsets of the
3596
+ data. The segmentation can be done based on a single column or specific fields within a
3597
+ column.
3598
+
3599
+ Providing a single column name will result in a separate validation step for each unique
3600
+ value in that column. For example, if you have a column called `"region"` with values
3601
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3602
+ region.
3603
+
3604
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3605
+ values to segment on. For example, if you have a column called `"date"` and you want to
3606
+ segment on only specific dates, you can provide a tuple like
3607
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3608
+ (i.e., no validation steps will be created for them).
3609
+
3610
+ A list with a combination of column names and tuples can be provided as well. This allows
3611
+ for more complex segmentation scenarios. The following inputs are all valid:
3612
+
3613
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3614
+ in the `"region"` column and specific dates in the `"date"` column
3615
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3616
+ columns
3617
+
3618
+ The segmentation is performed during interrogation, and the resulting validation steps will
3619
+ be numbered sequentially. Each segment will have its own validation step, and the results
3620
+ will be reported separately. This allows for a more granular analysis of the data and helps
3621
+ identify issues within specific segments.
3622
+
3623
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3624
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3625
+ that can be used for segmentation. For example, you could create a new column called
3626
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3627
+
3386
3628
  Thresholds
3387
3629
  ----------
3388
3630
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3481,6 +3723,8 @@ class Validate:
3481
3723
  _check_column(column=columns)
3482
3724
  # _check_value_float_int(value=value)
3483
3725
  _check_pre(pre=pre)
3726
+ # TODO: add check for segments
3727
+ # _check_segments(segments=segments)
3484
3728
  _check_thresholds(thresholds=thresholds)
3485
3729
  _check_boolean_input(param=na_pass, param_name="na_pass")
3486
3730
  _check_boolean_input(param=active, param_name="active")
@@ -3513,6 +3757,7 @@ class Validate:
3513
3757
  values=value,
3514
3758
  na_pass=na_pass,
3515
3759
  pre=pre,
3760
+ segments=segments,
3516
3761
  thresholds=thresholds,
3517
3762
  actions=actions,
3518
3763
  brief=brief,
@@ -3529,6 +3774,7 @@ class Validate:
3529
3774
  value: float | int | Column,
3530
3775
  na_pass: bool = False,
3531
3776
  pre: Callable | None = None,
3777
+ segments: SegmentSpec | None = None,
3532
3778
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3533
3779
  actions: Actions | None = None,
3534
3780
  brief: str | bool | None = None,
@@ -3560,10 +3806,15 @@ class Validate:
3560
3806
  Should any encountered None, NA, or Null values be considered as passing test units? By
3561
3807
  default, this is `False`. Set to `True` to pass test units with missing values.
3562
3808
  pre
3563
- A optional preprocessing function or lambda to apply to the data table during
3809
+ An optional preprocessing function or lambda to apply to the data table during
3564
3810
  interrogation. This function should take a table as input and return a modified table.
3565
3811
  Have a look at the *Preprocessing* section for more information on how to use this
3566
3812
  argument.
3813
+ segments
3814
+ An optional directive on segmentation, which serves to split a validation step into
3815
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3816
+ column name and its corresponding values to segment on, or a combination of both
3817
+ (provided as a list). Read the *Segmentation* section for usage information.
3567
3818
  thresholds
3568
3819
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3569
3820
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3626,6 +3877,42 @@ class Validate:
3626
3877
  lifetime of the transformed table, it only exists during the validation step and is not
3627
3878
  stored in the `Validate` object or used in subsequent validation steps.
3628
3879
 
3880
+ Segmentation
3881
+ ------------
3882
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3883
+ segments. This is useful for applying the same validation step to different subsets of the
3884
+ data. The segmentation can be done based on a single column or specific fields within a
3885
+ column.
3886
+
3887
+ Providing a single column name will result in a separate validation step for each unique
3888
+ value in that column. For example, if you have a column called `"region"` with values
3889
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3890
+ region.
3891
+
3892
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3893
+ values to segment on. For example, if you have a column called `"date"` and you want to
3894
+ segment on only specific dates, you can provide a tuple like
3895
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3896
+ (i.e., no validation steps will be created for them).
3897
+
3898
+ A list with a combination of column names and tuples can be provided as well. This allows
3899
+ for more complex segmentation scenarios. The following inputs are all valid:
3900
+
3901
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3902
+ in the `"region"` column and specific dates in the `"date"` column
3903
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3904
+ columns
3905
+
3906
+ The segmentation is performed during interrogation, and the resulting validation steps will
3907
+ be numbered sequentially. Each segment will have its own validation step, and the results
3908
+ will be reported separately. This allows for a more granular analysis of the data and helps
3909
+ identify issues within specific segments.
3910
+
3911
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3912
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3913
+ that can be used for segmentation. For example, you could create a new column called
3914
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3915
+
3629
3916
  Thresholds
3630
3917
  ----------
3631
3918
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3724,6 +4011,8 @@ class Validate:
3724
4011
  _check_column(column=columns)
3725
4012
  # _check_value_float_int(value=value)
3726
4013
  _check_pre(pre=pre)
4014
+ # TODO: add check for segments
4015
+ # _check_segments(segments=segments)
3727
4016
  _check_thresholds(thresholds=thresholds)
3728
4017
  _check_boolean_input(param=na_pass, param_name="na_pass")
3729
4018
  _check_boolean_input(param=active, param_name="active")
@@ -3756,6 +4045,7 @@ class Validate:
3756
4045
  values=value,
3757
4046
  na_pass=na_pass,
3758
4047
  pre=pre,
4048
+ segments=segments,
3759
4049
  thresholds=thresholds,
3760
4050
  actions=actions,
3761
4051
  brief=brief,
@@ -3774,6 +4064,7 @@ class Validate:
3774
4064
  inclusive: tuple[bool, bool] = (True, True),
3775
4065
  na_pass: bool = False,
3776
4066
  pre: Callable | None = None,
4067
+ segments: SegmentSpec | None = None,
3777
4068
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3778
4069
  actions: Actions | None = None,
3779
4070
  brief: str | bool | None = None,
@@ -3815,10 +4106,15 @@ class Validate:
3815
4106
  Should any encountered None, NA, or Null values be considered as passing test units? By
3816
4107
  default, this is `False`. Set to `True` to pass test units with missing values.
3817
4108
  pre
3818
- A optional preprocessing function or lambda to apply to the data table during
4109
+ An optional preprocessing function or lambda to apply to the data table during
3819
4110
  interrogation. This function should take a table as input and return a modified table.
3820
4111
  Have a look at the *Preprocessing* section for more information on how to use this
3821
4112
  argument.
4113
+ segments
4114
+ An optional directive on segmentation, which serves to split a validation step into
4115
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4116
+ column name and its corresponding values to segment on, or a combination of both
4117
+ (provided as a list). Read the *Segmentation* section for usage information.
3822
4118
  thresholds
3823
4119
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3824
4120
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3883,6 +4179,42 @@ class Validate:
3883
4179
  lifetime of the transformed table, it only exists during the validation step and is not
3884
4180
  stored in the `Validate` object or used in subsequent validation steps.
3885
4181
 
4182
+ Segmentation
4183
+ ------------
4184
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4185
+ segments. This is useful for applying the same validation step to different subsets of the
4186
+ data. The segmentation can be done based on a single column or specific fields within a
4187
+ column.
4188
+
4189
+ Providing a single column name will result in a separate validation step for each unique
4190
+ value in that column. For example, if you have a column called `"region"` with values
4191
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4192
+ region.
4193
+
4194
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4195
+ values to segment on. For example, if you have a column called `"date"` and you want to
4196
+ segment on only specific dates, you can provide a tuple like
4197
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4198
+ (i.e., no validation steps will be created for them).
4199
+
4200
+ A list with a combination of column names and tuples can be provided as well. This allows
4201
+ for more complex segmentation scenarios. The following inputs are all valid:
4202
+
4203
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4204
+ in the `"region"` column and specific dates in the `"date"` column
4205
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4206
+ columns
4207
+
4208
+ The segmentation is performed during interrogation, and the resulting validation steps will
4209
+ be numbered sequentially. Each segment will have its own validation step, and the results
4210
+ will be reported separately. This allows for a more granular analysis of the data and helps
4211
+ identify issues within specific segments.
4212
+
4213
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4214
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4215
+ that can be used for segmentation. For example, you could create a new column called
4216
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4217
+
3886
4218
  Thresholds
3887
4219
  ----------
3888
4220
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3990,6 +4322,8 @@ class Validate:
3990
4322
  # _check_value_float_int(value=left)
3991
4323
  # _check_value_float_int(value=right)
3992
4324
  _check_pre(pre=pre)
4325
+ # TODO: add check for segments
4326
+ # _check_segments(segments=segments)
3993
4327
  _check_thresholds(thresholds=thresholds)
3994
4328
  _check_boolean_input(param=na_pass, param_name="na_pass")
3995
4329
  _check_boolean_input(param=active, param_name="active")
@@ -4027,6 +4361,7 @@ class Validate:
4027
4361
  inclusive=inclusive,
4028
4362
  na_pass=na_pass,
4029
4363
  pre=pre,
4364
+ segments=segments,
4030
4365
  thresholds=thresholds,
4031
4366
  actions=actions,
4032
4367
  brief=brief,
@@ -4045,6 +4380,7 @@ class Validate:
4045
4380
  inclusive: tuple[bool, bool] = (True, True),
4046
4381
  na_pass: bool = False,
4047
4382
  pre: Callable | None = None,
4383
+ segments: SegmentSpec | None = None,
4048
4384
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4049
4385
  actions: Actions | None = None,
4050
4386
  brief: str | bool | None = None,
@@ -4086,10 +4422,15 @@ class Validate:
4086
4422
  Should any encountered None, NA, or Null values be considered as passing test units? By
4087
4423
  default, this is `False`. Set to `True` to pass test units with missing values.
4088
4424
  pre
4089
- A optional preprocessing function or lambda to apply to the data table during
4425
+ An optional preprocessing function or lambda to apply to the data table during
4090
4426
  interrogation. This function should take a table as input and return a modified table.
4091
4427
  Have a look at the *Preprocessing* section for more information on how to use this
4092
4428
  argument.
4429
+ segments
4430
+ An optional directive on segmentation, which serves to split a validation step into
4431
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4432
+ column name and its corresponding values to segment on, or a combination of both
4433
+ (provided as a list). Read the *Segmentation* section for usage information.
4093
4434
  thresholds
4094
4435
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4095
4436
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4154,6 +4495,42 @@ class Validate:
4154
4495
  lifetime of the transformed table, it only exists during the validation step and is not
4155
4496
  stored in the `Validate` object or used in subsequent validation steps.
4156
4497
 
4498
+ Segmentation
4499
+ ------------
4500
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4501
+ segments. This is useful for applying the same validation step to different subsets of the
4502
+ data. The segmentation can be done based on a single column or specific fields within a
4503
+ column.
4504
+
4505
+ Providing a single column name will result in a separate validation step for each unique
4506
+ value in that column. For example, if you have a column called `"region"` with values
4507
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4508
+ region.
4509
+
4510
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4511
+ values to segment on. For example, if you have a column called `"date"` and you want to
4512
+ segment on only specific dates, you can provide a tuple like
4513
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4514
+ (i.e., no validation steps will be created for them).
4515
+
4516
+ A list with a combination of column names and tuples can be provided as well. This allows
4517
+ for more complex segmentation scenarios. The following inputs are all valid:
4518
+
4519
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4520
+ in the `"region"` column and specific dates in the `"date"` column
4521
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4522
+ columns
4523
+
4524
+ The segmentation is performed during interrogation, and the resulting validation steps will
4525
+ be numbered sequentially. Each segment will have its own validation step, and the results
4526
+ will be reported separately. This allows for a more granular analysis of the data and helps
4527
+ identify issues within specific segments.
4528
+
4529
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4530
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4531
+ that can be used for segmentation. For example, you could create a new column called
4532
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4533
+
4157
4534
  Thresholds
4158
4535
  ----------
4159
4536
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4261,6 +4638,8 @@ class Validate:
4261
4638
  # _check_value_float_int(value=left)
4262
4639
  # _check_value_float_int(value=right)
4263
4640
  _check_pre(pre=pre)
4641
+ # TODO: add check for segments
4642
+ # _check_segments(segments=segments)
4264
4643
  _check_thresholds(thresholds=thresholds)
4265
4644
  _check_boolean_input(param=na_pass, param_name="na_pass")
4266
4645
  _check_boolean_input(param=active, param_name="active")
@@ -4298,6 +4677,7 @@ class Validate:
4298
4677
  inclusive=inclusive,
4299
4678
  na_pass=na_pass,
4300
4679
  pre=pre,
4680
+ segments=segments,
4301
4681
  thresholds=thresholds,
4302
4682
  actions=actions,
4303
4683
  brief=brief,
@@ -4311,8 +4691,9 @@ class Validate:
4311
4691
  def col_vals_in_set(
4312
4692
  self,
4313
4693
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4314
- set: list[float | int],
4694
+ set: Collection[Any],
4315
4695
  pre: Callable | None = None,
4696
+ segments: SegmentSpec | None = None,
4316
4697
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4317
4698
  actions: Actions | None = None,
4318
4699
  brief: str | bool | None = None,
@@ -4336,10 +4717,15 @@ class Validate:
4336
4717
  set
4337
4718
  A list of values to compare against.
4338
4719
  pre
4339
- A optional preprocessing function or lambda to apply to the data table during
4720
+ An optional preprocessing function or lambda to apply to the data table during
4340
4721
  interrogation. This function should take a table as input and return a modified table.
4341
4722
  Have a look at the *Preprocessing* section for more information on how to use this
4342
4723
  argument.
4724
+ segments
4725
+ An optional directive on segmentation, which serves to split a validation step into
4726
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4727
+ column name and its corresponding values to segment on, or a combination of both
4728
+ (provided as a list). Read the *Segmentation* section for usage information.
4343
4729
  thresholds
4344
4730
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4345
4731
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4381,6 +4767,42 @@ class Validate:
4381
4767
  only exists during the validation step and is not stored in the `Validate` object or used in
4382
4768
  subsequent validation steps.
4383
4769
 
4770
+ Segmentation
4771
+ ------------
4772
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4773
+ segments. This is useful for applying the same validation step to different subsets of the
4774
+ data. The segmentation can be done based on a single column or specific fields within a
4775
+ column.
4776
+
4777
+ Providing a single column name will result in a separate validation step for each unique
4778
+ value in that column. For example, if you have a column called `"region"` with values
4779
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4780
+ region.
4781
+
4782
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4783
+ values to segment on. For example, if you have a column called `"date"` and you want to
4784
+ segment on only specific dates, you can provide a tuple like
4785
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4786
+ (i.e., no validation steps will be created for them).
4787
+
4788
+ A list with a combination of column names and tuples can be provided as well. This allows
4789
+ for more complex segmentation scenarios. The following inputs are all valid:
4790
+
4791
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4792
+ in the `"region"` column and specific dates in the `"date"` column
4793
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4794
+ columns
4795
+
4796
+ The segmentation is performed during interrogation, and the resulting validation steps will
4797
+ be numbered sequentially. Each segment will have its own validation step, and the results
4798
+ will be reported separately. This allows for a more granular analysis of the data and helps
4799
+ identify issues within specific segments.
4800
+
4801
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4802
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4803
+ that can be used for segmentation. For example, you could create a new column called
4804
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4805
+
4384
4806
  Thresholds
4385
4807
  ----------
4386
4808
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
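Editor's note: the Segmentation section also points out that `pre=` runs before segmentation, so a derived column can serve as the segmentation key. A hedged sketch of that idea for `col_vals_in_set()`, assuming for illustration that the `pre=` callable receives the native Polars table:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "status": ["ok", "ok", "failed", "ok", "failed"],
        "score": [0.91, 0.42, 0.10, 0.77, 0.33],
    }
)

validation = (
    pb.Validate(data=tbl)
    .col_vals_in_set(
        columns="status",
        set=["ok", "failed"],
        # `pre=` is applied first, so the derived `band` column exists by the
        # time the step is split into one segment per unique `band` value.
        pre=lambda df: df.with_columns(
            band=pl.when(pl.col("score") >= 0.5)
            .then(pl.lit("high"))
            .otherwise(pl.lit("low"))
        ),
        segments="band",
    )
    .interrogate()
)
```
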
@@ -4471,8 +4893,16 @@ class Validate:
4471
4893
  assertion_type = _get_fn_name()
4472
4894
 
4473
4895
  _check_column(column=columns)
4474
- _check_set_types(set=set)
4896
+
4897
+ for val in set:
4898
+ if val is None:
4899
+ continue
4900
+ if not isinstance(val, (float, int, str)):
4901
+ raise ValueError("`set=` must be a list of floats, integers, or strings.")
4902
+
4475
4903
  _check_pre(pre=pre)
4904
+ # TODO: add check for segments
4905
+ # _check_segments(segments=segments)
4476
4906
  _check_thresholds(thresholds=thresholds)
4477
4907
  _check_boolean_input(param=active, param_name="active")
4478
4908
 
@@ -4500,6 +4930,7 @@ class Validate:
4500
4930
  column=column,
4501
4931
  values=set,
4502
4932
  pre=pre,
4933
+ segments=segments,
4503
4934
  thresholds=thresholds,
4504
4935
  actions=actions,
4505
4936
  brief=brief,
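Editor's note: the new inline check on `set=` (shown two hunks up) accepts floats, ints, and strings and skips `None`, so a null can be listed as an allowed value directly. A small sketch of what the relaxed check now accepts at the API surface; whether null rows then pass the step depends on the data and backend.

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"tier": ["bronze", "silver", None, "gold"]})

validation = (
    pb.Validate(data=tbl)
    # `set=` may now mix strings and `None`; the 0.9.0 check skips `None` and
    # otherwise requires floats, ints, or strings.
    .col_vals_in_set(columns="tier", set=["bronze", "silver", "gold", None])
    .interrogate()
)
```
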
@@ -4515,6 +4946,7 @@ class Validate:
4515
4946
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4516
4947
  set: list[float | int],
4517
4948
  pre: Callable | None = None,
4949
+ segments: SegmentSpec | None = None,
4518
4950
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4519
4951
  actions: Actions | None = None,
4520
4952
  brief: str | bool | None = None,
@@ -4538,10 +4970,15 @@ class Validate:
4538
4970
  set
4539
4971
  A list of values to compare against.
4540
4972
  pre
4541
- A optional preprocessing function or lambda to apply to the data table during
4973
+ An optional preprocessing function or lambda to apply to the data table during
4542
4974
  interrogation. This function should take a table as input and return a modified table.
4543
4975
  Have a look at the *Preprocessing* section for more information on how to use this
4544
4976
  argument.
4977
+ segments
4978
+ An optional directive on segmentation, which serves to split a validation step into
4979
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4980
+ column name and its corresponding values to segment on, or a combination of both
4981
+ (provided as a list). Read the *Segmentation* section for usage information.
4545
4982
  thresholds
4546
4983
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4547
4984
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4583,6 +5020,42 @@ class Validate:
4583
5020
  only exists during the validation step and is not stored in the `Validate` object or used in
4584
5021
  subsequent validation steps.
4585
5022
 
5023
+ Segmentation
5024
+ ------------
5025
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5026
+ segments. This is useful for applying the same validation step to different subsets of the
5027
+ data. The segmentation can be done based on a single column or specific fields within a
5028
+ column.
5029
+
5030
+ Providing a single column name will result in a separate validation step for each unique
5031
+ value in that column. For example, if you have a column called `"region"` with values
5032
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5033
+ region.
5034
+
5035
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5036
+ values to segment on. For example, if you have a column called `"date"` and you want to
5037
+ segment on only specific dates, you can provide a tuple like
5038
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5039
+ (i.e., no validation steps will be created for them).
5040
+
5041
+ A list with a combination of column names and tuples can be provided as well. This allows
5042
+ for more complex segmentation scenarios. The following inputs are all valid:
5043
+
5044
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5045
+ in the `"region"` column and specific dates in the `"date"` column
5046
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5047
+ columns
5048
+
5049
+ The segmentation is performed during interrogation, and the resulting validation steps will
5050
+ be numbered sequentially. Each segment will have its own validation step, and the results
5051
+ will be reported separately. This allows for a more granular analysis of the data and helps
5052
+ identify issues within specific segments.
5053
+
5054
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5055
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5056
+ that can be used for segmentation. For example, you could create a new column called
5057
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5058
+
4586
5059
  Thresholds
4587
5060
  ----------
4588
5061
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4676,6 +5149,8 @@ class Validate:
4676
5149
  _check_column(column=columns)
4677
5150
  _check_set_types(set=set)
4678
5151
  _check_pre(pre=pre)
5152
+ # TODO: add check for segments
5153
+ # _check_segments(segments=segments)
4679
5154
  _check_thresholds(thresholds=thresholds)
4680
5155
  _check_boolean_input(param=active, param_name="active")
4681
5156
 
@@ -4703,6 +5178,7 @@ class Validate:
4703
5178
  column=column,
4704
5179
  values=set,
4705
5180
  pre=pre,
5181
+ segments=segments,
4706
5182
  thresholds=thresholds,
4707
5183
  actions=actions,
4708
5184
  brief=brief,
@@ -4717,6 +5193,7 @@ class Validate:
4717
5193
  self,
4718
5194
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4719
5195
  pre: Callable | None = None,
5196
+ segments: SegmentSpec | None = None,
4720
5197
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4721
5198
  actions: Actions | None = None,
4722
5199
  brief: str | bool | None = None,
@@ -4737,10 +5214,15 @@ class Validate:
4737
5214
  multiple columns are supplied or resolved, there will be a separate validation step
4738
5215
  generated for each column.
4739
5216
  pre
4740
- A optional preprocessing function or lambda to apply to the data table during
5217
+ An optional preprocessing function or lambda to apply to the data table during
4741
5218
  interrogation. This function should take a table as input and return a modified table.
4742
5219
  Have a look at the *Preprocessing* section for more information on how to use this
4743
5220
  argument.
5221
+ segments
5222
+ An optional directive on segmentation, which serves to split a validation step into
5223
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5224
+ column name and its corresponding values to segment on, or a combination of both
5225
+ (provided as a list). Read the *Segmentation* section for usage information.
4744
5226
  thresholds
4745
5227
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4746
5228
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4782,6 +5264,42 @@ class Validate:
4782
5264
  only exists during the validation step and is not stored in the `Validate` object or used in
4783
5265
  subsequent validation steps.
4784
5266
 
5267
+ Segmentation
5268
+ ------------
5269
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5270
+ segments. This is useful for applying the same validation step to different subsets of the
5271
+ data. The segmentation can be done based on a single column or specific fields within a
5272
+ column.
5273
+
5274
+ Providing a single column name will result in a separate validation step for each unique
5275
+ value in that column. For example, if you have a column called `"region"` with values
5276
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5277
+ region.
5278
+
5279
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5280
+ values to segment on. For example, if you have a column called `"date"` and you want to
5281
+ segment on only specific dates, you can provide a tuple like
5282
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5283
+ (i.e., no validation steps will be created for them).
5284
+
5285
+ A list with a combination of column names and tuples can be provided as well. This allows
5286
+ for more complex segmentation scenarios. The following inputs are all valid:
5287
+
5288
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5289
+ in the `"region"` column and specific dates in the `"date"` column
5290
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5291
+ columns
5292
+
5293
+ The segmentation is performed during interrogation, and the resulting validation steps will
5294
+ be numbered sequentially. Each segment will have its own validation step, and the results
5295
+ will be reported separately. This allows for a more granular analysis of the data and helps
5296
+ identify issues within specific segments.
5297
+
5298
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5299
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5300
+ that can be used for segmentation. For example, you could create a new column called
5301
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5302
+
4785
5303
  Thresholds
4786
5304
  ----------
4787
5305
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4871,6 +5389,8 @@ class Validate:
4871
5389
 
4872
5390
  _check_column(column=columns)
4873
5391
  _check_pre(pre=pre)
5392
+ # TODO: add check for segments
5393
+ # _check_segments(segments=segments)
4874
5394
  _check_thresholds(thresholds=thresholds)
4875
5395
  _check_boolean_input(param=active, param_name="active")
4876
5396
 
@@ -4897,6 +5417,7 @@ class Validate:
4897
5417
  assertion_type=assertion_type,
4898
5418
  column=column,
4899
5419
  pre=pre,
5420
+ segments=segments,
4900
5421
  thresholds=thresholds,
4901
5422
  actions=actions,
4902
5423
  brief=brief,
@@ -4911,6 +5432,7 @@ class Validate:
4911
5432
  self,
4912
5433
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4913
5434
  pre: Callable | None = None,
5435
+ segments: SegmentSpec | None = None,
4914
5436
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4915
5437
  actions: Actions | None = None,
4916
5438
  brief: str | bool | None = None,
@@ -4931,10 +5453,15 @@ class Validate:
4931
5453
  multiple columns are supplied or resolved, there will be a separate validation step
4932
5454
  generated for each column.
4933
5455
  pre
4934
- A optional preprocessing function or lambda to apply to the data table during
5456
+ An optional preprocessing function or lambda to apply to the data table during
4935
5457
  interrogation. This function should take a table as input and return a modified table.
4936
5458
  Have a look at the *Preprocessing* section for more information on how to use this
4937
5459
  argument.
5460
+ segments
5461
+ An optional directive on segmentation, which serves to split a validation step into
5462
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5463
+ column name and its corresponding values to segment on, or a combination of both
5464
+ (provided as a list). Read the *Segmentation* section for usage information.
4938
5465
  thresholds
4939
5466
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4940
5467
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4976,6 +5503,42 @@ class Validate:
4976
5503
  only exists during the validation step and is not stored in the `Validate` object or used in
4977
5504
  subsequent validation steps.
4978
5505
 
5506
+ Segmentation
5507
+ ------------
5508
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5509
+ segments. This is useful for applying the same validation step to different subsets of the
5510
+ data. The segmentation can be done based on a single column or specific fields within a
5511
+ column.
5512
+
5513
+ Providing a single column name will result in a separate validation step for each unique
5514
+ value in that column. For example, if you have a column called `"region"` with values
5515
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5516
+ region.
5517
+
5518
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5519
+ values to segment on. For example, if you have a column called `"date"` and you want to
5520
+ segment on only specific dates, you can provide a tuple like
5521
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5522
+ (i.e., no validation steps will be created for them).
5523
+
5524
+ A list with a combination of column names and tuples can be provided as well. This allows
5525
+ for more complex segmentation scenarios. The following inputs are all valid:
5526
+
5527
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5528
+ in the `"region"` column and specific dates in the `"date"` column
5529
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5530
+ columns
5531
+
5532
+ The segmentation is performed during interrogation, and the resulting validation steps will
5533
+ be numbered sequentially. Each segment will have its own validation step, and the results
5534
+ will be reported separately. This allows for a more granular analysis of the data and helps
5535
+ identify issues within specific segments.
5536
+
5537
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5538
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5539
+ that can be used for segmentation. For example, you could create a new column called
5540
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5541
+
4979
5542
  Thresholds
4980
5543
  ----------
4981
5544
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5065,6 +5628,8 @@ class Validate:
5065
5628
 
5066
5629
  _check_column(column=columns)
5067
5630
  _check_pre(pre=pre)
5631
+ # TODO: add check for segments
5632
+ # _check_segments(segments=segments)
5068
5633
  _check_thresholds(thresholds=thresholds)
5069
5634
  _check_boolean_input(param=active, param_name="active")
5070
5635
 
@@ -5091,6 +5656,7 @@ class Validate:
5091
5656
  assertion_type=assertion_type,
5092
5657
  column=column,
5093
5658
  pre=pre,
5659
+ segments=segments,
5094
5660
  thresholds=thresholds,
5095
5661
  actions=actions,
5096
5662
  brief=brief,
@@ -5107,6 +5673,7 @@ class Validate:
5107
5673
  pattern: str,
5108
5674
  na_pass: bool = False,
5109
5675
  pre: Callable | None = None,
5676
+ segments: SegmentSpec | None = None,
5110
5677
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5111
5678
  actions: Actions | None = None,
5112
5679
  brief: str | bool | None = None,
@@ -5133,10 +5700,15 @@ class Validate:
5133
5700
  Should any encountered None, NA, or Null values be considered as passing test units? By
5134
5701
  default, this is `False`. Set to `True` to pass test units with missing values.
5135
5702
  pre
5136
- A optional preprocessing function or lambda to apply to the data table during
5703
+ An optional preprocessing function or lambda to apply to the data table during
5137
5704
  interrogation. This function should take a table as input and return a modified table.
5138
5705
  Have a look at the *Preprocessing* section for more information on how to use this
5139
5706
  argument.
5707
+ segments
5708
+ An optional directive on segmentation, which serves to split a validation step into
5709
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5710
+ column name and its corresponding values to segment on, or a combination of both
5711
+ (provided as a list). Read the *Segmentation* section for usage information.
5140
5712
  thresholds
5141
5713
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5142
5714
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5178,6 +5750,42 @@ class Validate:
5178
5750
  only exists during the validation step and is not stored in the `Validate` object or used in
5179
5751
  subsequent validation steps.
5180
5752
 
5753
+ Segmentation
5754
+ ------------
5755
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5756
+ segments. This is useful for applying the same validation step to different subsets of the
5757
+ data. The segmentation can be done based on a single column or specific fields within a
5758
+ column.
5759
+
5760
+ Providing a single column name will result in a separate validation step for each unique
5761
+ value in that column. For example, if you have a column called `"region"` with values
5762
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5763
+ region.
5764
+
5765
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5766
+ values to segment on. For example, if you have a column called `"date"` and you want to
5767
+ segment on only specific dates, you can provide a tuple like
5768
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5769
+ (i.e., no validation steps will be created for them).
5770
+
5771
+ A list with a combination of column names and tuples can be provided as well. This allows
5772
+ for more complex segmentation scenarios. The following inputs are all valid:
5773
+
5774
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5775
+ in the `"region"` column and specific dates in the `"date"` column
5776
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5777
+ columns
5778
+
5779
+ The segmentation is performed during interrogation, and the resulting validation steps will
5780
+ be numbered sequentially. Each segment will have its own validation step, and the results
5781
+ will be reported separately. This allows for a more granular analysis of the data and helps
5782
+ identify issues within specific segments.
5783
+
5784
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5785
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5786
+ that can be used for segmentation. For example, you could create a new column called
5787
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5788
+
5181
5789
  Thresholds
5182
5790
  ----------
5183
5791
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5269,6 +5877,8 @@ class Validate:
5269
5877
 
5270
5878
  _check_column(column=columns)
5271
5879
  _check_pre(pre=pre)
5880
+ # TODO: add check for segments
5881
+ # _check_segments(segments=segments)
5272
5882
  _check_thresholds(thresholds=thresholds)
5273
5883
  _check_boolean_input(param=na_pass, param_name="na_pass")
5274
5884
  _check_boolean_input(param=active, param_name="active")
@@ -5298,6 +5908,7 @@ class Validate:
5298
5908
  values=pattern,
5299
5909
  na_pass=na_pass,
5300
5910
  pre=pre,
5911
+ segments=segments,
5301
5912
  thresholds=thresholds,
5302
5913
  actions=actions,
5303
5914
  brief=brief,
@@ -5312,6 +5923,7 @@ class Validate:
5312
5923
  self,
5313
5924
  expr: any,
5314
5925
  pre: Callable | None = None,
5926
+ segments: SegmentSpec | None = None,
5315
5927
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5316
5928
  actions: Actions | None = None,
5317
5929
  brief: str | bool | None = None,
@@ -5333,10 +5945,15 @@ class Validate:
5333
5945
  be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
5334
5946
  should either be a lambda expression or a Narwhals column expression.
5335
5947
  pre
5336
- A optional preprocessing function or lambda to apply to the data table during
5948
+ An optional preprocessing function or lambda to apply to the data table during
5337
5949
  interrogation. This function should take a table as input and return a modified table.
5338
5950
  Have a look at the *Preprocessing* section for more information on how to use this
5339
5951
  argument.
5952
+ segments
5953
+ An optional directive on segmentation, which serves to split a validation step into
5954
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5955
+ column name and its corresponding values to segment on, or a combination of both
5956
+ (provided as a list). Read the *Segmentation* section for usage information.
5340
5957
  thresholds
5341
5958
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5342
5959
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5376,6 +5993,42 @@ class Validate:
5376
5993
  transformed table, it only exists during the validation step and is not stored in the
5377
5994
  `Validate` object or used in subsequent validation steps.
5378
5995
 
5996
+ Segmentation
5997
+ ------------
5998
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5999
+ segments. This is useful for applying the same validation step to different subsets of the
6000
+ data. The segmentation can be done based on a single column or specific fields within a
6001
+ column.
6002
+
6003
+ Providing a single column name will result in a separate validation step for each unique
6004
+ value in that column. For example, if you have a column called `"region"` with values
6005
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
6006
+ region.
6007
+
6008
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
6009
+ values to segment on. For example, if you have a column called `"date"` and you want to
6010
+ segment on only specific dates, you can provide a tuple like
6011
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
6012
+ (i.e., no validation steps will be created for them).
6013
+
6014
+ A list with a combination of column names and tuples can be provided as well. This allows
6015
+ for more complex segmentation scenarios. The following inputs are all valid:
6016
+
6017
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6018
+ in the `"region"` column and specific dates in the `"date"` column
6019
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6020
+ columns
6021
+
6022
+ The segmentation is performed during interrogation, and the resulting validation steps will
6023
+ be numbered sequentially. Each segment will have its own validation step, and the results
6024
+ will be reported separately. This allows for a more granular analysis of the data and helps
6025
+ identify issues within specific segments.
6026
+
6027
+ Importantly, the segmentation process will be performed after any preprocessing of the data
6028
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
6029
+ that can be used for segmentation. For example, you could create a new column called
6030
+ `"segment"` through use of `pre=` and then use that column for segmentation.
6031
+
5379
6032
  Thresholds
5380
6033
  ----------
5381
6034
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5453,6 +6106,8 @@ class Validate:
5453
6106
  # TODO: Add a check for the expression to ensure it's a valid expression object
5454
6107
  # _check_expr(expr=expr)
5455
6108
  _check_pre(pre=pre)
6109
+ # TODO: add check for segments
6110
+ # _check_segments(segments=segments)
5456
6111
  _check_thresholds(thresholds=thresholds)
5457
6112
  _check_boolean_input(param=active, param_name="active")
5458
6113
 
@@ -5469,6 +6124,7 @@ class Validate:
5469
6124
  column=None,
5470
6125
  values=expr,
5471
6126
  pre=pre,
6127
+ segments=segments,
5472
6128
  thresholds=thresholds,
5473
6129
  actions=actions,
5474
6130
  brief=brief,
@@ -5657,6 +6313,7 @@ class Validate:
5657
6313
  self,
5658
6314
  columns_subset: str | list[str] | None = None,
5659
6315
  pre: Callable | None = None,
6316
+ segments: SegmentSpec | None = None,
5660
6317
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5661
6318
  actions: Actions | None = None,
5662
6319
  brief: str | bool | None = None,
@@ -5677,10 +6334,15 @@ class Validate:
5677
6334
  columns are supplied, the distinct comparison will be made over the combination of
5678
6335
  values in those columns.
5679
6336
  pre
5680
- A optional preprocessing function or lambda to apply to the data table during
6337
+ An optional preprocessing function or lambda to apply to the data table during
5681
6338
  interrogation. This function should take a table as input and return a modified table.
5682
6339
  Have a look at the *Preprocessing* section for more information on how to use this
5683
6340
  argument.
6341
+ segments
6342
+ An optional directive on segmentation, which serves to split a validation step into
6343
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
6344
+ column name and its corresponding values to segment on, or a combination of both
6345
+ (provided as a list). Read the *Segmentation* section for usage information.
5684
6346
  thresholds
5685
6347
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5686
6348
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5722,6 +6384,42 @@ class Validate:
5722
6384
  table, it only exists during the validation step and is not stored in the `Validate` object
5723
6385
  or used in subsequent validation steps.
5724
6386
 
6387
+ Segmentation
6388
+ ------------
6389
+ The `segments=` argument allows for the segmentation of a validation step into multiple
6390
+ segments. This is useful for applying the same validation step to different subsets of the
6391
+ data. The segmentation can be done based on a single column or specific fields within a
6392
+ column.
6393
+
6394
+ Providing a single column name will result in a separate validation step for each unique
6395
+ value in that column. For example, if you have a column called `"region"` with values
6396
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
6397
+ region.
6398
+
6399
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
6400
+ values to segment on. For example, if you have a column called `"date"` and you want to
6401
+ segment on only specific dates, you can provide a tuple like
6402
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
6403
+ (i.e., no validation steps will be created for them).
6404
+
6405
+ A list with a combination of column names and tuples can be provided as well. This allows
6406
+ for more complex segmentation scenarios. The following inputs are all valid:
6407
+
6408
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6409
+ in the `"region"` column and specific dates in the `"date"` column
6410
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6411
+ columns
6412
+
6413
+ The segmentation is performed during interrogation, and the resulting validation steps will
6414
+ be numbered sequentially. Each segment will have its own validation step, and the results
6415
+ will be reported separately. This allows for a more granular analysis of the data and helps
6416
+ identify issues within specific segments.
6417
+
6418
+ Importantly, the segmentation process will be performed after any preprocessing of the data
6419
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
6420
+ that can be used for segmentation. For example, you could create a new column called
6421
+ `"segment"` through use of `pre=` and then use that column for segmentation.
6422
+
5725
6423
  Thresholds
5726
6424
  ----------
5727
6425
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5815,6 +6513,8 @@ class Validate:
5815
6513
  assertion_type = _get_fn_name()
5816
6514
 
5817
6515
  _check_pre(pre=pre)
6516
+ # TODO: add check for segments
6517
+ # _check_segments(segments=segments)
5818
6518
  _check_thresholds(thresholds=thresholds)
5819
6519
  _check_boolean_input(param=active, param_name="active")
5820
6520
 
@@ -5835,6 +6535,7 @@ class Validate:
5835
6535
  assertion_type=assertion_type,
5836
6536
  column=columns_subset,
5837
6537
  pre=pre,
6538
+ segments=segments,
5838
6539
  thresholds=thresholds,
5839
6540
  actions=actions,
5840
6541
  brief=brief,
@@ -5895,7 +6596,7 @@ class Validate:
5895
6596
  substring matches are allowed, so a schema data type of `Int` would match a target table
5896
6597
  data type of `Int64`.
5897
6598
  pre
5898
- A optional preprocessing function or lambda to apply to the data table during
6599
+ An optional preprocessing function or lambda to apply to the data table during
5899
6600
  interrogation. This function should take a table as input and return a modified table.
5900
6601
  Have a look at the *Preprocessing* section for more information on how to use this
5901
6602
  argument.
@@ -6108,7 +6809,7 @@ class Validate:
6108
6809
  Should the validation step be inverted? If `True`, then the expectation is that the row
6109
6810
  count of the target table should not match the specified `count=` value.
6110
6811
  pre
6111
- A optional preprocessing function or lambda to apply to the data table during
6812
+ An optional preprocessing function or lambda to apply to the data table during
6112
6813
  interrogation. This function should take a table as input and return a modified table.
6113
6814
  Have a look at the *Preprocessing* section for more information on how to use this
6114
6815
  argument.
@@ -6318,7 +7019,7 @@ class Validate:
6318
7019
  Should the validation step be inverted? If `True`, then the expectation is that the
6319
7020
  column count of the target table should not match the specified `count=` value.
6320
7021
  pre
6321
- A optional preprocessing function or lambda to apply to the data table during
7022
+ An optional preprocessing function or lambda to apply to the data table during
6322
7023
  interrogation. This function should take a table as input and return a modified table.
6323
7024
  Have a look at the *Preprocessing* section for more information on how to use this
6324
7025
  argument.
@@ -6836,10 +7537,14 @@ class Validate:
6836
7537
 
6837
7538
  self.time_start = datetime.datetime.now(datetime.timezone.utc)
6838
7539
 
6839
- # Expand `validation_info` by evaluating any column expressions in `column`
7540
+ # Expand `validation_info` by evaluating any column expressions in `columns=`
6840
7541
  # (the `_evaluate_column_exprs()` method will eval and expand as needed)
6841
7542
  self._evaluate_column_exprs(validation_info=self.validation_info)
6842
7543
 
7544
+ # Expand `validation_info` by evaluating for any segmentation directives
7545
+ # provided in `segments=` (the `_evaluate_segments()` method will eval and expand as needed)
7546
+ self._evaluate_segments(validation_info=self.validation_info)
7547
+
6843
7548
  for validation in self.validation_info:
6844
7549
  # Set the `i` value for the validation step (this is 1-indexed)
6845
7550
  index_value = self.validation_info.index(validation) + 1
@@ -6875,6 +7580,10 @@ class Validate:
6875
7580
 
6876
7581
  validation.autobrief = autobrief
6877
7582
 
7583
+ # ------------------------------------------------
7584
+ # Bypassing the validation step if conditions met
7585
+ # ------------------------------------------------
7586
+
6878
7587
  # Skip the validation step if it is not active but still record the time of processing
6879
7588
  if not validation.active:
6880
7589
  end_time = datetime.datetime.now(datetime.timezone.utc)
@@ -6931,6 +7640,17 @@ class Validate:
6931
7640
  elif isinstance(validation.pre, Callable):
6932
7641
  data_tbl_step = validation.pre(data_tbl_step)
6933
7642
 
7643
+ # ------------------------------------------------
7644
+ # Segmentation stage
7645
+ # ------------------------------------------------
7646
+
7647
+ # Determine whether any segmentation directives are to be applied to the table
7648
+
7649
+ if validation.segments is not None:
7650
+ data_tbl_step = _apply_segments(
7651
+ data_tbl=data_tbl_step, segments_expr=validation.segments
7652
+ )
7653
+
6934
7654
  validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
6935
7655
  tbl_type=tbl_type
6936
7656
  )
@@ -8832,6 +9552,13 @@ class Validate:
8832
9552
  # will be made blank if the validation has not been performed
8833
9553
  interrogation_performed = validation_info_dict.get("proc_duration_s", [None])[0] is not None
8834
9554
 
9555
+ # Determine which steps are those using segmented data
9556
+ segmented_steps = [
9557
+ i + 1
9558
+ for i, segment in enumerate(validation_info_dict["segments"])
9559
+ if segment is not None
9560
+ ]
9561
+
8835
9562
  # ------------------------------------------------
8836
9563
  # Process the `type_upd` entry
8837
9564
  # ------------------------------------------------
@@ -8841,6 +9568,7 @@ class Validate:
8841
9568
  assertion_str=validation_info_dict["assertion_type"],
8842
9569
  brief_str=validation_info_dict["brief"],
8843
9570
  autobrief_str=validation_info_dict["autobrief"],
9571
+ segmentation_str=validation_info_dict["segments"],
8844
9572
  lang=lang,
8845
9573
  )
8846
9574
 
@@ -8972,11 +9700,14 @@ class Validate:
8972
9700
  # Add the `tbl` entry
8973
9701
  # ------------------------------------------------
8974
9702
 
8975
- # Depending on if there was some preprocessing done, get the appropriate icon
8976
- # for the table processing status to be displayed in the report under the `tbl` column
9703
+ # Depending on if there was some preprocessing done, get the appropriate icon for
9704
+ # the table processing status to be displayed in the report under the `tbl` column
9705
+ # TODO: add the icon for the segmented data option when the step is segmented
8977
9706
 
8978
9707
  validation_info_dict["tbl"] = _transform_tbl_preprocessed(
8979
- pre=validation_info_dict["pre"], interrogation_performed=interrogation_performed
9708
+ pre=validation_info_dict["pre"],
9709
+ seg=validation_info_dict["segments"],
9710
+ interrogation_performed=interrogation_performed,
8980
9711
  )
8981
9712
 
8982
9713
  # ------------------------------------------------
@@ -9011,8 +9742,9 @@ class Validate:
9011
9742
  # Process `pass` and `fail` entries
9012
9743
  # ------------------------------------------------
9013
9744
 
9014
- # Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries (the length
9015
- # of the `pass` entry should be equal to the length of the `n_passed` and `n_failed` entries)
9745
+ # Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
9746
+ # (the length of the `pass` entry should be equal to the length of the
9747
+ # `n_passed` and `n_failed` entries)
9016
9748
 
9017
9749
  validation_info_dict["pass"] = _transform_passed_failed(
9018
9750
  n_passed_failed=validation_info_dict["n_passed"],
@@ -9165,6 +9897,9 @@ class Validate:
9165
9897
  # Remove the `pre` entry from the dictionary
9166
9898
  validation_info_dict.pop("pre")
9167
9899
 
9900
+ # Remove the `segments` entry from the dictionary
9901
+ validation_info_dict.pop("segments")
9902
+
9168
9903
  # Remove the `proc_duration_s` entry from the dictionary
9169
9904
  validation_info_dict.pop("proc_duration_s")
9170
9905
 
@@ -9247,6 +9982,10 @@ class Validate:
9247
9982
  columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"]
9248
9983
  ),
9249
9984
  )
9985
+ .tab_style(
9986
+ style=style.css("overflow-x: visible; white-space: nowrap;"),
9987
+ locations=loc.body(columns="type_upd", rows=segmented_steps),
9988
+ )
9250
9989
  .tab_style(
9251
9990
  style=style.fill(color="#FCFCFC" if interrogation_performed else "white"),
9252
9991
  locations=loc.body(columns=["w_upd", "e_upd", "c_upd"]),
@@ -9421,8 +10160,8 @@ class Validate:
9421
10160
  table object, which can be displayed in a notebook or exported to an HTML file.
9422
10161
 
9423
10162
  :::{.callout-warning}
9424
- The `get_step_report()` is still experimental. Please report any issues you encounter in the
9425
- [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
10163
+ The `get_step_report()` method is still experimental. Please report any issues you encounter
10164
+ in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
9426
10165
  :::
9427
10166
 
9428
10167
  Parameters
@@ -9455,6 +10194,35 @@ class Validate:
9455
10194
  GT
9456
10195
  A GT table object that represents the detailed report for the validation step.
9457
10196
 
10197
+ Types of Step Reports
10198
+ ---------------------
10199
+ The `get_step_report()` method produces a report based on the *type* of validation step.
10200
+ The following row-based validation methods will produce a report that shows the rows of the
10201
+ data that failed because of failing test units within one or more columns:
10202
+
10203
+ - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
10204
+ - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
10205
+ - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
10206
+ - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
10207
+ - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
10208
+ - [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
10209
+ - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
10210
+ - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
10211
+ - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
10212
+ - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
10213
+ - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
10214
+ - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
10215
+ - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
10216
+ - [`conjointly()`](`pointblank.Validate.conjointly`)
10217
+
10218
+ The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
10219
+ report that shows duplicate rows (or duplicate values in one column or a set of columns as defined
10220
+ in that method's `columns_subset=` parameter).
10221
+
10222
+ The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
10223
+ produce a report that shows the schema of the data table and the schema of the validation
10224
+ step. The report will indicate whether the schemas match or not.
10225
+
9458
10226
  Examples
9459
10227
  --------
9460
10228
  ```{python}
@@ -9480,7 +10248,7 @@ class Validate:
9480
10248
  .col_vals_lt(columns="d", value=3500)
9481
10249
  .col_vals_between(columns="c", left=1, right=8)
9482
10250
  .col_vals_gt(columns="a", value=3)
9483
- .col_vals_regex(columns="b", pattern=r"\d-[a-z]{3}-\d{3}")
10251
+ .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
9484
10252
  .interrogate()
9485
10253
  )
9486
10254
 
@@ -9768,6 +10536,95 @@ class Validate:
9768
10536
 
9769
10537
  return self
9770
10538
 
10539
+ def _evaluate_segments(self, validation_info):
10540
+ """
10541
+ Evaluate any segmentation expressions stored in the `segments` attribute and expand each
10542
+ validation step with such directives into multiple steps. This is done by evaluating the
10543
+ segmentation expression and creating a new validation step for each segment. Errors in
10544
+ evaluation (such as no segments matched) will be caught and recorded in the `eval_error`
10545
+ attribute.
10546
+
10547
+ Parameters
10548
+ ----------
10549
+ validation_info
10550
+ Information about the validation to evaluate and expand.
10551
+ """
10552
+
10553
+ # Create a list to store the expanded validation steps
10554
+ expanded_validation_info = []
10555
+
10556
+ # Iterate over the validation steps
10557
+ for i, validation in enumerate(validation_info):
10558
+ # Get the segments expression
10559
+ segments_expr = validation.segments
10560
+
10561
+ # If the value is None, then skip the evaluation and append the validation step to the
10562
+ # list of expanded validation steps
10563
+ if segments_expr is None:
10564
+ expanded_validation_info.append(validation)
10565
+ continue
10566
+
10567
+ # Evaluate the segments expression
10568
+ try:
10569
+ # Get the table for this step; it can either be:
10570
+ # 1. the target table itself
10571
+ # 2. the target table modified by a `pre` attribute
10572
+
10573
+ if validation.pre is None:
10574
+ table = self.data
10575
+ else:
10576
+ table = validation.pre(self.data)
10577
+
10578
+ # If the `segments` expression is a string, that string is taken as a column name
10579
+ # for which segmentation should occur across unique values in the column
10580
+ if isinstance(segments_expr, str):
10581
+ seg_tuples = _seg_expr_from_string(data_tbl=table, segments_expr=segments_expr)
10582
+
10583
+ # If the 'segments' expression is a tuple, then normalize it to a list of tuples
10584
+ # - ("col", "value") -> [("col", "value")]
10585
+ # - ("col", ["value1", "value2"]) -> [("col", "value1"), ("col", "value2")]
10586
+ elif isinstance(segments_expr, tuple):
10587
+ seg_tuples = _seg_expr_from_tuple(segments_expr=segments_expr)
10588
+
10589
+ # If the 'segments' expression is a list of strings or tuples (can be mixed) then
10590
+ # normalize it to a list of tuples following the rules above
10591
+ elif isinstance(segments_expr, list):
10592
+ seg_tuples = []
10593
+ for seg in segments_expr:
10594
+ if isinstance(seg, str):
10595
+ # Use the utility function for string items
10596
+ str_seg_tuples = _seg_expr_from_string(
10597
+ data_tbl=table, segments_expr=seg
10598
+ )
10599
+ seg_tuples.extend(str_seg_tuples)
10600
+ elif isinstance(seg, tuple):
10601
+ # Use the utility function for tuple items
10602
+ tuple_seg_tuples = _seg_expr_from_tuple(segments_expr=seg)
10603
+ seg_tuples.extend(tuple_seg_tuples)
10604
+ else: # pragma: no cover
10605
+ # Handle invalid segment type
10606
+ raise ValueError(
10607
+ f"Invalid segment expression item type: {type(seg)}. "
10608
+ "Must be either string or tuple."
10609
+ )
10610
+
10611
+ except Exception: # pragma: no cover
10612
+ validation.eval_error = True
10613
+
10614
+ # For each segmentation resolved, create a new validation step and add it to the list of
10615
+ # expanded validation steps
10616
+ for seg in seg_tuples:
10617
+ new_validation = copy.deepcopy(validation)
10618
+
10619
+ new_validation.segments = seg
10620
+
10621
+ expanded_validation_info.append(new_validation)
10622
+
10623
+ # Replace the `validation_info` attribute with the expanded version
10624
+ self.validation_info = expanded_validation_info
10625
+
10626
+ return self
10627
+
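Conceptually, the expansion performed here turns one declared step into one copy per resolved segment pair. A rough sketch of that idea follows, using a plain dataclass stand-in rather than the package's internal validation-info objects (the names are assumptions for illustration only):

```python
import copy
from dataclasses import dataclass


@dataclass
class Step:
    assertion: str
    segments: object = None


declared = Step(assertion="col_vals_gt", segments="region")
resolved_pairs = [("region", "East"), ("region", "North"), ("region", "South")]

expanded = []
for pair in resolved_pairs:
    new_step = copy.deepcopy(declared)
    new_step.segments = pair  # each copy now targets a single (column, value) segment
    expanded.append(new_step)

print(len(expanded))  # 3 steps, numbered sequentially at interrogation time
```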
9771
10628
  def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]:
9772
10629
  """
9773
10630
  Utility function to get a dictionary of validation attributes for each validation step.
@@ -10485,6 +11342,143 @@ def _prep_values_text(
10485
11342
  return values_str
10486
11343
 
10487
11344
 
11345
+ def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> list[tuple[str, str]]:
11346
+ """
11347
+ Obtain the segmentation categories from a table column.
11348
+
11349
+ The `segments_expr` value will have been checked to be a string, so there's no need to check for
11350
+ that here. The function will return a list of tuples representing pairings of a column name and
11351
+ a value. The task is to obtain the unique values in the column (handling different table types)
11352
+ and produce a normalized list of tuples of the form: `(column, value)`.
11353
+
11354
+ This function is used to create a list of segments for the validation step. And since there will
11355
+ usually be more than one segment, the validation step will be expanded into multiple steps during
11356
+ interrogation (where this function is called).
11357
+
11358
+ Parameters
11359
+ ----------
11360
+ data_tbl
11361
+ The table from which to obtain the segmentation categories.
11362
+ segments_expr
11363
+ The column name for which segmentation should occur across unique values in the column.
11364
+
11365
+ Returns
11366
+ -------
11367
+ list[tuple[str, str]]
11368
+ A list of tuples representing pairings of a column name and a value in the column.
11369
+ """
11370
+ # Determine if the table is a DataFrame or a DB table
11371
+ tbl_type = _get_tbl_type(data=data_tbl)
11372
+
11373
+ # Obtain the segmentation categories from the table column given as `segments_expr`
11374
+ if tbl_type == "polars":
11375
+ seg_categories = data_tbl[segments_expr].unique().to_list()
11376
+ elif tbl_type == "pandas":
11377
+ seg_categories = data_tbl[segments_expr].unique().tolist()
11378
+ elif tbl_type in IBIS_BACKENDS:
11379
+ distinct_col_vals = data_tbl.select(segments_expr).distinct()
11380
+ seg_categories = distinct_col_vals[segments_expr].to_list()
11381
+ else: # pragma: no cover
11382
+ raise ValueError(f"Unsupported table type: {tbl_type}")
11383
+
11384
+ # Ensure that the categories are sorted
11385
+ seg_categories.sort()
11386
+
11387
+ # Place each category and each value in a list of tuples as: `(column, value)`
11388
+ seg_tuples = [(segments_expr, category) for category in seg_categories]
11389
+
11390
+ return seg_tuples
11391
+
11392
+
11393
+ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
11394
+ """
11395
+ Normalize the segments expression to a list of tuples, given a single tuple.
11396
+
11397
+ The `segments_expr` value will have been checked to be a tuple, so there's no need to check for
11398
+ that here. The function will return a list of tuples representing pairings of a column name and
11399
+ a value. The task is to normalize the tuple into a list of tuples of the form:
11400
+ `(column, value)`.
11401
+
11402
+ The following examples show how this normalization works:
11403
+ - `("col", "value")` -> `[("col", "value")]` (single tuple, upgraded to a list of tuples)
11404
+ - `("col", ["value1", "value2"])` -> `[("col", "value1"), ("col", "value2")]` (tuple with a list
11405
+ of values, expanded into multiple tuples within a list)
11406
+
11407
+ This function is used to create a list of segments for the validation step. And since there will
11408
+ usually be more than one segment, the validation step will be expanded into multiple steps during
11409
+ interrogation (where this function is called).
11410
+
11411
+ Parameters
11412
+ ----------
11413
+ segments_expr
11414
+ The segments expression to normalize. It can be a tuple of the form
11415
+ `(column, value)` or `(column, [value1, value2])`.
11416
+
11417
+ Returns
11418
+ -------
11419
+ list[tuple[str, str]]
11420
+ A list of tuples representing pairings of a column name and a value in the column.
11421
+ """
11422
+ # Check if the first element is a string
11423
+ if isinstance(segments_expr[0], str):
11424
+ # If the second element is a list, create a list of tuples
11425
+ if isinstance(segments_expr[1], list):
11426
+ seg_tuples = [(segments_expr[0], value) for value in segments_expr[1]]
11427
+ # If the second element is not a list, create a single tuple
11428
+ else:
11429
+ seg_tuples = [(segments_expr[0], segments_expr[1])]
11430
+ # If the first element is not a string, raise an error
11431
+ else: # pragma: no cover
11432
+ raise ValueError("The first element of the segments expression must be a string.")
11433
+
11434
+ return seg_tuples
11435
+
11436
+
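To make the normalization rule above concrete, here is a rough standalone sketch of the `(column, value)` expansion described in the docstring (an assumed equivalent, not the package's own helper):

```python
def normalize_segment_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
    """Expand a (column, value) or (column, [values]) tuple into (column, value) pairs."""
    column, values = segments_expr
    if isinstance(values, list):
        # ("col", ["v1", "v2"]) -> [("col", "v1"), ("col", "v2")]
        return [(column, value) for value in values]
    # ("col", "value") -> [("col", "value")]
    return [(column, values)]


assert normalize_segment_tuple(("date", "2023-01-01")) == [("date", "2023-01-01")]
assert normalize_segment_tuple(("date", ["2023-01-01", "2023-01-02"])) == [
    ("date", "2023-01-01"),
    ("date", "2023-01-02"),
]
```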
11437
+ def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any:
11438
+ """
11439
+ Apply the segments expression to the data table.
11440
+
11441
+ Filter the data table based on the `segments_expr=` value, where the first element is the
11442
+ column name and the second element is the value to filter by.
11443
+
11444
+ Parameters
11445
+ ----------
11446
+ data_tbl
11447
+ The data table to filter. It can be a Pandas DataFrame, Polars DataFrame, or an Ibis
11448
+ backend table.
11449
+ segments_expr
11450
+ The segments expression to apply. It is a tuple of the form `(column, value)`.
11451
+
11452
+ Returns
11453
+ -------
11454
+ any
11455
+ The filtered data table. It will be of the same type as the input table.
11456
+ """
11457
+ # Get the table type
11458
+ tbl_type = _get_tbl_type(data=data_tbl)
11459
+
11460
+ if tbl_type in ["pandas", "polars"]:
11461
+ # If the table is a Pandas or Polars DataFrame, transform it to a Narwhals table
11462
+ # and perform the filtering operation
11463
+
11464
+ # Transform to Narwhals table if a DataFrame
11465
+ data_tbl_nw = nw.from_native(data_tbl)
11466
+
11467
+ # Filter the data table based on the column name and value
11468
+ data_tbl_nw = data_tbl_nw.filter(nw.col(segments_expr[0]) == segments_expr[1])
11469
+
11470
+ # Transform back to the original table type
11471
+ data_tbl = data_tbl_nw.to_native()
11472
+
11473
+ elif tbl_type in IBIS_BACKENDS:
11474
+ # If the table is an Ibis backend table, perform the filtering operation directly
11475
+
11476
+ # Filter the data table based on the column name and value
11477
+ data_tbl = data_tbl[data_tbl[segments_expr[0]] == segments_expr[1]]
11478
+
11479
+ return data_tbl
11480
+
11481
+
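The Narwhals round-trip used above can be illustrated with a tiny, hypothetical Polars table; the same filter expression would work unchanged for a Pandas DataFrame:

```python
import narwhals as nw
import polars as pl

# A toy table and one resolved segment pair, e.g. ("region", "North")
tbl = pl.DataFrame({"region": ["North", "South", "North"], "x": [1, 2, 3]})
segment = ("region", "North")

# Wrap in Narwhals, filter on the segment's column/value, then unwrap
tbl_nw = nw.from_native(tbl)
tbl_segment = tbl_nw.filter(nw.col(segment[0]) == segment[1]).to_native()

print(tbl_segment)  # only the two "North" rows remain
```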
10488
11482
  def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
10489
11483
  """
10490
11484
  Convert a `_ValidationInfo` object to a dictionary.
@@ -10509,6 +11503,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
10509
11503
  "inclusive",
10510
11504
  "na_pass",
10511
11505
  "pre",
11506
+ "segments",
10512
11507
  "label",
10513
11508
  "brief",
10514
11509
  "autobrief",
@@ -10623,7 +11618,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
10623
11618
  return title_text
10624
11619
 
10625
11620
 
10626
- def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list[str]:
11621
+ def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
10627
11622
  # If no interrogation was performed, return a list of empty strings
10628
11623
  if not interrogation_performed:
10629
11624
  return ["" for _ in range(len(pre))]
@@ -10632,11 +11627,13 @@ def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list
10632
11627
  # (either 'unchanged' (None) or 'modified' (not None))
10633
11628
  status_list = []
10634
11629
 
10635
- for status in pre:
10636
- if status is None:
10637
- status_list.append("unchanged")
10638
- else:
11630
+ for i in range(len(pre)):
11631
+ if seg[i] is not None:
11632
+ status_list.append("segmented")
11633
+ elif pre[i] is not None:
10639
11634
  status_list.append("modified")
11635
+ else:
11636
+ status_list.append("unchanged")
10640
11637
 
10641
11638
  return _get_preprocessed_table_icon(icon=status_list)
10642
11639
 
@@ -10744,7 +11741,11 @@ def _transform_w_e_c(values, color, interrogation_performed):
10744
11741
 
10745
11742
 
10746
11743
  def _transform_assertion_str(
10747
- assertion_str: list[str], brief_str: list[str | None], autobrief_str: list[str], lang: str
11744
+ assertion_str: list[str],
11745
+ brief_str: list[str | None],
11746
+ autobrief_str: list[str],
11747
+ segmentation_str: list[tuple | None],
11748
+ lang: str,
10748
11749
  ) -> list[str]:
10749
11750
  # Get the SVG icons for the assertion types
10750
11751
  svg_icon = _get_assertion_icon(icon=assertion_str)
@@ -10805,6 +11806,26 @@ def _transform_assertion_str(
10805
11806
  for assertion, svg, size, brief_div in zip(assertion_str, svg_icon, text_size, brief_divs)
10806
11807
  ]
10807
11808
 
11809
+ # If the `segments` list is not empty, prepend a segmentation div to the `type_upd` strings
11810
+ if segmentation_str:
11811
+ for i in range(len(type_upd)):
11812
+ if segmentation_str[i] is not None:
11813
+ # Get the column name and value from the segmentation expression
11814
+ column_name = segmentation_str[i][0]
11815
+ column_value = segmentation_str[i][1]
11816
+ # Create the segmentation div
11817
+ segmentation_div = (
11818
+ "<div style='margin-top: 0px; margin-bottom: 0px; "
11819
+ "white-space: pre; font-size: 8px; color: darkblue; padding-bottom: 4px; "
11820
+ "'>"
11821
+ "<strong><span style='font-family: Helvetica, arial, sans-serif;'>"
11822
+ f"SEGMENT&nbsp;&nbsp;</span></strong><span>{column_name} / {column_value}"
11823
+ "</span>"
11824
+ "</div>"
11825
+ )
11826
+ # Prepend the segmentation div to the type_upd string
11827
+ type_upd[i] = f"{segmentation_div} {type_upd[i]}"
11828
+
10808
11829
  return type_upd
10809
11830
 
10810
11831