pointblank-0.8.7-py3-none-any.whl → pointblank-0.9.0-py3-none-any.whl

This diff compares the content of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
pointblank/validate.py CHANGED
@@ -7,6 +7,7 @@ import datetime
7
7
  import inspect
8
8
  import json
9
9
  import re
10
+ import tempfile
10
11
  import threading
11
12
  from dataclasses import dataclass
12
13
  from importlib.metadata import version
@@ -57,6 +58,7 @@ from pointblank._interrogation import (
57
58
  RowCountMatch,
58
59
  RowsDistinct,
59
60
  )
61
+ from pointblank._typing import SegmentSpec
60
62
  from pointblank._utils import (
61
63
  _check_any_df_lib,
62
64
  _check_invalid_fields,
@@ -119,16 +121,18 @@ def _action_context_manager(metadata):
119
121
  delattr(_action_context, "metadata")
120
122
 
121
123
 
122
- def get_action_metadata():
124
+ def get_action_metadata() -> dict | None:
123
125
  """Access step-level metadata when authoring custom actions.
124
126
 
125
127
  Get the metadata for the validation step where an action was triggered. This can be called by
126
- user functions to get the metadata for the current action.
128
+ user functions to get the metadata for the current action. This function can only be used within
129
+ callables crafted for the [`Actions`](`pointblank.Actions`) class.
127
130
 
128
131
  Returns
129
132
  -------
130
- dict
131
- A dictionary containing the metadata for the current step.
133
+ dict | None
134
+ A dictionary containing the metadata for the current step. If called outside of an action
135
+ (i.e., when no action is being executed), this function will return `None`.
132
136
 
133
137
  Description of the Metadata Fields
134
138
  ----------------------------------
@@ -163,7 +167,7 @@ def get_action_metadata():
163
167
  thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
164
168
  actions=pb.Actions(warning=log_issue),
165
169
  )
166
- .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
170
+ .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
167
171
  .col_vals_gt(columns="item_revenue", value=0.05)
168
172
  .col_vals_gt(
169
173
  columns="session_duration",
@@ -181,6 +185,11 @@ def get_action_metadata():
181
185
  - the `metadata` is a dictionary that is used to craft the log message
182
186
  - the action is passed as a bare function to the `Actions` object within the `Validate` object
183
187
  (placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
188
+
189
+ See Also
190
+ --------
191
+ Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
192
+ actions for validation steps that exceed a set threshold value.
184
193
  """
185
194
  if hasattr(_action_context, "metadata"): # pragma: no cover
186
195
  return _action_context.metadata # pragma: no cover
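
For context beyond this hunk, here is a minimal sketch of where `get_action_metadata()` is meant to be called: inside a callable registered with `Actions`, mirroring the `log_issue` example referenced in the docstring above. The `game_revenue` dataset and the choice to simply print the metadata dict are illustrative assumptions; the available keys are those listed under *Description of the Metadata Fields*.

```python
import pointblank as pb

def log_issue() -> None:
    # Inside an action callable this returns the step metadata dict;
    # called anywhere else it returns None (per the updated return docs).
    metadata = pb.get_action_metadata()
    if metadata is not None:
        print(f"Step triggered a threshold action: {metadata}")

validation = (
    pb.Validate(
        data=pb.load_dataset(dataset="game_revenue"),
        thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
        actions=pb.Actions(warning=log_issue),
    )
    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
    .col_vals_gt(columns="item_revenue", value=0.05)
    .interrogate()
)
```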
@@ -204,17 +213,19 @@ def _final_action_context_manager(summary):
204
213
  delattr(_final_action_context, "summary")
205
214
 
206
215
 
207
- def get_validation_summary():
216
+ def get_validation_summary() -> dict | None:
208
217
  """Access validation summary information when authoring final actions.
209
218
 
210
219
  This function provides a convenient way to access summary information about the validation
211
220
  process within a final action. It returns a dictionary with key metrics from the validation
212
- process.
221
+ process. This function can only be used within callables crafted for the
222
+ [`FinalActions`](`pointblank.FinalActions`) class.
213
223
 
214
224
  Returns
215
225
  -------
216
226
  dict | None
217
- A dictionary containing validation metrics, or None if called outside a final action.
227
+ A dictionary containing validation metrics. If called outside of a final action context,
228
+ this function will return `None`.
218
229
 
219
230
  Description of the Summary Fields
220
231
  --------------------------------
@@ -304,6 +315,11 @@ def get_validation_summary():
304
315
 
305
316
  Final actions work well with both simple logging and more complex notification systems, allowing
306
317
  you to integrate validation results into your broader data quality workflows.
318
+
319
+ See Also
320
+ --------
321
+ Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
322
+ custom actions that are executed after all validation steps have been completed.
307
323
  """
308
324
  if hasattr(_final_action_context, "summary"):
309
325
  return _final_action_context.summary
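
Similarly, a hedged sketch of a final action that reads the run summary via `get_validation_summary()`. Only the `FinalActions` class and the dict-or-`None` return contract come from the docstring above; the `final_actions=` parameter name on `Validate` and the `send_report` helper are assumptions for illustration.

```python
import pointblank as pb

def send_report() -> None:
    # Returns the summary dict when called inside a final action; None otherwise.
    summary = pb.get_validation_summary()
    if summary is not None:
        print(f"Validation finished: {summary}")

validation = (
    pb.Validate(
        data=pb.load_dataset(dataset="game_revenue"),
        final_actions=pb.FinalActions(send_report),  # parameter name assumed
    )
    .col_vals_gt(columns="item_revenue", value=0.05)
    .interrogate()
)
```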
@@ -516,10 +532,10 @@ def load_dataset(
516
532
  data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
517
533
 
518
534
  # Unzip the DuckDB dataset to a temporary directory
519
- with ZipFile(data_path, "r") as z:
520
- z.extractall(path="datasets")
535
+ with tempfile.TemporaryDirectory() as tmp, ZipFile(data_path, "r") as z:
536
+ z.extractall(path=tmp)
521
537
 
522
- data_path = f"datasets/{dataset}.ddb"
538
+ data_path = f"{tmp}/{dataset}.ddb"
523
539
 
524
540
  dataset = ibis.connect(f"duckdb://{data_path}").table(dataset)
525
541
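
The change above replaces the hard-coded `datasets/` directory with a `tempfile.TemporaryDirectory`. A stdlib-only sketch of the same pattern, using a hypothetical `read_member()` helper, to show that the extracted file must be consumed before the `with` block exits and the directory is cleaned up:

```python
import tempfile
import zipfile

def read_member(zip_path: str, member: str) -> bytes:
    # Extract into a throwaway directory and read the member while it still
    # exists; the directory and its contents are deleted when the block exits.
    with tempfile.TemporaryDirectory() as tmp, zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(path=tmp)
        with open(f"{tmp}/{member}", "rb") as f:
            return f.read()
```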
 
@@ -1783,14 +1799,15 @@ class _ValidationInfo:
1783
1799
  assertion_type
1784
1800
  The type of assertion. This is the method name of the validation (e.g., `"col_vals_gt"`).
1785
1801
  column
1786
- The column to validate. Currently we don't allow for column expressions (which may map to
1787
- multiple columns).
1802
+ The column(s) to validate.
1788
1803
  values
1789
1804
  The value or values to compare against.
1790
1805
  na_pass
1791
1806
  Whether to pass test units that hold missing values.
1792
1807
  pre
1793
1808
  A preprocessing function or lambda to apply to the data table for the validation step.
1809
+ segments
1810
+ The segments to use for the validation step.
1794
1811
  thresholds
1795
1812
  The threshold values for the validation.
1796
1813
  actions
@@ -1841,11 +1858,12 @@ class _ValidationInfo:
1841
1858
  step_id: str | None = None
1842
1859
  sha1: str | None = None
1843
1860
  assertion_type: str | None = None
1844
- column: str | None = None
1861
+ column: any | None = None
1845
1862
  values: any | list[any] | tuple | None = None
1846
1863
  inclusive: tuple[bool, bool] | None = None
1847
1864
  na_pass: bool | None = None
1848
1865
  pre: Callable | None = None
1866
+ segments: any | None = None
1849
1867
  thresholds: Thresholds | None = None
1850
1868
  actions: Actions | None = None
1851
1869
  label: str | None = None
@@ -1909,7 +1927,7 @@ class Validate:
1909
1927
  The table to validate, which could be a DataFrame object or an Ibis table object. Read the
1910
1928
  *Supported Input Table Types* section for details on the supported table types.
1911
1929
  tbl_name
1912
- A optional name to assign to the input table object. If no value is provided, a name will
1930
+ An optional name to assign to the input table object. If no value is provided, a name will
1913
1931
  be generated based on whatever information is available. This table name will be displayed
1914
1932
  in the header area of the tabular report.
1915
1933
  label
@@ -2323,6 +2341,7 @@ class Validate:
2323
2341
  value: float | int | Column,
2324
2342
  na_pass: bool = False,
2325
2343
  pre: Callable | None = None,
2344
+ segments: SegmentSpec | None = None,
2326
2345
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2327
2346
  actions: Actions | None = None,
2328
2347
  brief: str | bool | None = None,
@@ -2354,10 +2373,15 @@ class Validate:
2354
2373
  Should any encountered None, NA, or Null values be considered as passing test units? By
2355
2374
  default, this is `False`. Set to `True` to pass test units with missing values.
2356
2375
  pre
2357
- A optional preprocessing function or lambda to apply to the data table during
2376
+ An optional preprocessing function or lambda to apply to the data table during
2358
2377
  interrogation. This function should take a table as input and return a modified table.
2359
2378
  Have a look at the *Preprocessing* section for more information on how to use this
2360
2379
  argument.
2380
+ segments
2381
+ An optional directive on segmentation, which serves to split a validation step into
2382
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2383
+ column name and its corresponding values to segment on, or a combination of both
2384
+ (provided as a list). Read the *Segmentation* section for usage information.
2361
2385
  thresholds
2362
2386
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2363
2387
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2420,6 +2444,42 @@ class Validate:
2420
2444
  lifetime of the transformed table, it only exists during the validation step and is not
2421
2445
  stored in the `Validate` object or used in subsequent validation steps.
2422
2446
 
2447
+ Segmentation
2448
+ ------------
2449
+ The `segments=` argument allows for the segmentation of a validation step into multiple
2450
+ segments. This is useful for applying the same validation step to different subsets of the
2451
+ data. The segmentation can be done based on a single column or specific fields within a
2452
+ column.
2453
+
2454
+ Providing a single column name will result in a separate validation step for each unique
2455
+ value in that column. For example, if you have a column called `"region"` with values
2456
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
2457
+ region.
2458
+
2459
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
2460
+ values to segment on. For example, if you have a column called `"date"` and you want to
2461
+ segment on only specific dates, you can provide a tuple like
2462
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
2463
+ (i.e., no validation steps will be created for them).
2464
+
2465
+ A list with a combination of column names and tuples can be provided as well. This allows
2466
+ for more complex segmentation scenarios. The following inputs are all valid:
2467
+
2468
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2469
+ in the `"region"` column and specific dates in the `"date"` column
2470
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2471
+ columns
2472
+
2473
+ The segmentation is performed during interrogation, and the resulting validation steps will
2474
+ be numbered sequentially. Each segment will have its own validation step, and the results
2475
+ will be reported separately. This allows for a more granular analysis of the data and helps
2476
+ identify issues within specific segments.
2477
+
2478
+ Importantly, the segmentation process will be performed after any preprocessing of the data
2479
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
2480
+ that can be used for segmentation. For example, you could create a new column called
2481
+ `"segment"` through use of `pre=` and then use that column for segmentation.
2482
+
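
A hypothetical sketch of the `segments=` forms described above, applied to the comparison method in this hunk. The `game_revenue` column names (`acquisition`, `item_type`) and the segment values are assumptions chosen only to illustrate the single-column, tuple, and combined-list forms.

```python
import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset(dataset="game_revenue"))
    # one validation step per unique value in the "acquisition" column
    .col_vals_gt(columns="item_revenue", value=0, segments="acquisition")
    # one step per listed value; other "item_type" values are disregarded
    .col_vals_gt(
        columns="session_duration",
        value=5,
        segments=("item_type", ["iap", "ad"]),
    )
    # combined form: a column name plus a (column, values) tuple in one list
    .col_vals_gt(
        columns="item_revenue",
        value=0,
        segments=["acquisition", ("item_type", ["iap"])],
    )
    .interrogate()
)
```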
2423
2483
  Thresholds
2424
2484
  ----------
2425
2485
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2518,6 +2578,8 @@ class Validate:
2518
2578
  _check_column(column=columns)
2519
2579
  # _check_value_float_int(value=value)
2520
2580
  _check_pre(pre=pre)
2581
+ # TODO: add check for segments
2582
+ # _check_segments(segments=segments)
2521
2583
  _check_thresholds(thresholds=thresholds)
2522
2584
  _check_boolean_input(param=na_pass, param_name="na_pass")
2523
2585
  _check_boolean_input(param=active, param_name="active")
@@ -2550,6 +2612,7 @@ class Validate:
2550
2612
  values=value,
2551
2613
  na_pass=na_pass,
2552
2614
  pre=pre,
2615
+ segments=segments,
2553
2616
  thresholds=thresholds,
2554
2617
  actions=actions,
2555
2618
  brief=brief,
@@ -2566,6 +2629,7 @@ class Validate:
2566
2629
  value: float | int | Column,
2567
2630
  na_pass: bool = False,
2568
2631
  pre: Callable | None = None,
2632
+ segments: SegmentSpec | None = None,
2569
2633
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2570
2634
  actions: Actions | None = None,
2571
2635
  brief: str | bool | None = None,
@@ -2597,10 +2661,15 @@ class Validate:
2597
2661
  Should any encountered None, NA, or Null values be considered as passing test units? By
2598
2662
  default, this is `False`. Set to `True` to pass test units with missing values.
2599
2663
  pre
2600
- A optional preprocessing function or lambda to apply to the data table during
2664
+ An optional preprocessing function or lambda to apply to the data table during
2601
2665
  interrogation. This function should take a table as input and return a modified table.
2602
2666
  Have a look at the *Preprocessing* section for more information on how to use this
2603
2667
  argument.
2668
+ segments
2669
+ An optional directive on segmentation, which serves to split a validation step into
2670
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2671
+ column name and its corresponding values to segment on, or a combination of both
2672
+ (provided as a list). Read the *Segmentation* section for usage information.
2604
2673
  thresholds
2605
2674
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2606
2675
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2663,6 +2732,42 @@ class Validate:
2663
2732
  lifetime of the transformed table, it only exists during the validation step and is not
2664
2733
  stored in the `Validate` object or used in subsequent validation steps.
2665
2734
 
2735
+ Segmentation
2736
+ ------------
2737
+ The `segments=` argument allows for the segmentation of a validation step into multiple
2738
+ segments. This is useful for applying the same validation step to different subsets of the
2739
+ data. The segmentation can be done based on a single column or specific fields within a
2740
+ column.
2741
+
2742
+ Providing a single column name will result in a separate validation step for each unique
2743
+ value in that column. For example, if you have a column called `"region"` with values
2744
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
2745
+ region.
2746
+
2747
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
2748
+ values to segment on. For example, if you have a column called `"date"` and you want to
2749
+ segment on only specific dates, you can provide a tuple like
2750
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
2751
+ (i.e., no validation steps will be created for them).
2752
+
2753
+ A list with a combination of column names and tuples can be provided as well. This allows
2754
+ for more complex segmentation scenarios. The following inputs are all valid:
2755
+
2756
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2757
+ in the `"region"` column and specific dates in the `"date"` column
2758
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2759
+ columns
2760
+
2761
+ The segmentation is performed during interrogation, and the resulting validation steps will
2762
+ be numbered sequentially. Each segment will have its own validation step, and the results
2763
+ will be reported separately. This allows for a more granular analysis of the data and helps
2764
+ identify issues within specific segments.
2765
+
2766
+ Importantly, the segmentation process will be performed after any preprocessing of the data
2767
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
2768
+ that can be used for segmentation. For example, you could create a new column called
2769
+ `"segment"` through use of `pre=` and then use that column for segmentation.
2770
+
2666
2771
  Thresholds
2667
2772
  ----------
2668
2773
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2760,6 +2865,8 @@ class Validate:
2760
2865
  _check_column(column=columns)
2761
2866
  # _check_value_float_int(value=value)
2762
2867
  _check_pre(pre=pre)
2868
+ # TODO: add check for segments
2869
+ # _check_segments(segments=segments)
2763
2870
  _check_thresholds(thresholds=thresholds)
2764
2871
  _check_boolean_input(param=na_pass, param_name="na_pass")
2765
2872
  _check_boolean_input(param=active, param_name="active")
@@ -2792,6 +2899,7 @@ class Validate:
2792
2899
  values=value,
2793
2900
  na_pass=na_pass,
2794
2901
  pre=pre,
2902
+ segments=segments,
2795
2903
  thresholds=thresholds,
2796
2904
  actions=actions,
2797
2905
  brief=brief,
@@ -2808,6 +2916,7 @@ class Validate:
2808
2916
  value: float | int | Column,
2809
2917
  na_pass: bool = False,
2810
2918
  pre: Callable | None = None,
2919
+ segments: SegmentSpec | None = None,
2811
2920
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2812
2921
  actions: Actions | None = None,
2813
2922
  brief: str | bool | None = None,
@@ -2839,10 +2948,15 @@ class Validate:
2839
2948
  Should any encountered None, NA, or Null values be considered as passing test units? By
2840
2949
  default, this is `False`. Set to `True` to pass test units with missing values.
2841
2950
  pre
2842
- A optional preprocessing function or lambda to apply to the data table during
2951
+ An optional preprocessing function or lambda to apply to the data table during
2843
2952
  interrogation. This function should take a table as input and return a modified table.
2844
2953
  Have a look at the *Preprocessing* section for more information on how to use this
2845
2954
  argument.
2955
+ segments
2956
+ An optional directive on segmentation, which serves to split a validation step into
2957
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2958
+ column name and its corresponding values to segment on, or a combination of both
2959
+ (provided as a list). Read the *Segmentation* section for usage information.
2846
2960
  thresholds
2847
2961
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2848
2962
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2905,6 +3019,42 @@ class Validate:
2905
3019
  lifetime of the transformed table, it only exists during the validation step and is not
2906
3020
  stored in the `Validate` object or used in subsequent validation steps.
2907
3021
 
3022
+ Segmentation
3023
+ ------------
3024
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3025
+ segments. This is useful for applying the same validation step to different subsets of the
3026
+ data. The segmentation can be done based on a single column or specific fields within a
3027
+ column.
3028
+
3029
+ Providing a single column name will result in a separate validation step for each unique
3030
+ value in that column. For example, if you have a column called `"region"` with values
3031
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3032
+ region.
3033
+
3034
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3035
+ values to segment on. For example, if you have a column called `"date"` and you want to
3036
+ segment on only specific dates, you can provide a tuple like
3037
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3038
+ (i.e., no validation steps will be created for them).
3039
+
3040
+ A list with a combination of column names and tuples can be provided as well. This allows
3041
+ for more complex segmentation scenarios. The following inputs are all valid:
3042
+
3043
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3044
+ in the `"region"` column and specific dates in the `"date"` column
3045
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3046
+ columns
3047
+
3048
+ The segmentation is performed during interrogation, and the resulting validation steps will
3049
+ be numbered sequentially. Each segment will have its own validation step, and the results
3050
+ will be reported separately. This allows for a more granular analysis of the data and helps
3051
+ identify issues within specific segments.
3052
+
3053
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3054
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3055
+ that can be used for segmentation. For example, you could create a new column called
3056
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3057
+
2908
3058
  Thresholds
2909
3059
  ----------
2910
3060
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3001,6 +3151,8 @@ class Validate:
3001
3151
  _check_column(column=columns)
3002
3152
  # _check_value_float_int(value=value)
3003
3153
  _check_pre(pre=pre)
3154
+ # TODO: add check for segments
3155
+ # _check_segments(segments=segments)
3004
3156
  _check_thresholds(thresholds=thresholds)
3005
3157
  _check_boolean_input(param=na_pass, param_name="na_pass")
3006
3158
  _check_boolean_input(param=active, param_name="active")
@@ -3033,6 +3185,7 @@ class Validate:
3033
3185
  values=value,
3034
3186
  na_pass=na_pass,
3035
3187
  pre=pre,
3188
+ segments=segments,
3036
3189
  thresholds=thresholds,
3037
3190
  actions=actions,
3038
3191
  brief=brief,
@@ -3049,6 +3202,7 @@ class Validate:
3049
3202
  value: float | int | Column,
3050
3203
  na_pass: bool = False,
3051
3204
  pre: Callable | None = None,
3205
+ segments: SegmentSpec | None = None,
3052
3206
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3053
3207
  actions: Actions | None = None,
3054
3208
  brief: str | bool | None = None,
@@ -3080,10 +3234,15 @@ class Validate:
3080
3234
  Should any encountered None, NA, or Null values be considered as passing test units? By
3081
3235
  default, this is `False`. Set to `True` to pass test units with missing values.
3082
3236
  pre
3083
- A optional preprocessing function or lambda to apply to the data table during
3237
+ An optional preprocessing function or lambda to apply to the data table during
3084
3238
  interrogation. This function should take a table as input and return a modified table.
3085
3239
  Have a look at the *Preprocessing* section for more information on how to use this
3086
3240
  argument.
3241
+ segments
3242
+ An optional directive on segmentation, which serves to split a validation step into
3243
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3244
+ column name and its corresponding values to segment on, or a combination of both
3245
+ (provided as a list). Read the *Segmentation* section for usage information.
3087
3246
  thresholds
3088
3247
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3089
3248
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3146,6 +3305,42 @@ class Validate:
3146
3305
  lifetime of the transformed table, it only exists during the validation step and is not
3147
3306
  stored in the `Validate` object or used in subsequent validation steps.
3148
3307
 
3308
+ Segmentation
3309
+ ------------
3310
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3311
+ segments. This is useful for applying the same validation step to different subsets of the
3312
+ data. The segmentation can be done based on a single column or specific fields within a
3313
+ column.
3314
+
3315
+ Providing a single column name will result in a separate validation step for each unique
3316
+ value in that column. For example, if you have a column called `"region"` with values
3317
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3318
+ region.
3319
+
3320
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3321
+ values to segment on. For example, if you have a column called `"date"` and you want to
3322
+ segment on only specific dates, you can provide a tuple like
3323
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3324
+ (i.e., no validation steps will be created for them).
3325
+
3326
+ A list with a combination of column names and tuples can be provided as well. This allows
3327
+ for more complex segmentation scenarios. The following inputs are all valid:
3328
+
3329
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3330
+ in the `"region"` column and specific dates in the `"date"` column
3331
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3332
+ columns
3333
+
3334
+ The segmentation is performed during interrogation, and the resulting validation steps will
3335
+ be numbered sequentially. Each segment will have its own validation step, and the results
3336
+ will be reported separately. This allows for a more granular analysis of the data and helps
3337
+ identify issues within specific segments.
3338
+
3339
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3340
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3341
+ that can be used for segmentation. For example, you could create a new column called
3342
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3343
+
3149
3344
  Thresholds
3150
3345
  ----------
3151
3346
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3240,6 +3435,8 @@ class Validate:
3240
3435
  _check_column(column=columns)
3241
3436
  # _check_value_float_int(value=value)
3242
3437
  _check_pre(pre=pre)
3438
+ # TODO: add check for segments
3439
+ # _check_segments(segments=segments)
3243
3440
  _check_thresholds(thresholds=thresholds)
3244
3441
  _check_boolean_input(param=na_pass, param_name="na_pass")
3245
3442
  _check_boolean_input(param=active, param_name="active")
@@ -3272,6 +3469,7 @@ class Validate:
3272
3469
  values=value,
3273
3470
  na_pass=na_pass,
3274
3471
  pre=pre,
3472
+ segments=segments,
3275
3473
  thresholds=thresholds,
3276
3474
  actions=actions,
3277
3475
  brief=brief,
@@ -3288,6 +3486,7 @@ class Validate:
3288
3486
  value: float | int | Column,
3289
3487
  na_pass: bool = False,
3290
3488
  pre: Callable | None = None,
3489
+ segments: SegmentSpec | None = None,
3291
3490
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3292
3491
  actions: Actions | None = None,
3293
3492
  brief: str | bool | None = None,
@@ -3319,10 +3518,15 @@ class Validate:
3319
3518
  Should any encountered None, NA, or Null values be considered as passing test units? By
3320
3519
  default, this is `False`. Set to `True` to pass test units with missing values.
3321
3520
  pre
3322
- A optional preprocessing function or lambda to apply to the data table during
3521
+ An optional preprocessing function or lambda to apply to the data table during
3323
3522
  interrogation. This function should take a table as input and return a modified table.
3324
3523
  Have a look at the *Preprocessing* section for more information on how to use this
3325
3524
  argument.
3525
+ segments
3526
+ An optional directive on segmentation, which serves to split a validation step into
3527
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3528
+ column name and its corresponding values to segment on, or a combination of both
3529
+ (provided as a list). Read the *Segmentation* section for usage information.
3326
3530
  thresholds
3327
3531
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3328
3532
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3385,6 +3589,42 @@ class Validate:
3385
3589
  lifetime of the transformed table, it only exists during the validation step and is not
3386
3590
  stored in the `Validate` object or used in subsequent validation steps.
3387
3591
 
3592
+ Segmentation
3593
+ ------------
3594
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3595
+ segments. This is useful for applying the same validation step to different subsets of the
3596
+ data. The segmentation can be done based on a single column or specific fields within a
3597
+ column.
3598
+
3599
+ Providing a single column name will result in a separate validation step for each unique
3600
+ value in that column. For example, if you have a column called `"region"` with values
3601
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3602
+ region.
3603
+
3604
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3605
+ values to segment on. For example, if you have a column called `"date"` and you want to
3606
+ segment on only specific dates, you can provide a tuple like
3607
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3608
+ (i.e., no validation steps will be created for them).
3609
+
3610
+ A list with a combination of column names and tuples can be provided as well. This allows
3611
+ for more complex segmentation scenarios. The following inputs are all valid:
3612
+
3613
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3614
+ in the `"region"` column and specific dates in the `"date"` column
3615
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3616
+ columns
3617
+
3618
+ The segmentation is performed during interrogation, and the resulting validation steps will
3619
+ be numbered sequentially. Each segment will have its own validation step, and the results
3620
+ will be reported separately. This allows for a more granular analysis of the data and helps
3621
+ identify issues within specific segments.
3622
+
3623
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3624
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3625
+ that can be used for segmentation. For example, you could create a new column called
3626
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3627
+
3388
3628
  Thresholds
3389
3629
  ----------
3390
3630
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3483,6 +3723,8 @@ class Validate:
3483
3723
  _check_column(column=columns)
3484
3724
  # _check_value_float_int(value=value)
3485
3725
  _check_pre(pre=pre)
3726
+ # TODO: add check for segments
3727
+ # _check_segments(segments=segments)
3486
3728
  _check_thresholds(thresholds=thresholds)
3487
3729
  _check_boolean_input(param=na_pass, param_name="na_pass")
3488
3730
  _check_boolean_input(param=active, param_name="active")
@@ -3515,6 +3757,7 @@ class Validate:
3515
3757
  values=value,
3516
3758
  na_pass=na_pass,
3517
3759
  pre=pre,
3760
+ segments=segments,
3518
3761
  thresholds=thresholds,
3519
3762
  actions=actions,
3520
3763
  brief=brief,
@@ -3531,6 +3774,7 @@ class Validate:
3531
3774
  value: float | int | Column,
3532
3775
  na_pass: bool = False,
3533
3776
  pre: Callable | None = None,
3777
+ segments: SegmentSpec | None = None,
3534
3778
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3535
3779
  actions: Actions | None = None,
3536
3780
  brief: str | bool | None = None,
@@ -3562,10 +3806,15 @@ class Validate:
3562
3806
  Should any encountered None, NA, or Null values be considered as passing test units? By
3563
3807
  default, this is `False`. Set to `True` to pass test units with missing values.
3564
3808
  pre
3565
- A optional preprocessing function or lambda to apply to the data table during
3809
+ An optional preprocessing function or lambda to apply to the data table during
3566
3810
  interrogation. This function should take a table as input and return a modified table.
3567
3811
  Have a look at the *Preprocessing* section for more information on how to use this
3568
3812
  argument.
3813
+ segments
3814
+ An optional directive on segmentation, which serves to split a validation step into
3815
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3816
+ column name and its corresponding values to segment on, or a combination of both
3817
+ (provided as a list). Read the *Segmentation* section for usage information.
3569
3818
  thresholds
3570
3819
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3571
3820
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3628,6 +3877,42 @@ class Validate:
3628
3877
  lifetime of the transformed table, it only exists during the validation step and is not
3629
3878
  stored in the `Validate` object or used in subsequent validation steps.
3630
3879
 
3880
+ Segmentation
3881
+ ------------
3882
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3883
+ segments. This is useful for applying the same validation step to different subsets of the
3884
+ data. The segmentation can be done based on a single column or specific fields within a
3885
+ column.
3886
+
3887
+ Providing a single column name will result in a separate validation step for each unique
3888
+ value in that column. For example, if you have a column called `"region"` with values
3889
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3890
+ region.
3891
+
3892
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3893
+ values to segment on. For example, if you have a column called `"date"` and you want to
3894
+ segment on only specific dates, you can provide a tuple like
3895
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3896
+ (i.e., no validation steps will be created for them).
3897
+
3898
+ A list with a combination of column names and tuples can be provided as well. This allows
3899
+ for more complex segmentation scenarios. The following inputs are all valid:
3900
+
3901
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3902
+ in the `"region"` column and specific dates in the `"date"` column
3903
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3904
+ columns
3905
+
3906
+ The segmentation is performed during interrogation, and the resulting validation steps will
3907
+ be numbered sequentially. Each segment will have its own validation step, and the results
3908
+ will be reported separately. This allows for a more granular analysis of the data and helps
3909
+ identify issues within specific segments.
3910
+
3911
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3912
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3913
+ that can be used for segmentation. For example, you could create a new column called
3914
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3915
+
3631
3916
  Thresholds
3632
3917
  ----------
3633
3918
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3726,6 +4011,8 @@ class Validate:
3726
4011
  _check_column(column=columns)
3727
4012
  # _check_value_float_int(value=value)
3728
4013
  _check_pre(pre=pre)
4014
+ # TODO: add check for segments
4015
+ # _check_segments(segments=segments)
3729
4016
  _check_thresholds(thresholds=thresholds)
3730
4017
  _check_boolean_input(param=na_pass, param_name="na_pass")
3731
4018
  _check_boolean_input(param=active, param_name="active")
@@ -3758,6 +4045,7 @@ class Validate:
3758
4045
  values=value,
3759
4046
  na_pass=na_pass,
3760
4047
  pre=pre,
4048
+ segments=segments,
3761
4049
  thresholds=thresholds,
3762
4050
  actions=actions,
3763
4051
  brief=brief,
@@ -3776,6 +4064,7 @@ class Validate:
3776
4064
  inclusive: tuple[bool, bool] = (True, True),
3777
4065
  na_pass: bool = False,
3778
4066
  pre: Callable | None = None,
4067
+ segments: SegmentSpec | None = None,
3779
4068
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3780
4069
  actions: Actions | None = None,
3781
4070
  brief: str | bool | None = None,
@@ -3817,10 +4106,15 @@ class Validate:
3817
4106
  Should any encountered None, NA, or Null values be considered as passing test units? By
3818
4107
  default, this is `False`. Set to `True` to pass test units with missing values.
3819
4108
  pre
3820
- A optional preprocessing function or lambda to apply to the data table during
4109
+ An optional preprocessing function or lambda to apply to the data table during
3821
4110
  interrogation. This function should take a table as input and return a modified table.
3822
4111
  Have a look at the *Preprocessing* section for more information on how to use this
3823
4112
  argument.
4113
+ segments
4114
+ An optional directive on segmentation, which serves to split a validation step into
4115
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4116
+ column name and its corresponding values to segment on, or a combination of both
4117
+ (provided as a list). Read the *Segmentation* section for usage information.
3824
4118
  thresholds
3825
4119
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3826
4120
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3885,6 +4179,42 @@ class Validate:
3885
4179
  lifetime of the transformed table, it only exists during the validation step and is not
3886
4180
  stored in the `Validate` object or used in subsequent validation steps.
3887
4181
 
4182
+ Segmentation
4183
+ ------------
4184
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4185
+ segments. This is useful for applying the same validation step to different subsets of the
4186
+ data. The segmentation can be done based on a single column or specific fields within a
4187
+ column.
4188
+
4189
+ Providing a single column name will result in a separate validation step for each unique
4190
+ value in that column. For example, if you have a column called `"region"` with values
4191
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4192
+ region.
4193
+
4194
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4195
+ values to segment on. For example, if you have a column called `"date"` and you want to
4196
+ segment on only specific dates, you can provide a tuple like
4197
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4198
+ (i.e., no validation steps will be created for them).
4199
+
4200
+ A list with a combination of column names and tuples can be provided as well. This allows
4201
+ for more complex segmentation scenarios. The following inputs are all valid:
4202
+
4203
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4204
+ in the `"region"` column and specific dates in the `"date"` column
4205
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4206
+ columns
4207
+
4208
+ The segmentation is performed during interrogation, and the resulting validation steps will
4209
+ be numbered sequentially. Each segment will have its own validation step, and the results
4210
+ will be reported separately. This allows for a more granular analysis of the data and helps
4211
+ identify issues within specific segments.
4212
+
4213
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4214
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4215
+ that can be used for segmentation. For example, you could create a new column called
4216
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4217
+
3888
4218
  Thresholds
3889
4219
  ----------
3890
4220
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3992,6 +4322,8 @@ class Validate:
3992
4322
  # _check_value_float_int(value=left)
3993
4323
  # _check_value_float_int(value=right)
3994
4324
  _check_pre(pre=pre)
4325
+ # TODO: add check for segments
4326
+ # _check_segments(segments=segments)
3995
4327
  _check_thresholds(thresholds=thresholds)
3996
4328
  _check_boolean_input(param=na_pass, param_name="na_pass")
3997
4329
  _check_boolean_input(param=active, param_name="active")
@@ -4029,6 +4361,7 @@ class Validate:
4029
4361
  inclusive=inclusive,
4030
4362
  na_pass=na_pass,
4031
4363
  pre=pre,
4364
+ segments=segments,
4032
4365
  thresholds=thresholds,
4033
4366
  actions=actions,
4034
4367
  brief=brief,
@@ -4047,6 +4380,7 @@ class Validate:
4047
4380
  inclusive: tuple[bool, bool] = (True, True),
4048
4381
  na_pass: bool = False,
4049
4382
  pre: Callable | None = None,
4383
+ segments: SegmentSpec | None = None,
4050
4384
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4051
4385
  actions: Actions | None = None,
4052
4386
  brief: str | bool | None = None,
@@ -4088,10 +4422,15 @@ class Validate:
4088
4422
  Should any encountered None, NA, or Null values be considered as passing test units? By
4089
4423
  default, this is `False`. Set to `True` to pass test units with missing values.
4090
4424
  pre
4091
- A optional preprocessing function or lambda to apply to the data table during
4425
+ An optional preprocessing function or lambda to apply to the data table during
4092
4426
  interrogation. This function should take a table as input and return a modified table.
4093
4427
  Have a look at the *Preprocessing* section for more information on how to use this
4094
4428
  argument.
4429
+ segments
4430
+ An optional directive on segmentation, which serves to split a validation step into
4431
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4432
+ column name and its corresponding values to segment on, or a combination of both
4433
+ (provided as a list). Read the *Segmentation* section for usage information.
4095
4434
  thresholds
4096
4435
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4097
4436
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4156,6 +4495,42 @@ class Validate:
4156
4495
  lifetime of the transformed table, it only exists during the validation step and is not
4157
4496
  stored in the `Validate` object or used in subsequent validation steps.
4158
4497
 
4498
+ Segmentation
4499
+ ------------
4500
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4501
+ segments. This is useful for applying the same validation step to different subsets of the
4502
+ data. The segmentation can be done based on a single column or specific fields within a
4503
+ column.
4504
+
4505
+ Providing a single column name will result in a separate validation step for each unique
4506
+ value in that column. For example, if you have a column called `"region"` with values
4507
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4508
+ region.
4509
+
4510
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4511
+ values to segment on. For example, if you have a column called `"date"` and you want to
4512
+ segment on only specific dates, you can provide a tuple like
4513
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4514
+ (i.e., no validation steps will be created for them).
4515
+
4516
+ A list with a combination of column names and tuples can be provided as well. This allows
4517
+ for more complex segmentation scenarios. The following inputs are all valid:
4518
+
4519
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4520
+ in the `"region"` column and specific dates in the `"date"` column
4521
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4522
+ columns
4523
+
4524
+ The segmentation is performed during interrogation, and the resulting validation steps will
4525
+ be numbered sequentially. Each segment will have its own validation step, and the results
4526
+ will be reported separately. This allows for a more granular analysis of the data and helps
4527
+ identify issues within specific segments.
4528
+
4529
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4530
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4531
+ that can be used for segmentation. For example, you could create a new column called
4532
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4533
+
4159
4534
  Thresholds
4160
4535
  ----------
4161
4536
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4263,6 +4638,8 @@ class Validate:
4263
4638
  # _check_value_float_int(value=left)
4264
4639
  # _check_value_float_int(value=right)
4265
4640
  _check_pre(pre=pre)
4641
+ # TODO: add check for segments
4642
+ # _check_segments(segments=segments)
4266
4643
  _check_thresholds(thresholds=thresholds)
4267
4644
  _check_boolean_input(param=na_pass, param_name="na_pass")
4268
4645
  _check_boolean_input(param=active, param_name="active")
@@ -4300,6 +4677,7 @@ class Validate:
4300
4677
  inclusive=inclusive,
4301
4678
  na_pass=na_pass,
4302
4679
  pre=pre,
4680
+ segments=segments,
4303
4681
  thresholds=thresholds,
4304
4682
  actions=actions,
4305
4683
  brief=brief,
@@ -4315,6 +4693,7 @@ class Validate:
4315
4693
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4316
4694
  set: Collection[Any],
4317
4695
  pre: Callable | None = None,
4696
+ segments: SegmentSpec | None = None,
4318
4697
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4319
4698
  actions: Actions | None = None,
4320
4699
  brief: str | bool | None = None,
@@ -4338,10 +4717,15 @@ class Validate:
4338
4717
  set
4339
4718
  A list of values to compare against.
4340
4719
  pre
4341
- A optional preprocessing function or lambda to apply to the data table during
4720
+ An optional preprocessing function or lambda to apply to the data table during
4342
4721
  interrogation. This function should take a table as input and return a modified table.
4343
4722
  Have a look at the *Preprocessing* section for more information on how to use this
4344
4723
  argument.
4724
+ segments
4725
+ An optional directive on segmentation, which serves to split a validation step into
4726
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4727
+ column name and its corresponding values to segment on, or a combination of both
4728
+ (provided as a list). Read the *Segmentation* section for usage information.
4345
4729
  thresholds
4346
4730
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4347
4731
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4383,6 +4767,42 @@ class Validate:
4383
4767
  only exists during the validation step and is not stored in the `Validate` object or used in
4384
4768
  subsequent validation steps.
4385
4769
 
4770
+ Segmentation
4771
+ ------------
4772
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4773
+ segments. This is useful for applying the same validation step to different subsets of the
4774
+ data. The segmentation can be done based on a single column or specific fields within a
4775
+ column.
4776
+
4777
+ Providing a single column name will result in a separate validation step for each unique
4778
+ value in that column. For example, if you have a column called `"region"` with values
4779
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4780
+ region.
4781
+
4782
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4783
+ values to segment on. For example, if you have a column called `"date"` and you want to
4784
+ segment on only specific dates, you can provide a tuple like
4785
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4786
+ (i.e., no validation steps will be created for them).
4787
+
4788
+ A list with a combination of column names and tuples can be provided as well. This allows
4789
+ for more complex segmentation scenarios. The following inputs are all valid:
4790
+
4791
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4792
+ in the `"region"` column and specific dates in the `"date"` column
4793
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4794
+ columns
4795
+
4796
+ The segmentation is performed during interrogation, and the resulting validation steps will
4797
+ be numbered sequentially. Each segment will have its own validation step, and the results
4798
+ will be reported separately. This allows for a more granular analysis of the data and helps
4799
+ identify issues within specific segments.
4800
+
4801
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4802
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4803
+ that can be used for segmentation. For example, you could create a new column called
4804
+ `"segment"` through use of `pre=` and then use that column for segmentation.
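
A short assumed example pairing set membership with segmentation, matching the `set=` and `segments=` parameters added to this method's signature (taken here to be `col_vals_in_set()`); the column names and values are illustrative only.

```python
import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset(dataset="game_revenue"))
    .col_vals_in_set(
        columns="item_type",
        set=["iap", "ad"],
        segments="acquisition",  # one membership check per acquisition channel
    )
    .interrogate()
)
```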
4805
+
4386
4806
  Thresholds
4387
4807
  ----------
4388
4808
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4481,6 +4901,8 @@ class Validate:
4481
4901
  raise ValueError("`set=` must be a list of floats, integers, or strings.")
4482
4902
 
4483
4903
  _check_pre(pre=pre)
4904
+ # TODO: add check for segments
4905
+ # _check_segments(segments=segments)
4484
4906
  _check_thresholds(thresholds=thresholds)
4485
4907
  _check_boolean_input(param=active, param_name="active")
4486
4908
 
@@ -4508,6 +4930,7 @@ class Validate:
4508
4930
  column=column,
4509
4931
  values=set,
4510
4932
  pre=pre,
4933
+ segments=segments,
4511
4934
  thresholds=thresholds,
4512
4935
  actions=actions,
4513
4936
  brief=brief,
@@ -4523,6 +4946,7 @@ class Validate:
4523
4946
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4524
4947
  set: list[float | int],
4525
4948
  pre: Callable | None = None,
4949
+ segments: SegmentSpec | None = None,
4526
4950
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4527
4951
  actions: Actions | None = None,
4528
4952
  brief: str | bool | None = None,
@@ -4546,10 +4970,15 @@ class Validate:
4546
4970
  set
4547
4971
  A list of values to compare against.
4548
4972
  pre
4549
- A optional preprocessing function or lambda to apply to the data table during
4973
+ An optional preprocessing function or lambda to apply to the data table during
4550
4974
  interrogation. This function should take a table as input and return a modified table.
4551
4975
  Have a look at the *Preprocessing* section for more information on how to use this
4552
4976
  argument.
4977
+ segments
4978
+ An optional directive on segmentation, which serves to split a validation step into
4979
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4980
+ column name and its corresponding values to segment on, or a combination of both
4981
+ (provided as a list). Read the *Segmentation* section for usage information.
4553
4982
  thresholds
4554
4983
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4555
4984
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4591,6 +5020,42 @@ class Validate:
4591
5020
  only exists during the validation step and is not stored in the `Validate` object or used in
4592
5021
  subsequent validation steps.
4593
5022
 
5023
+ Segmentation
5024
+ ------------
5025
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5026
+ segments. This is useful for applying the same validation step to different subsets of the
5027
+ data. The segmentation can be done based on a single column or specific fields within a
5028
+ column.
5029
+
5030
+ Providing a single column name will result in a separate validation step for each unique
5031
+ value in that column. For example, if you have a column called `"region"` with values
5032
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5033
+ region.
5034
+
5035
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5036
+ values to segment on. For example, if you have a column called `"date"` and you want to
5037
+ segment on only specific dates, you can provide a tuple like
5038
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5039
+ (i.e., no validation steps will be created for them).
5040
+
5041
+ A list with a combination of column names and tuples can be provided as well. This allows
5042
+ for more complex segmentation scenarios. The following inputs are all valid:
5043
+
5044
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5045
+ in the `"region"` column and specific dates in the `"date"` column
5046
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5047
+ columns
5048
+
5049
+ The segmentation is performed during interrogation, and the resulting validation steps will
5050
+ be numbered sequentially. Each segment will have its own validation step, and the results
5051
+ will be reported separately. This allows for a more granular analysis of the data and helps
5052
+ identify issues within specific segments.
5053
+
5054
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5055
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5056
+ that can be used for segmentation. For example, you could create a new column called
5057
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5058
+
4594
5059
  Thresholds
4595
5060
  ----------
4596
5061
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4684,6 +5149,8 @@ class Validate:
4684
5149
  _check_column(column=columns)
4685
5150
  _check_set_types(set=set)
4686
5151
  _check_pre(pre=pre)
5152
+ # TODO: add check for segments
5153
+ # _check_segments(segments=segments)
4687
5154
  _check_thresholds(thresholds=thresholds)
4688
5155
  _check_boolean_input(param=active, param_name="active")
4689
5156
 
@@ -4711,6 +5178,7 @@ class Validate:
4711
5178
  column=column,
4712
5179
  values=set,
4713
5180
  pre=pre,
5181
+ segments=segments,
4714
5182
  thresholds=thresholds,
4715
5183
  actions=actions,
4716
5184
  brief=brief,
@@ -4725,6 +5193,7 @@ class Validate:
4725
5193
  self,
4726
5194
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4727
5195
  pre: Callable | None = None,
5196
+ segments: SegmentSpec | None = None,
4728
5197
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4729
5198
  actions: Actions | None = None,
4730
5199
  brief: str | bool | None = None,
@@ -4745,10 +5214,15 @@ class Validate:
4745
5214
  multiple columns are supplied or resolved, there will be a separate validation step
4746
5215
  generated for each column.
4747
5216
  pre
4748
- A optional preprocessing function or lambda to apply to the data table during
5217
+ An optional preprocessing function or lambda to apply to the data table during
4749
5218
  interrogation. This function should take a table as input and return a modified table.
4750
5219
  Have a look at the *Preprocessing* section for more information on how to use this
4751
5220
  argument.
5221
+ segments
5222
+ An optional directive on segmentation, which serves to split a validation step into
5223
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5224
+ column name and its corresponding values to segment on, or a combination of both
5225
+ (provided as a list). Read the *Segmentation* section for usage information.
4752
5226
  thresholds
4753
5227
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4754
5228
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4790,6 +5264,42 @@ class Validate:
4790
5264
  only exists during the validation step and is not stored in the `Validate` object or used in
4791
5265
  subsequent validation steps.
4792
5266
 
5267
+ Segmentation
5268
+ ------------
5269
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5270
+ segments. This is useful for applying the same validation step to different subsets of the
5271
+ data. The segmentation can be done based on a single column or specific fields within a
5272
+ column.
5273
+
5274
+ Providing a single column name will result in a separate validation step for each unique
5275
+ value in that column. For example, if you have a column called `"region"` with values
5276
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5277
+ region.
5278
+
5279
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5280
+ values to segment on. For example, if you have a column called `"date"` and you want to
5281
+ segment on only specific dates, you can provide a tuple like
5282
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5283
+ (i.e., no validation steps will be created for them).
5284
+
5285
+ A list with a combination of column names and tuples can be provided as well. This allows
5286
+ for more complex segmentation scenarios. The following inputs are all valid:
5287
+
5288
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5289
+ in the `"region"` column and specific dates in the `"date"` column
5290
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5291
+ columns
5292
+
5293
+ The segmentation is performed during interrogation, and the resulting validation steps will
5294
+ be numbered sequentially. Each segment will have its own validation step, and the results
5295
+ will be reported separately. This allows for a more granular analysis of the data and helps
5296
+ identify issues within specific segments.
5297
+
5298
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5299
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5300
+ that can be used for segmentation. For example, you could create a new column called
5301
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5302
+
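An editorial sketch of the list-of-column-names form follows (invented data; it assumes `col_vals_not_null()` is one of the methods accepting `segments=` in this release):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "region": ["North", "South", "North", "South"],
        "date": ["2023-01-01", "2023-01-01", "2023-01-02", "2023-01-02"],
        "amount": [10.5, None, 7.2, 3.1],
    }
)

validation = (
    pb.Validate(data=tbl)
    # One step per unique `region` value plus one step per unique `date` value
    .col_vals_not_null(columns="amount", segments=["region", "date"])
    .interrogate()
)
```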
4793
5303
  Thresholds
4794
5304
  ----------
4795
5305
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4879,6 +5389,8 @@ class Validate:
4879
5389
 
4880
5390
  _check_column(column=columns)
4881
5391
  _check_pre(pre=pre)
5392
+ # TODO: add check for segments
5393
+ # _check_segments(segments=segments)
4882
5394
  _check_thresholds(thresholds=thresholds)
4883
5395
  _check_boolean_input(param=active, param_name="active")
4884
5396
 
@@ -4905,6 +5417,7 @@ class Validate:
4905
5417
  assertion_type=assertion_type,
4906
5418
  column=column,
4907
5419
  pre=pre,
5420
+ segments=segments,
4908
5421
  thresholds=thresholds,
4909
5422
  actions=actions,
4910
5423
  brief=brief,
@@ -4919,6 +5432,7 @@ class Validate:
4919
5432
  self,
4920
5433
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4921
5434
  pre: Callable | None = None,
5435
+ segments: SegmentSpec | None = None,
4922
5436
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4923
5437
  actions: Actions | None = None,
4924
5438
  brief: str | bool | None = None,
@@ -4939,10 +5453,15 @@ class Validate:
4939
5453
  multiple columns are supplied or resolved, there will be a separate validation step
4940
5454
  generated for each column.
4941
5455
  pre
4942
- A optional preprocessing function or lambda to apply to the data table during
5456
+ An optional preprocessing function or lambda to apply to the data table during
4943
5457
  interrogation. This function should take a table as input and return a modified table.
4944
5458
  Have a look at the *Preprocessing* section for more information on how to use this
4945
5459
  argument.
5460
+ segments
5461
+ An optional directive on segmentation, which serves to split a validation step into
5462
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5463
+ column name and its corresponding values to segment on, or a combination of both
5464
+ (provided as a list). Read the *Segmentation* section for usage information.
4946
5465
  thresholds
4947
5466
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4948
5467
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4984,6 +5503,42 @@ class Validate:
4984
5503
  only exists during the validation step and is not stored in the `Validate` object or used in
4985
5504
  subsequent validation steps.
4986
5505
 
5506
+ Segmentation
5507
+ ------------
5508
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5509
+ segments. This is useful for applying the same validation step to different subsets of the
5510
+ data. The segmentation can be done based on a single column or specific fields within a
5511
+ column.
5512
+
5513
+ Providing a single column name will result in a separate validation step for each unique
5514
+ value in that column. For example, if you have a column called `"region"` with values
5515
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5516
+ region.
5517
+
5518
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5519
+ values to segment on. For example, if you have a column called `"date"` and you want to
5520
+ segment on only specific dates, you can provide a tuple like
5521
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5522
+ (i.e., no validation steps will be created for them).
5523
+
5524
+ A list with a combination of column names and tuples can be provided as well. This allows
5525
+ for more complex segmentation scenarios. The following inputs are all valid:
5526
+
5527
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5528
+ in the `"region"` column and specific dates in the `"date"` column
5529
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5530
+ columns
5531
+
5532
+ The segmentation is performed during interrogation, and the resulting validation steps will
5533
+ be numbered sequentially. Each segment will have its own validation step, and the results
5534
+ will be reported separately. This allows for a more granular analysis of the data and helps
5535
+ identify issues within specific segments.
5536
+
5537
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5538
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5539
+ that can be used for segmentation. For example, you could create a new column called
5540
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5541
+
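The single-tuple form restricts validation to the listed value(s) only. A brief editorial sketch (invented data; it assumes `col_vals_null()` accepts `segments=` here):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "stage": ["draft", "draft", "published", "archived"],
        "published_at": [None, None, "2023-03-01", "2023-02-10"],
    }
)

validation = (
    pb.Validate(data=tbl)
    # Only rows where `stage` equals "draft" form a segment; other stages are disregarded
    .col_vals_null(columns="published_at", segments=("stage", "draft"))
    .interrogate()
)
```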
4987
5542
  Thresholds
4988
5543
  ----------
4989
5544
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5073,6 +5628,8 @@ class Validate:
5073
5628
 
5074
5629
  _check_column(column=columns)
5075
5630
  _check_pre(pre=pre)
5631
+ # TODO: add check for segments
5632
+ # _check_segments(segments=segments)
5076
5633
  _check_thresholds(thresholds=thresholds)
5077
5634
  _check_boolean_input(param=active, param_name="active")
5078
5635
 
@@ -5099,6 +5656,7 @@ class Validate:
5099
5656
  assertion_type=assertion_type,
5100
5657
  column=column,
5101
5658
  pre=pre,
5659
+ segments=segments,
5102
5660
  thresholds=thresholds,
5103
5661
  actions=actions,
5104
5662
  brief=brief,
@@ -5115,6 +5673,7 @@ class Validate:
5115
5673
  pattern: str,
5116
5674
  na_pass: bool = False,
5117
5675
  pre: Callable | None = None,
5676
+ segments: SegmentSpec | None = None,
5118
5677
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5119
5678
  actions: Actions | None = None,
5120
5679
  brief: str | bool | None = None,
@@ -5141,10 +5700,15 @@ class Validate:
5141
5700
  Should any encountered None, NA, or Null values be considered as passing test units? By
5142
5701
  default, this is `False`. Set to `True` to pass test units with missing values.
5143
5702
  pre
5144
- A optional preprocessing function or lambda to apply to the data table during
5703
+ An optional preprocessing function or lambda to apply to the data table during
5145
5704
  interrogation. This function should take a table as input and return a modified table.
5146
5705
  Have a look at the *Preprocessing* section for more information on how to use this
5147
5706
  argument.
5707
+ segments
5708
+ An optional directive on segmentation, which serves to split a validation step into
5709
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5710
+ column name and its corresponding values to segment on, or a combination of both
5711
+ (provided as a list). Read the *Segmentation* section for usage information.
5148
5712
  thresholds
5149
5713
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5150
5714
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5186,6 +5750,42 @@ class Validate:
5186
5750
  only exists during the validation step and is not stored in the `Validate` object or used in
5187
5751
  subsequent validation steps.
5188
5752
 
5753
+ Segmentation
5754
+ ------------
5755
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5756
+ segments. This is useful for applying the same validation step to different subsets of the
5757
+ data. The segmentation can be done based on a single column or specific fields within a
5758
+ column.
5759
+
5760
+ Providing a single column name will result in a separate validation step for each unique
5761
+ value in that column. For example, if you have a column called `"region"` with values
5762
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5763
+ region.
5764
+
5765
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5766
+ values to segment on. For example, if you have a column called `"date"` and you want to
5767
+ segment on only specific dates, you can provide a tuple like
5768
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5769
+ (i.e., no validation steps will be created for them).
5770
+
5771
+ A list with a combination of column names and tuples can be provided as well. This allows
5772
+ for more complex segmentation scenarios. The following inputs are all valid:
5773
+
5774
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5775
+ in the `"region"` column and specific dates in the `"date"` column
5776
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5777
+ columns
5778
+
5779
+ The segmentation is performed during interrogation, and the resulting validation steps will
5780
+ be numbered sequentially. Each segment will have its own validation step, and the results
5781
+ will be reported separately. This allows for a more granular analysis of the data and helps
5782
+ identify issues within specific segments.
5783
+
5784
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5785
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5786
+ that can be used for segmentation. For example, you could create a new column called
5787
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5788
+
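For this method, the tuple-with-values form could look like the following editorial sketch (invented table and dates; only the two listed dates produce segments):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "date": ["2023-01-01", "2023-01-01", "2023-01-02", "2023-01-03"],
        "order_id": ["A-001", "A-002", "B-103", "misc"],
    }
)

validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(
        columns="order_id",
        pattern=r"[A-Z]-[0-9]{3}",
        # Two segments result: ("date", "2023-01-01") and ("date", "2023-01-02");
        # rows dated "2023-01-03" are not validated by this step
        segments=("date", ["2023-01-01", "2023-01-02"]),
    )
    .interrogate()
)
```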
5189
5789
  Thresholds
5190
5790
  ----------
5191
5791
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5277,6 +5877,8 @@ class Validate:
5277
5877
 
5278
5878
  _check_column(column=columns)
5279
5879
  _check_pre(pre=pre)
5880
+ # TODO: add check for segments
5881
+ # _check_segments(segments=segments)
5280
5882
  _check_thresholds(thresholds=thresholds)
5281
5883
  _check_boolean_input(param=na_pass, param_name="na_pass")
5282
5884
  _check_boolean_input(param=active, param_name="active")
@@ -5306,6 +5908,7 @@ class Validate:
5306
5908
  values=pattern,
5307
5909
  na_pass=na_pass,
5308
5910
  pre=pre,
5911
+ segments=segments,
5309
5912
  thresholds=thresholds,
5310
5913
  actions=actions,
5311
5914
  brief=brief,
@@ -5320,6 +5923,7 @@ class Validate:
5320
5923
  self,
5321
5924
  expr: any,
5322
5925
  pre: Callable | None = None,
5926
+ segments: SegmentSpec | None = None,
5323
5927
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5324
5928
  actions: Actions | None = None,
5325
5929
  brief: str | bool | None = None,
@@ -5341,10 +5945,15 @@ class Validate:
5341
5945
  be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
5342
5946
  should either be a lambda expression or a Narwhals column expression.
5343
5947
  pre
5344
- A optional preprocessing function or lambda to apply to the data table during
5948
+ An optional preprocessing function or lambda to apply to the data table during
5345
5949
  interrogation. This function should take a table as input and return a modified table.
5346
5950
  Have a look at the *Preprocessing* section for more information on how to use this
5347
5951
  argument.
5952
+ segments
5953
+ An optional directive on segmentation, which serves to split a validation step into
5954
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5955
+ column name and its corresponding values to segment on, or a combination of both
5956
+ (provided as a list). Read the *Segmentation* section for usage information.
5348
5957
  thresholds
5349
5958
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5350
5959
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5384,6 +5993,42 @@ class Validate:
5384
5993
  transformed table, it only exists during the validation step and is not stored in the
5385
5994
  `Validate` object or used in subsequent validation steps.
5386
5995
 
5996
+ Segmentation
5997
+ ------------
5998
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5999
+ segments. This is useful for applying the same validation step to different subsets of the
6000
+ data. The segmentation can be done based on a single column or specific fields within a
6001
+ column.
6002
+
6003
+ Providing a single column name will result in a separate validation step for each unique
6004
+ value in that column. For example, if you have a column called `"region"` with values
6005
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
6006
+ region.
6007
+
6008
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
6009
+ values to segment on. For example, if you have a column called `"date"` and you want to
6010
+ segment on only specific dates, you can provide a tuple like
6011
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
6012
+ (i.e., no validation steps will be created for them).
6013
+
6014
+ A list with a combination of column names and tuples can be provided as well. This allows
6015
+ for more complex segmentation scenarios. The following inputs are all valid:
6016
+
6017
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6018
+ in the `"region"` column and specific dates in the `"date"` column
6019
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6020
+ columns
6021
+
6022
+ The segmentation is performed during interrogation, and the resulting validation steps will
6023
+ be numbered sequentially. Each segment will have its own validation step, and the results
6024
+ will be reported separately. This allows for a more granular analysis of the data and helps
6025
+ identify issues within specific segments.
6026
+
6027
+ Importantly, the segmentation process will be performed after any preprocessing of the data
6028
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
6029
+ that can be used for segmentation. For example, you could create a new column called
6030
+ `"segment"` through use of `pre=` and then use that column for segmentation.
6031
+
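Because segmentation runs after `pre=` is applied, a derived column can serve as the segmentation target. An editorial sketch with invented data (a Polars-backed table is assumed):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"amount": [12.0, 0.5, 7.2, 3.3]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_expr(
        expr=pl.col("amount") > 0,
        # `pre=` is applied first, so the derived "band" column exists
        # by the time the segments are resolved
        pre=lambda df: df.with_columns(
            band=pl.when(pl.col("amount") > 5).then(pl.lit("high")).otherwise(pl.lit("low"))
        ),
        segments="band",
    )
    .interrogate()
)
```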
5387
6032
  Thresholds
5388
6033
  ----------
5389
6034
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5461,6 +6106,8 @@ class Validate:
5461
6106
  # TODO: Add a check for the expression to ensure it's a valid expression object
5462
6107
  # _check_expr(expr=expr)
5463
6108
  _check_pre(pre=pre)
6109
+ # TODO: add check for segments
6110
+ # _check_segments(segments=segments)
5464
6111
  _check_thresholds(thresholds=thresholds)
5465
6112
  _check_boolean_input(param=active, param_name="active")
5466
6113
 
@@ -5477,6 +6124,7 @@ class Validate:
5477
6124
  column=None,
5478
6125
  values=expr,
5479
6126
  pre=pre,
6127
+ segments=segments,
5480
6128
  thresholds=thresholds,
5481
6129
  actions=actions,
5482
6130
  brief=brief,
@@ -5665,6 +6313,7 @@ class Validate:
5665
6313
  self,
5666
6314
  columns_subset: str | list[str] | None = None,
5667
6315
  pre: Callable | None = None,
6316
+ segments: SegmentSpec | None = None,
5668
6317
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5669
6318
  actions: Actions | None = None,
5670
6319
  brief: str | bool | None = None,
@@ -5685,10 +6334,15 @@ class Validate:
5685
6334
  columns are supplied, the distinct comparison will be made over the combination of
5686
6335
  values in those columns.
5687
6336
  pre
5688
- A optional preprocessing function or lambda to apply to the data table during
6337
+ An optional preprocessing function or lambda to apply to the data table during
5689
6338
  interrogation. This function should take a table as input and return a modified table.
5690
6339
  Have a look at the *Preprocessing* section for more information on how to use this
5691
6340
  argument.
6341
+ segments
6342
+ An optional directive on segmentation, which serves to split a validation step into
6343
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
6344
+ column name and its corresponding values to segment on, or a combination of both
6345
+ (provided as a list). Read the *Segmentation* section for usage information.
5692
6346
  thresholds
5693
6347
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5694
6348
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5730,6 +6384,42 @@ class Validate:
5730
6384
  table, it only exists during the validation step and is not stored in the `Validate` object
5731
6385
  or used in subsequent validation steps.
5732
6386
 
6387
+ Segmentation
6388
+ ------------
6389
+ The `segments=` argument allows for the segmentation of a validation step into multiple
6390
+ segments. This is useful for applying the same validation step to different subsets of the
6391
+ data. The segmentation can be done based on a single column or specific fields within a
6392
+ column.
6393
+
6394
+ Providing a single column name will result in a separate validation step for each unique
6395
+ value in that column. For example, if you have a column called `"region"` with values
6396
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
6397
+ region.
6398
+
6399
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
6400
+ values to segment on. For example, if you have a column called `"date"` and you want to
6401
+ segment on only specific dates, you can provide a tuple like
6402
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
6403
+ (i.e., no validation steps will be created for them).
6404
+
6405
+ A list with a combination of column names and tuples can be provided as well. This allows
6406
+ for more complex segmentation scenarios. The following inputs are all valid:
6407
+
6408
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6409
+ in the `"region"` column and specific dates in the `"date"` column
6410
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6411
+ columns
6412
+
6413
+ The segmentation is performed during interrogation, and the resulting validation steps will
6414
+ be numbered sequentially. Each segment will have its own validation step, and the results
6415
+ will be reported separately. This allows for a more granular analysis of the data and helps
6416
+ identify issues within specific segments.
6417
+
6418
+ Importantly, the segmentation process will be performed after any preprocessing of the data
6419
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
6420
+ that can be used for segmentation. For example, you could create a new column called
6421
+ `"segment"` through use of `pre=` and then use that column for segmentation.
6422
+
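A mixed list can combine both forms. In this editorial sketch (invented data), segments come from every `region` value plus the one listed date:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "region": ["North", "North", "South", "South"],
        "date": ["2023-01-01", "2023-01-01", "2023-01-02", "2023-01-02"],
        "order_id": ["A-001", "A-001", "B-103", "B-104"],
    }
)

validation = (
    pb.Validate(data=tbl)
    .rows_distinct(
        columns_subset="order_id",
        segments=["region", ("date", ["2023-01-02"])],
    )
    .interrogate()
)
```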
5733
6423
  Thresholds
5734
6424
  ----------
5735
6425
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5823,6 +6513,8 @@ class Validate:
5823
6513
  assertion_type = _get_fn_name()
5824
6514
 
5825
6515
  _check_pre(pre=pre)
6516
+ # TODO: add check for segments
6517
+ # _check_segments(segments=segments)
5826
6518
  _check_thresholds(thresholds=thresholds)
5827
6519
  _check_boolean_input(param=active, param_name="active")
5828
6520
 
@@ -5843,6 +6535,7 @@ class Validate:
5843
6535
  assertion_type=assertion_type,
5844
6536
  column=columns_subset,
5845
6537
  pre=pre,
6538
+ segments=segments,
5846
6539
  thresholds=thresholds,
5847
6540
  actions=actions,
5848
6541
  brief=brief,
@@ -5903,7 +6596,7 @@ class Validate:
5903
6596
  substring matches are allowed, so a schema data type of `Int` would match a target table
5904
6597
  data type of `Int64`.
5905
6598
  pre
5906
- A optional preprocessing function or lambda to apply to the data table during
6599
+ An optional preprocessing function or lambda to apply to the data table during
5907
6600
  interrogation. This function should take a table as input and return a modified table.
5908
6601
  Have a look at the *Preprocessing* section for more information on how to use this
5909
6602
  argument.
@@ -6116,7 +6809,7 @@ class Validate:
6116
6809
  Should the validation step be inverted? If `True`, then the expectation is that the row
6117
6810
  count of the target table should not match the specified `count=` value.
6118
6811
  pre
6119
- A optional preprocessing function or lambda to apply to the data table during
6812
+ An optional preprocessing function or lambda to apply to the data table during
6120
6813
  interrogation. This function should take a table as input and return a modified table.
6121
6814
  Have a look at the *Preprocessing* section for more information on how to use this
6122
6815
  argument.
@@ -6326,7 +7019,7 @@ class Validate:
6326
7019
  Should the validation step be inverted? If `True`, then the expectation is that the
6327
7020
  column count of the target table should not match the specified `count=` value.
6328
7021
  pre
6329
- A optional preprocessing function or lambda to apply to the data table during
7022
+ An optional preprocessing function or lambda to apply to the data table during
6330
7023
  interrogation. This function should take a table as input and return a modified table.
6331
7024
  Have a look at the *Preprocessing* section for more information on how to use this
6332
7025
  argument.
@@ -6844,10 +7537,14 @@ class Validate:
6844
7537
 
6845
7538
  self.time_start = datetime.datetime.now(datetime.timezone.utc)
6846
7539
 
6847
- # Expand `validation_info` by evaluating any column expressions in `column`
7540
+ # Expand `validation_info` by evaluating any column expressions in `columns=`
6848
7541
  # (the `_evaluate_column_exprs()` method will eval and expand as needed)
6849
7542
  self._evaluate_column_exprs(validation_info=self.validation_info)
6850
7543
 
7544
+ # Expand `validation_info` by evaluating for any segmentation directives
7545
+ # provided in `segments=` (the `_evaluate_segments()` method will eval and expand as needed)
7546
+ self._evaluate_segments(validation_info=self.validation_info)
7547
+
6851
7548
  for validation in self.validation_info:
6852
7549
  # Set the `i` value for the validation step (this is 1-indexed)
6853
7550
  index_value = self.validation_info.index(validation) + 1
@@ -6883,6 +7580,10 @@ class Validate:
6883
7580
 
6884
7581
  validation.autobrief = autobrief
6885
7582
 
7583
+ # ------------------------------------------------
7584
+ # Bypassing the validation step if conditions met
7585
+ # ------------------------------------------------
7586
+
6886
7587
  # Skip the validation step if it is not active but still record the time of processing
6887
7588
  if not validation.active:
6888
7589
  end_time = datetime.datetime.now(datetime.timezone.utc)
@@ -6939,6 +7640,17 @@ class Validate:
6939
7640
  elif isinstance(validation.pre, Callable):
6940
7641
  data_tbl_step = validation.pre(data_tbl_step)
6941
7642
 
7643
+ # ------------------------------------------------
7644
+ # Segmentation stage
7645
+ # ------------------------------------------------
7646
+
7647
+ # Determine whether any segmentation directives are to be applied to the table
7648
+
7649
+ if validation.segments is not None:
7650
+ data_tbl_step = _apply_segments(
7651
+ data_tbl=data_tbl_step, segments_expr=validation.segments
7652
+ )
7653
+
6942
7654
  validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
6943
7655
  tbl_type=tbl_type
6944
7656
  )
@@ -8840,6 +9552,13 @@ class Validate:
8840
9552
  # will be made blank if the validation has not been performed
8841
9553
  interrogation_performed = validation_info_dict.get("proc_duration_s", [None])[0] is not None
8842
9554
 
9555
+ # Determine which steps are those using segmented data
9556
+ segmented_steps = [
9557
+ i + 1
9558
+ for i, segment in enumerate(validation_info_dict["segments"])
9559
+ if segment is not None
9560
+ ]
9561
+
8843
9562
  # ------------------------------------------------
8844
9563
  # Process the `type_upd` entry
8845
9564
  # ------------------------------------------------
@@ -8849,6 +9568,7 @@ class Validate:
8849
9568
  assertion_str=validation_info_dict["assertion_type"],
8850
9569
  brief_str=validation_info_dict["brief"],
8851
9570
  autobrief_str=validation_info_dict["autobrief"],
9571
+ segmentation_str=validation_info_dict["segments"],
8852
9572
  lang=lang,
8853
9573
  )
8854
9574
 
@@ -8980,11 +9700,14 @@ class Validate:
8980
9700
  # Add the `tbl` entry
8981
9701
  # ------------------------------------------------
8982
9702
 
8983
- # Depending on if there was some preprocessing done, get the appropriate icon
8984
- # for the table processing status to be displayed in the report under the `tbl` column
9703
+ # Depending on if there was some preprocessing done, get the appropriate icon for
9704
+ # the table processing status to be displayed in the report under the `tbl` column
9705
+ # TODO: add the icon for the segmented data option when the step is segmented
8985
9706
 
8986
9707
  validation_info_dict["tbl"] = _transform_tbl_preprocessed(
8987
- pre=validation_info_dict["pre"], interrogation_performed=interrogation_performed
9708
+ pre=validation_info_dict["pre"],
9709
+ seg=validation_info_dict["segments"],
9710
+ interrogation_performed=interrogation_performed,
8988
9711
  )
8989
9712
 
8990
9713
  # ------------------------------------------------
@@ -9019,8 +9742,9 @@ class Validate:
9019
9742
  # Process `pass` and `fail` entries
9020
9743
  # ------------------------------------------------
9021
9744
 
9022
- # Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries (the length
9023
- # of the `pass` entry should be equal to the length of the `n_passed` and `n_failed` entries)
9745
+ # Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
9746
+ # (the length of the `pass` entry should be equal to the length of the
9747
+ # `n_passed` and `n_failed` entries)
9024
9748
 
9025
9749
  validation_info_dict["pass"] = _transform_passed_failed(
9026
9750
  n_passed_failed=validation_info_dict["n_passed"],
@@ -9173,6 +9897,9 @@ class Validate:
9173
9897
  # Remove the `pre` entry from the dictionary
9174
9898
  validation_info_dict.pop("pre")
9175
9899
 
9900
+ # Remove the `segments` entry from the dictionary
9901
+ validation_info_dict.pop("segments")
9902
+
9176
9903
  # Remove the `proc_duration_s` entry from the dictionary
9177
9904
  validation_info_dict.pop("proc_duration_s")
9178
9905
 
@@ -9255,6 +9982,10 @@ class Validate:
9255
9982
  columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"]
9256
9983
  ),
9257
9984
  )
9985
+ .tab_style(
9986
+ style=style.css("overflow-x: visible; white-space: nowrap;"),
9987
+ locations=loc.body(columns="type_upd", rows=segmented_steps),
9988
+ )
9258
9989
  .tab_style(
9259
9990
  style=style.fill(color="#FCFCFC" if interrogation_performed else "white"),
9260
9991
  locations=loc.body(columns=["w_upd", "e_upd", "c_upd"]),
@@ -9429,8 +10160,8 @@ class Validate:
9429
10160
  table object, which can be displayed in a notebook or exported to an HTML file.
9430
10161
 
9431
10162
  :::{.callout-warning}
9432
- The `get_step_report()` is still experimental. Please report any issues you encounter in the
9433
- [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
10163
+ The `get_step_report()` method is still experimental. Please report any issues you encounter
10164
+ in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
9434
10165
  :::
9435
10166
 
9436
10167
  Parameters
@@ -9463,6 +10194,35 @@ class Validate:
9463
10194
  GT
9464
10195
  A GT table object that represents the detailed report for the validation step.
9465
10196
 
10197
+ Types of Step Reports
10198
+ ---------------------
10199
+ The `get_step_report()` method produces a report based on the *type* of validation step.
10200
+ The following row-based validation methods will produce a report that shows the rows of the
10201
+ data that contain failing test units in one or more columns:
10202
+
10203
+ - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
10204
+ - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
10205
+ - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
10206
+ - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
10207
+ - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
10208
+ - [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
10209
+ - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
10210
+ - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
10211
+ - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
10212
+ - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
10213
+ - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
10214
+ - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
10215
+ - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
10216
+ - [`conjointly()`](`pointblank.Validate.conjointly`)
10217
+
10218
+ The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
10219
+ report that shows duplicate rows (or duplicate values in one or a set of columns as defined
10220
+ in that method's `columns_subset=` parameter).
10221
+
10222
+ The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
10223
+ produce a report that shows the schema of the data table and the schema of the validation
10224
+ step. The report will indicate whether the schemas match or not.
10225
+
9466
10226
  Examples
9467
10227
  --------
9468
10228
  ```{python}
@@ -9488,7 +10248,7 @@ class Validate:
9488
10248
  .col_vals_lt(columns="d", value=3500)
9489
10249
  .col_vals_between(columns="c", left=1, right=8)
9490
10250
  .col_vals_gt(columns="a", value=3)
9491
- .col_vals_regex(columns="b", pattern=r"\d-[a-z]{3}-\d{3}")
10251
+ .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
9492
10252
  .interrogate()
9493
10253
  )
9494
10254
 
@@ -9776,6 +10536,95 @@ class Validate:
9776
10536
 
9777
10537
  return self
9778
10538
 
10539
+ def _evaluate_segments(self, validation_info):
10540
+ """
10541
+ Evaluate any segmentation expressions stored in the `segments` attribute and expand each
10542
+ validation step with such directives into multiple steps. This is done by evaluating the
10543
+ segmentation expression and creating a new validation step for each segment. Errors in
10544
+ evaluation (such as no segments matched) will be caught and recorded in the `eval_error`
10545
+ attribute.
10546
+
10547
+ Parameters
10548
+ ----------
10549
+ validation_info
10550
+ Information about the validation to evaluate and expand.
10551
+ """
10552
+
10553
+ # Create a list to store the expanded validation steps
10554
+ expanded_validation_info = []
10555
+
10556
+ # Iterate over the validation steps
10557
+ for i, validation in enumerate(validation_info):
10558
+ # Get the segments expression
10559
+ segments_expr = validation.segments
10560
+
10561
+ # If the value is None, then skip the evaluation and append the validation step to the
10562
+ # list of expanded validation steps
10563
+ if segments_expr is None:
10564
+ expanded_validation_info.append(validation)
10565
+ continue
10566
+
10567
+ # Evaluate the segments expression
10568
+ try:
10569
+ # Get the table for this step; it can either be:
10570
+ # 1. the target table itself
10571
+ # 2. the target table modified by a `pre` attribute
10572
+
10573
+ if validation.pre is None:
10574
+ table = self.data
10575
+ else:
10576
+ table = validation.pre(self.data)
10577
+
10578
+ # If the `segments` expression is a string, that string is taken as a column name
10579
+ # for which segmentation should occur across unique values in the column
10580
+ if isinstance(segments_expr, str):
10581
+ seg_tuples = _seg_expr_from_string(data_tbl=table, segments_expr=segments_expr)
10582
+
10583
+ # If the 'segments' expression is a tuple, then normalize it to a list of tuples
10584
+ # - ("col", "value") -> [("col", "value")]
10585
+ # - ("col", ["value1", "value2"]) -> [("col", "value1"), ("col", "value2")]
10586
+ elif isinstance(segments_expr, tuple):
10587
+ seg_tuples = _seg_expr_from_tuple(segments_expr=segments_expr)
10588
+
10589
+ # If the 'segments' expression is a list of strings or tuples (can be mixed) then
10590
+ # normalize it to a list of tuples following the rules above
10591
+ elif isinstance(segments_expr, list):
10592
+ seg_tuples = []
10593
+ for seg in segments_expr:
10594
+ if isinstance(seg, str):
10595
+ # Use the utility function for string items
10596
+ str_seg_tuples = _seg_expr_from_string(
10597
+ data_tbl=table, segments_expr=seg
10598
+ )
10599
+ seg_tuples.extend(str_seg_tuples)
10600
+ elif isinstance(seg, tuple):
10601
+ # Use the utility function for tuple items
10602
+ tuple_seg_tuples = _seg_expr_from_tuple(segments_expr=seg)
10603
+ seg_tuples.extend(tuple_seg_tuples)
10604
+ else: # pragma: no cover
10605
+ # Handle invalid segment type
10606
+ raise ValueError(
10607
+ f"Invalid segment expression item type: {type(seg)}. "
10608
+ "Must be either string or tuple."
10609
+ )
10610
+
10611
+ except Exception: # pragma: no cover
10612
+ validation.eval_error = True
+ # Keep the unexpanded step and move on; `seg_tuples` was never set for this step
+ expanded_validation_info.append(validation)
+ continue
10613
+
10614
+ # For each segmentation resolved, create a new validation step and add it to the list of
10615
+ # expanded validation steps
10616
+ for seg in seg_tuples:
10617
+ new_validation = copy.deepcopy(validation)
10618
+
10619
+ new_validation.segments = seg
10620
+
10621
+ expanded_validation_info.append(new_validation)
10622
+
10623
+ # Replace the `validation_info` attribute with the expanded version
10624
+ self.validation_info = expanded_validation_info
10625
+
10626
+ return self
10627
+
9779
10628
  def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]:
9780
10629
  """
9781
10630
  Utility function to get a dictionary of validation attributes for each validation step.
@@ -10493,6 +11342,143 @@ def _prep_values_text(
10493
11342
  return values_str
10494
11343
 
10495
11344
 
11345
+ def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> list[tuple[str, str]]:
11346
+ """
11347
+ Obtain the segmentation categories from a table column.
11348
+
11349
+ The `segments_expr` value will have been checked to be a string, so there's no need to check for
11350
+ that here. The function will return a list of tuples representing pairings of a column name and
11351
+ a value. The task is to obtain the unique values in the column (handling different table types)
11352
+ and produce a normalized list of tuples of the form: `(column, value)`.
11353
+
11354
+ This function is used to create a list of segments for the validation step. And since there will
11355
+ usually be more than one segment, the validation step will be expanded into multiple during
11356
+ interrogation (where this function is called).
11357
+
11358
+ Parameters
11359
+ ----------
11360
+ data_tbl
11361
+ The table from which to obtain the segmentation categories.
11362
+ segments_expr
11363
+ The column name for which segmentation should occur across unique values in the column.
11364
+
11365
+ Returns
11366
+ -------
11367
+ list[tuple[str, str]]
11368
+ A list of tuples representing pairings of a column name and a value in the column.
11369
+ """
11370
+ # Determine if the table is a DataFrame or a DB table
11371
+ tbl_type = _get_tbl_type(data=data_tbl)
11372
+
11373
+ # Obtain the segmentation categories from the table column given as `segments_expr`
11374
+ if tbl_type == "polars":
11375
+ seg_categories = data_tbl[segments_expr].unique().to_list()
11376
+ elif tbl_type == "pandas":
11377
+ seg_categories = data_tbl[segments_expr].unique().tolist()
11378
+ elif tbl_type in IBIS_BACKENDS:
11379
+ distinct_col_vals = data_tbl.select(segments_expr).distinct()
11380
+ seg_categories = distinct_col_vals[segments_expr].to_list()
11381
+ else: # pragma: no cover
11382
+ raise ValueError(f"Unsupported table type: {tbl_type}")
11383
+
11384
+ # Ensure that the categories are sorted
11385
+ seg_categories.sort()
11386
+
11387
+ # Place each category and each value in a list of tuples as: `(column, value)`
11388
+ seg_tuples = [(segments_expr, category) for category in seg_categories]
11389
+
11390
+ return seg_tuples
11391
+
11392
+
11393
+ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
11394
+ """
11395
+ Normalize the segments expression to a list of tuples, given a single tuple.
11396
+
11397
+ The `segments_expr` value will have been checked to be a tuple, so there's no need to check for
11398
+ that here. The function will return a list of tuples representing pairings of a column name and
11399
+ a value. The task is to normalize the tuple into a list of tuples of the form:
11400
+ `(column, value)`.
11401
+
11402
+ The following examples show how this normalization works:
11403
+ - `("col", "value")` -> `[("col", "value")]` (single tuple, upgraded to a list of tuples)
11404
+ - `("col", ["value1", "value2"])` -> `[("col", "value1"), ("col", "value2")]` (tuple with a list
11405
+ of values, expanded into multiple tuples within a list)
11406
+
11407
+ This function is used to create a list of segments for the validation step. And since there will
11408
+ usually be more than one segment, the validation step will be expanded into multiple during
11409
+ interrogation (where this function is called).
11410
+
11411
+ Parameters
11412
+ ----------
11413
+ segments_expr
11414
+ The segments expression to normalize. It can be a tuple of the form
11415
+ `(column, value)` or `(column, [value1, value2])`.
11416
+
11417
+ Returns
11418
+ -------
11419
+ list[tuple[str, str]]
11420
+ A list of tuples representing pairings of a column name and a value in the column.
11421
+ """
11422
+ # Check if the first element is a string
11423
+ if isinstance(segments_expr[0], str):
11424
+ # If the second element is a list, create a list of tuples
11425
+ if isinstance(segments_expr[1], list):
11426
+ seg_tuples = [(segments_expr[0], value) for value in segments_expr[1]]
11427
+ # If the second element is not a list, create a single tuple
11428
+ else:
11429
+ seg_tuples = [(segments_expr[0], segments_expr[1])]
11430
+ # If the first element is not a string, raise an error
11431
+ else: # pragma: no cover
11432
+ raise ValueError("The first element of the segments expression must be a string.")
11433
+
11434
+ return seg_tuples
11435
+
11436
+
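To make the tuple-normalization rule above concrete, here is a small standalone editorial sketch; the helper name is hypothetical (it is not the package's private function) and only mirrors the documented `(column, value)` expansion:

```python
# Hypothetical stand-in that mirrors the tuple-normalization rule described above
def normalize_segment_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
    column, values = segments_expr
    if isinstance(values, list):
        # ("col", ["v1", "v2"]) -> [("col", "v1"), ("col", "v2")]
        return [(column, value) for value in values]
    # ("col", "v") -> [("col", "v")]
    return [(column, values)]


print(normalize_segment_tuple(("date", "2023-01-01")))
# [('date', '2023-01-01')]
print(normalize_segment_tuple(("date", ["2023-01-01", "2023-01-02"])))
# [('date', '2023-01-01'), ('date', '2023-01-02')]
```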
11437
+ def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any:
11438
+ """
11439
+ Apply the segments expression to the data table.
11440
+
11441
+ Filter the data table based on the `segments_expr=` value, where the first element is the
11442
+ column name and the second element is the value to filter by.
11443
+
11444
+ Parameters
11445
+ ----------
11446
+ data_tbl
11447
+ The data table to filter. It can be a Pandas DataFrame, Polars DataFrame, or an Ibis
11448
+ backend table.
11449
+ segments_expr
11450
+ The segments expression to apply. It is a tuple of the form `(column, value)`.
11451
+
11452
+ Returns
11453
+ -------
11454
+ any
11455
+ The filtered data table. It will be of the same type as the input table.
11456
+ """
11457
+ # Get the table type
11458
+ tbl_type = _get_tbl_type(data=data_tbl)
11459
+
11460
+ if tbl_type in ["pandas", "polars"]:
11461
+ # If the table is a Pandas or Polars DataFrame, transform it to a Narwhals table
11462
+ # and perform the filtering operation
11463
+
11464
+ # Transform to Narwhals table if a DataFrame
11465
+ data_tbl_nw = nw.from_native(data_tbl)
11466
+
11467
+ # Filter the data table based on the column name and value
11468
+ data_tbl_nw = data_tbl_nw.filter(nw.col(segments_expr[0]) == segments_expr[1])
11469
+
11470
+ # Transform back to the original table type
11471
+ data_tbl = data_tbl_nw.to_native()
11472
+
11473
+ elif tbl_type in IBIS_BACKENDS:
11474
+ # If the table is an Ibis backend table, perform the filtering operation directly
11475
+
11476
+ # Filter the data table based on the column name and value
11477
+ data_tbl = data_tbl[data_tbl[segments_expr[0]] == segments_expr[1]]
11478
+
11479
+ return data_tbl
11480
+
11481
+
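The Narwhals round trip used for the DataFrame branch above can be shown in isolation. This is an editorial sketch with invented data (it assumes `polars` and `narwhals` are installed), not package code:

```python
import narwhals as nw
import polars as pl

df = pl.DataFrame({"region": ["North", "South", "North"], "value": [1, 2, 3]})
segment = ("region", "North")

# Wrap the native frame, filter on the (column, value) pair, then unwrap it again
filtered = (
    nw.from_native(df)
    .filter(nw.col(segment[0]) == segment[1])
    .to_native()
)

print(filtered)  # only the two "North" rows remain
```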
10496
11482
  def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
10497
11483
  """
10498
11484
  Convert a `_ValidationInfo` object to a dictionary.
@@ -10517,6 +11503,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
10517
11503
  "inclusive",
10518
11504
  "na_pass",
10519
11505
  "pre",
11506
+ "segments",
10520
11507
  "label",
10521
11508
  "brief",
10522
11509
  "autobrief",
@@ -10631,7 +11618,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
10631
11618
  return title_text
10632
11619
 
10633
11620
 
10634
- def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list[str]:
11621
+ def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
10635
11622
  # If no interrogation was performed, return a list of empty strings
10636
11623
  if not interrogation_performed:
10637
11624
  return ["" for _ in range(len(pre))]
@@ -10640,11 +11627,13 @@ def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list
10640
11627
  # (either 'unchanged' (None) or 'modified' (not None))
10641
11628
  status_list = []
10642
11629
 
10643
- for status in pre:
10644
- if status is None:
10645
- status_list.append("unchanged")
10646
- else:
11630
+ for i in range(len(pre)):
11631
+ if seg[i] is not None:
11632
+ status_list.append("segmented")
11633
+ elif pre[i] is not None:
10647
11634
  status_list.append("modified")
11635
+ else:
11636
+ status_list.append("unchanged")
10648
11637
 
10649
11638
  return _get_preprocessed_table_icon(icon=status_list)
10650
11639
 
@@ -10752,7 +11741,11 @@ def _transform_w_e_c(values, color, interrogation_performed):
10752
11741
 
10753
11742
 
10754
11743
  def _transform_assertion_str(
10755
- assertion_str: list[str], brief_str: list[str | None], autobrief_str: list[str], lang: str
11744
+ assertion_str: list[str],
11745
+ brief_str: list[str | None],
11746
+ autobrief_str: list[str],
11747
+ segmentation_str: list[tuple | None],
11748
+ lang: str,
10756
11749
  ) -> list[str]:
10757
11750
  # Get the SVG icons for the assertion types
10758
11751
  svg_icon = _get_assertion_icon(icon=assertion_str)
@@ -10813,6 +11806,26 @@ def _transform_assertion_str(
10813
11806
  for assertion, svg, size, brief_div in zip(assertion_str, svg_icon, text_size, brief_divs)
10814
11807
  ]
10815
11808
 
11809
+ # If the `segments` list is not empty, prepend a segmentation div to the `type_upd` strings
11810
+ if segmentation_str:
11811
+ for i in range(len(type_upd)):
11812
+ if segmentation_str[i] is not None:
11813
+ # Get the column name and value from the segmentation expression
11814
+ column_name = segmentation_str[i][0]
11815
+ column_value = segmentation_str[i][1]
11816
+ # Create the segmentation div
11817
+ segmentation_div = (
11818
+ "<div style='margin-top: 0px; margin-bottom: 0px; "
11819
+ "white-space: pre; font-size: 8px; color: darkblue; padding-bottom: 4px; "
11820
+ "'>"
11821
+ "<strong><span style='font-family: Helvetica, arial, sans-serif;'>"
11822
+ f"SEGMENT&nbsp;&nbsp;</span></strong><span>{column_name} / {column_value}"
11823
+ "</span>"
11824
+ "</div>"
11825
+ )
11826
+ # Prepend the segmentation div to the type_upd string
11827
+ type_upd[i] = f"{segmentation_div} {type_upd[i]}"
11828
+
10816
11829
  return type_upd
10817
11830
 
10818
11831