pointblank 0.8.7__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -7,6 +7,7 @@ import datetime
7
7
  import inspect
8
8
  import json
9
9
  import re
10
+ import tempfile
10
11
  import threading
11
12
  from dataclasses import dataclass
12
13
  from importlib.metadata import version
@@ -55,8 +56,10 @@ from pointblank._interrogation import (
55
56
  ConjointlyValidation,
56
57
  NumberOfTestUnits,
57
58
  RowCountMatch,
59
+ RowsComplete,
58
60
  RowsDistinct,
59
61
  )
62
+ from pointblank._typing import SegmentSpec
60
63
  from pointblank._utils import (
61
64
  _check_any_df_lib,
62
65
  _check_invalid_fields,
@@ -119,16 +122,18 @@ def _action_context_manager(metadata):
119
122
  delattr(_action_context, "metadata")
120
123
 
121
124
 
122
- def get_action_metadata():
125
+ def get_action_metadata() -> dict | None:
123
126
  """Access step-level metadata when authoring custom actions.
124
127
 
125
128
  Get the metadata for the validation step where an action was triggered. This can be called by
126
- user functions to get the metadata for the current action.
129
+ user functions to get the metadata for the current action. This function can only be used within
130
+ callables crafted for the [`Actions`](`pointblank.Actions`) class.
127
131
 
128
132
  Returns
129
133
  -------
130
- dict
131
- A dictionary containing the metadata for the current step.
134
+ dict | None
135
+ A dictionary containing the metadata for the current step. If called outside of an action
136
+ (i.e., when no action is being executed), this function will return `None`.
132
137
 
133
138
  Description of the Metadata Fields
134
139
  ----------------------------------
@@ -163,7 +168,7 @@ def get_action_metadata():
163
168
  thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
164
169
  actions=pb.Actions(warning=log_issue),
165
170
  )
166
- .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
171
+ .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
167
172
  .col_vals_gt(columns="item_revenue", value=0.05)
168
173
  .col_vals_gt(
169
174
  columns="session_duration",
@@ -181,6 +186,11 @@ def get_action_metadata():
181
186
  - the `metadata` is a dictionary that is used to craft the log message
182
187
  - the action is passed as a bare function to the `Actions` object within the `Validate` object
183
188
  (placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
189
+
190
+ See Also
191
+ --------
192
+ Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
193
+ actions for validation steps that exceed a set threshold value.
184
194
  """
185
195
  if hasattr(_action_context, "metadata"): # pragma: no cover
186
196
  return _action_context.metadata # pragma: no cover
@@ -204,17 +214,19 @@ def _final_action_context_manager(summary):
204
214
  delattr(_final_action_context, "summary")
205
215
 
206
216
 
207
- def get_validation_summary():
217
+ def get_validation_summary() -> dict | None:
208
218
  """Access validation summary information when authoring final actions.
209
219
 
210
220
  This function provides a convenient way to access summary information about the validation
211
221
  process within a final action. It returns a dictionary with key metrics from the validation
212
- process.
222
+ process. This function can only be used within callables crafted for the
223
+ [`FinalActions`](`pointblank.FinalActions`) class.
213
224
 
214
225
  Returns
215
226
  -------
216
227
  dict | None
217
- A dictionary containing validation metrics, or None if called outside a final action.
228
- A dictionary containing validation metrics. If called outside of a final action context,
229
+ this function will return `None`.
218
230
 
219
231
  Description of the Summary Fields
220
232
  --------------------------------
@@ -304,6 +316,11 @@ def get_validation_summary():
304
316
 
305
317
  Final actions work well with both simple logging and more complex notification systems, allowing
306
318
  you to integrate validation results into your broader data quality workflows.
319
+
320
+ See Also
321
+ --------
322
+ Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
323
+ custom actions that are executed after all validation steps have been completed.
307
324
  """
308
325
  if hasattr(_final_action_context, "summary"):
309
326
  return _final_action_context.summary
@@ -516,10 +533,10 @@ def load_dataset(
516
533
  data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
517
534
 
518
535
  # Unzip the DuckDB dataset to a temporary directory
519
- with ZipFile(data_path, "r") as z:
520
- z.extractall(path="datasets")
536
+ with tempfile.TemporaryDirectory() as tmp, ZipFile(data_path, "r") as z:
537
+ z.extractall(path=tmp)
521
538
 
522
- data_path = f"datasets/{dataset}.ddb"
539
+ data_path = f"{tmp}/{dataset}.ddb"
523
540
 
524
541
  dataset = ibis.connect(f"duckdb://{data_path}").table(dataset)
525
542
 
@@ -1783,14 +1800,15 @@ class _ValidationInfo:
1783
1800
  assertion_type
1784
1801
  The type of assertion. This is the method name of the validation (e.g., `"col_vals_gt"`).
1785
1802
  column
1786
- The column to validate. Currently we don't allow for column expressions (which may map to
1787
- multiple columns).
1803
+ The column(s) to validate.
1788
1804
  values
1789
1805
  The value or values to compare against.
1790
1806
  na_pass
1791
1807
  Whether to pass test units that hold missing values.
1792
1808
  pre
1793
1809
  A preprocessing function or lambda to apply to the data table for the validation step.
1810
+ segments
1811
+ The segments to use for the validation step.
1794
1812
  thresholds
1795
1813
  The threshold values for the validation.
1796
1814
  actions
@@ -1841,11 +1859,12 @@ class _ValidationInfo:
1841
1859
  step_id: str | None = None
1842
1860
  sha1: str | None = None
1843
1861
  assertion_type: str | None = None
1844
- column: str | None = None
1862
+ column: any | None = None
1845
1863
  values: any | list[any] | tuple | None = None
1846
1864
  inclusive: tuple[bool, bool] | None = None
1847
1865
  na_pass: bool | None = None
1848
1866
  pre: Callable | None = None
1867
+ segments: any | None = None
1849
1868
  thresholds: Thresholds | None = None
1850
1869
  actions: Actions | None = None
1851
1870
  label: str | None = None
@@ -1909,7 +1928,7 @@ class Validate:
1909
1928
  The table to validate, which could be a DataFrame object or an Ibis table object. Read the
1910
1929
  *Supported Input Table Types* section for details on the supported table types.
1911
1930
  tbl_name
1912
- A optional name to assign to the input table object. If no value is provided, a name will
1931
+ An optional name to assign to the input table object. If no value is provided, a name will
1913
1932
  be generated based on whatever information is available. This table name will be displayed
1914
1933
  in the header area of the tabular report.
1915
1934
  label
@@ -2323,6 +2342,7 @@ class Validate:
2323
2342
  value: float | int | Column,
2324
2343
  na_pass: bool = False,
2325
2344
  pre: Callable | None = None,
2345
+ segments: SegmentSpec | None = None,
2326
2346
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2327
2347
  actions: Actions | None = None,
2328
2348
  brief: str | bool | None = None,
@@ -2354,10 +2374,15 @@ class Validate:
2354
2374
  Should any encountered None, NA, or Null values be considered as passing test units? By
2355
2375
  default, this is `False`. Set to `True` to pass test units with missing values.
2356
2376
  pre
2357
- A optional preprocessing function or lambda to apply to the data table during
2377
+ An optional preprocessing function or lambda to apply to the data table during
2358
2378
  interrogation. This function should take a table as input and return a modified table.
2359
2379
  Have a look at the *Preprocessing* section for more information on how to use this
2360
2380
  argument.
2381
+ segments
2382
+ An optional directive on segmentation, which serves to split a validation step into
2383
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2384
+ column name and its corresponding values to segment on, or a combination of both
2385
+ (provided as a list). Read the *Segmentation* section for usage information.
2361
2386
  thresholds
2362
2387
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2363
2388
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2420,6 +2445,42 @@ class Validate:
2420
2445
  lifetime of the transformed table, it only exists during the validation step and is not
2421
2446
  stored in the `Validate` object or used in subsequent validation steps.
2422
2447
 
2448
+ Segmentation
2449
+ ------------
2450
+ The `segments=` argument allows for the segmentation of a validation step into multiple
2451
+ segments. This is useful for applying the same validation step to different subsets of the
2452
+ data. The segmentation can be done based on a single column or specific fields within a
2453
+ column.
2454
+
2455
+ Providing a single column name will result in a separate validation step for each unique
2456
+ value in that column. For example, if you have a column called `"region"` with values
2457
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
2458
+ region.
2459
+
2460
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
2461
+ values to segment on. For example, if you have a column called `"date"` and you want to
2462
+ segment on only specific dates, you can provide a tuple like
2463
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
2464
+ (i.e., no validation steps will be created for them).
2465
+
2466
+ A list with a combination of column names and tuples can be provided as well. This allows
2467
+ for more complex segmentation scenarios. The following inputs are all valid:
2468
+
2469
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2470
+ in the `"region"` column and specific dates in the `"date"` column
2471
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2472
+ columns
2473
+
2474
+ The segmentation is performed during interrogation, and the resulting validation steps will
2475
+ be numbered sequentially. Each segment will have its own validation step, and the results
2476
+ will be reported separately. This allows for a more granular analysis of the data and helps
2477
+ identify issues within specific segments.
2478
+
2479
+ Importantly, the segmentation process will be performed after any preprocessing of the data
2480
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
2481
+ that can be used for segmentation. For example, you could create a new column called
2482
+ `"segment"` through use of `pre=` and then use that column for segmentation.
2483
+
2423
2484
  Thresholds
2424
2485
  ----------
2425
2486
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2518,6 +2579,8 @@ class Validate:
2518
2579
  _check_column(column=columns)
2519
2580
  # _check_value_float_int(value=value)
2520
2581
  _check_pre(pre=pre)
2582
+ # TODO: add check for segments
2583
+ # _check_segments(segments=segments)
2521
2584
  _check_thresholds(thresholds=thresholds)
2522
2585
  _check_boolean_input(param=na_pass, param_name="na_pass")
2523
2586
  _check_boolean_input(param=active, param_name="active")
@@ -2550,6 +2613,7 @@ class Validate:
2550
2613
  values=value,
2551
2614
  na_pass=na_pass,
2552
2615
  pre=pre,
2616
+ segments=segments,
2553
2617
  thresholds=thresholds,
2554
2618
  actions=actions,
2555
2619
  brief=brief,
@@ -2566,6 +2630,7 @@ class Validate:
2566
2630
  value: float | int | Column,
2567
2631
  na_pass: bool = False,
2568
2632
  pre: Callable | None = None,
2633
+ segments: SegmentSpec | None = None,
2569
2634
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2570
2635
  actions: Actions | None = None,
2571
2636
  brief: str | bool | None = None,
@@ -2597,10 +2662,15 @@ class Validate:
2597
2662
  Should any encountered None, NA, or Null values be considered as passing test units? By
2598
2663
  default, this is `False`. Set to `True` to pass test units with missing values.
2599
2664
  pre
2600
- A optional preprocessing function or lambda to apply to the data table during
2665
+ An optional preprocessing function or lambda to apply to the data table during
2601
2666
  interrogation. This function should take a table as input and return a modified table.
2602
2667
  Have a look at the *Preprocessing* section for more information on how to use this
2603
2668
  argument.
2669
+ segments
2670
+ An optional directive on segmentation, which serves to split a validation step into
2671
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2672
+ column name and its corresponding values to segment on, or a combination of both
2673
+ (provided as a list). Read the *Segmentation* section for usage information.
2604
2674
  thresholds
2605
2675
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2606
2676
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2663,6 +2733,42 @@ class Validate:
2663
2733
  lifetime of the transformed table, it only exists during the validation step and is not
2664
2734
  stored in the `Validate` object or used in subsequent validation steps.
2665
2735
 
2736
+ Segmentation
2737
+ ------------
2738
+ The `segments=` argument allows for the segmentation of a validation step into multiple
2739
+ segments. This is useful for applying the same validation step to different subsets of the
2740
+ data. The segmentation can be done based on a single column or specific fields within a
2741
+ column.
2742
+
2743
+ Providing a single column name will result in a separate validation step for each unique
2744
+ value in that column. For example, if you have a column called `"region"` with values
2745
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
2746
+ region.
2747
+
2748
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
2749
+ values to segment on. For example, if you have a column called `"date"` and you want to
2750
+ segment on only specific dates, you can provide a tuple like
2751
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
2752
+ (i.e., no validation steps will be created for them).
2753
+
2754
+ A list with a combination of column names and tuples can be provided as well. This allows
2755
+ for more complex segmentation scenarios. The following inputs are all valid:
2756
+
2757
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2758
+ in the `"region"` column and specific dates in the `"date"` column
2759
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2760
+ columns
2761
+
2762
+ The segmentation is performed during interrogation, and the resulting validation steps will
2763
+ be numbered sequentially. Each segment will have its own validation step, and the results
2764
+ will be reported separately. This allows for a more granular analysis of the data and helps
2765
+ identify issues within specific segments.
2766
+
2767
+ Importantly, the segmentation process will be performed after any preprocessing of the data
2768
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
2769
+ that can be used for segmentation. For example, you could create a new column called
2770
+ `"segment"` through use of `pre=` and then use that column for segmentation.
2771
+
2666
2772
  Thresholds
2667
2773
  ----------
2668
2774
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2760,6 +2866,8 @@ class Validate:
2760
2866
  _check_column(column=columns)
2761
2867
  # _check_value_float_int(value=value)
2762
2868
  _check_pre(pre=pre)
2869
+ # TODO: add check for segments
2870
+ # _check_segments(segments=segments)
2763
2871
  _check_thresholds(thresholds=thresholds)
2764
2872
  _check_boolean_input(param=na_pass, param_name="na_pass")
2765
2873
  _check_boolean_input(param=active, param_name="active")
@@ -2792,6 +2900,7 @@ class Validate:
2792
2900
  values=value,
2793
2901
  na_pass=na_pass,
2794
2902
  pre=pre,
2903
+ segments=segments,
2795
2904
  thresholds=thresholds,
2796
2905
  actions=actions,
2797
2906
  brief=brief,
@@ -2808,6 +2917,7 @@ class Validate:
2808
2917
  value: float | int | Column,
2809
2918
  na_pass: bool = False,
2810
2919
  pre: Callable | None = None,
2920
+ segments: SegmentSpec | None = None,
2811
2921
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
2812
2922
  actions: Actions | None = None,
2813
2923
  brief: str | bool | None = None,
@@ -2839,10 +2949,15 @@ class Validate:
2839
2949
  Should any encountered None, NA, or Null values be considered as passing test units? By
2840
2950
  default, this is `False`. Set to `True` to pass test units with missing values.
2841
2951
  pre
2842
- A optional preprocessing function or lambda to apply to the data table during
2952
+ An optional preprocessing function or lambda to apply to the data table during
2843
2953
  interrogation. This function should take a table as input and return a modified table.
2844
2954
  Have a look at the *Preprocessing* section for more information on how to use this
2845
2955
  argument.
2956
+ segments
2957
+ An optional directive on segmentation, which serves to split a validation step into
2958
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
2959
+ column name and its corresponding values to segment on, or a combination of both
2960
+ (provided as a list). Read the *Segmentation* section for usage information.
2846
2961
  thresholds
2847
2962
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
2848
2963
  The thresholds are set at the step level and will override any global thresholds set in
@@ -2905,6 +3020,42 @@ class Validate:
2905
3020
  lifetime of the transformed table, it only exists during the validation step and is not
2906
3021
  stored in the `Validate` object or used in subsequent validation steps.
2907
3022
 
3023
+ Segmentation
3024
+ ------------
3025
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3026
+ segments. This is useful for applying the same validation step to different subsets of the
3027
+ data. The segmentation can be done based on a single column or specific fields within a
3028
+ column.
3029
+
3030
+ Providing a single column name will result in a separate validation step for each unique
3031
+ value in that column. For example, if you have a column called `"region"` with values
3032
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3033
+ region.
3034
+
3035
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3036
+ values to segment on. For example, if you have a column called `"date"` and you want to
3037
+ segment on only specific dates, you can provide a tuple like
3038
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3039
+ (i.e., no validation steps will be created for them).
3040
+
3041
+ A list with a combination of column names and tuples can be provided as well. This allows
3042
+ for more complex segmentation scenarios. The following inputs are all valid:
3043
+
3044
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3045
+ in the `"region"` column and specific dates in the `"date"` column
3046
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3047
+ columns
3048
+
3049
+ The segmentation is performed during interrogation, and the resulting validation steps will
3050
+ be numbered sequentially. Each segment will have its own validation step, and the results
3051
+ will be reported separately. This allows for a more granular analysis of the data and helps
3052
+ identify issues within specific segments.
3053
+
3054
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3055
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3056
+ that can be used for segmentation. For example, you could create a new column called
3057
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3058
+
2908
3059
  Thresholds
2909
3060
  ----------
2910
3061
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3001,6 +3152,8 @@ class Validate:
3001
3152
  _check_column(column=columns)
3002
3153
  # _check_value_float_int(value=value)
3003
3154
  _check_pre(pre=pre)
3155
+ # TODO: add check for segments
3156
+ # _check_segments(segments=segments)
3004
3157
  _check_thresholds(thresholds=thresholds)
3005
3158
  _check_boolean_input(param=na_pass, param_name="na_pass")
3006
3159
  _check_boolean_input(param=active, param_name="active")
@@ -3033,6 +3186,7 @@ class Validate:
3033
3186
  values=value,
3034
3187
  na_pass=na_pass,
3035
3188
  pre=pre,
3189
+ segments=segments,
3036
3190
  thresholds=thresholds,
3037
3191
  actions=actions,
3038
3192
  brief=brief,
@@ -3049,6 +3203,7 @@ class Validate:
3049
3203
  value: float | int | Column,
3050
3204
  na_pass: bool = False,
3051
3205
  pre: Callable | None = None,
3206
+ segments: SegmentSpec | None = None,
3052
3207
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3053
3208
  actions: Actions | None = None,
3054
3209
  brief: str | bool | None = None,
@@ -3080,10 +3235,15 @@ class Validate:
3080
3235
  Should any encountered None, NA, or Null values be considered as passing test units? By
3081
3236
  default, this is `False`. Set to `True` to pass test units with missing values.
3082
3237
  pre
3083
- A optional preprocessing function or lambda to apply to the data table during
3238
+ An optional preprocessing function or lambda to apply to the data table during
3084
3239
  interrogation. This function should take a table as input and return a modified table.
3085
3240
  Have a look at the *Preprocessing* section for more information on how to use this
3086
3241
  argument.
3242
+ segments
3243
+ An optional directive on segmentation, which serves to split a validation step into
3244
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3245
+ column name and its corresponding values to segment on, or a combination of both
3246
+ (provided as a list). Read the *Segmentation* section for usage information.
3087
3247
  thresholds
3088
3248
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3089
3249
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3146,6 +3306,42 @@ class Validate:
3146
3306
  lifetime of the transformed table, it only exists during the validation step and is not
3147
3307
  stored in the `Validate` object or used in subsequent validation steps.
3148
3308
 
3309
+ Segmentation
3310
+ ------------
3311
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3312
+ segments. This is useful for applying the same validation step to different subsets of the
3313
+ data. The segmentation can be done based on a single column or specific fields within a
3314
+ column.
3315
+
3316
+ Providing a single column name will result in a separate validation step for each unique
3317
+ value in that column. For example, if you have a column called `"region"` with values
3318
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3319
+ region.
3320
+
3321
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3322
+ values to segment on. For example, if you have a column called `"date"` and you want to
3323
+ segment on only specific dates, you can provide a tuple like
3324
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3325
+ (i.e., no validation steps will be created for them).
3326
+
3327
+ A list with a combination of column names and tuples can be provided as well. This allows
3328
+ for more complex segmentation scenarios. The following inputs are all valid:
3329
+
3330
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3331
+ in the `"region"` column and specific dates in the `"date"` column
3332
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3333
+ columns
3334
+
3335
+ The segmentation is performed during interrogation, and the resulting validation steps will
3336
+ be numbered sequentially. Each segment will have its own validation step, and the results
3337
+ will be reported separately. This allows for a more granular analysis of the data and helps
3338
+ identify issues within specific segments.
3339
+
3340
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3341
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3342
+ that can be used for segmentation. For example, you could create a new column called
3343
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3344
+
3149
3345
  Thresholds
3150
3346
  ----------
3151
3347
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3240,6 +3436,8 @@ class Validate:
3240
3436
  _check_column(column=columns)
3241
3437
  # _check_value_float_int(value=value)
3242
3438
  _check_pre(pre=pre)
3439
+ # TODO: add check for segments
3440
+ # _check_segments(segments=segments)
3243
3441
  _check_thresholds(thresholds=thresholds)
3244
3442
  _check_boolean_input(param=na_pass, param_name="na_pass")
3245
3443
  _check_boolean_input(param=active, param_name="active")
@@ -3272,6 +3470,7 @@ class Validate:
3272
3470
  values=value,
3273
3471
  na_pass=na_pass,
3274
3472
  pre=pre,
3473
+ segments=segments,
3275
3474
  thresholds=thresholds,
3276
3475
  actions=actions,
3277
3476
  brief=brief,
@@ -3288,6 +3487,7 @@ class Validate:
3288
3487
  value: float | int | Column,
3289
3488
  na_pass: bool = False,
3290
3489
  pre: Callable | None = None,
3490
+ segments: SegmentSpec | None = None,
3291
3491
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3292
3492
  actions: Actions | None = None,
3293
3493
  brief: str | bool | None = None,
@@ -3319,10 +3519,15 @@ class Validate:
3319
3519
  Should any encountered None, NA, or Null values be considered as passing test units? By
3320
3520
  default, this is `False`. Set to `True` to pass test units with missing values.
3321
3521
  pre
3322
- A optional preprocessing function or lambda to apply to the data table during
3522
+ An optional preprocessing function or lambda to apply to the data table during
3323
3523
  interrogation. This function should take a table as input and return a modified table.
3324
3524
  Have a look at the *Preprocessing* section for more information on how to use this
3325
3525
  argument.
3526
+ segments
3527
+ An optional directive on segmentation, which serves to split a validation step into
3528
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3529
+ column name and its corresponding values to segment on, or a combination of both
3530
+ (provided as a list). Read the *Segmentation* section for usage information.
3326
3531
  thresholds
3327
3532
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3328
3533
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3385,6 +3590,42 @@ class Validate:
3385
3590
  lifetime of the transformed table, it only exists during the validation step and is not
3386
3591
  stored in the `Validate` object or used in subsequent validation steps.
3387
3592
 
3593
+ Segmentation
3594
+ ------------
3595
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3596
+ segments. This is useful for applying the same validation step to different subsets of the
3597
+ data. The segmentation can be done based on a single column or specific fields within a
3598
+ column.
3599
+
3600
+ Providing a single column name will result in a separate validation step for each unique
3601
+ value in that column. For example, if you have a column called `"region"` with values
3602
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3603
+ region.
3604
+
3605
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3606
+ values to segment on. For example, if you have a column called `"date"` and you want to
3607
+ segment on only specific dates, you can provide a tuple like
3608
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3609
+ (i.e., no validation steps will be created for them).
3610
+
3611
+ A list with a combination of column names and tuples can be provided as well. This allows
3612
+ for more complex segmentation scenarios. The following inputs are all valid:
3613
+
3614
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3615
+ in the `"region"` column and specific dates in the `"date"` column
3616
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3617
+ columns
3618
+
3619
+ The segmentation is performed during interrogation, and the resulting validation steps will
3620
+ be numbered sequentially. Each segment will have its own validation step, and the results
3621
+ will be reported separately. This allows for a more granular analysis of the data and helps
3622
+ identify issues within specific segments.
3623
+
3624
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3625
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3626
+ that can be used for segmentation. For example, you could create a new column called
3627
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3628
+
3388
3629
  Thresholds
3389
3630
  ----------
3390
3631
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3483,6 +3724,8 @@ class Validate:
3483
3724
  _check_column(column=columns)
3484
3725
  # _check_value_float_int(value=value)
3485
3726
  _check_pre(pre=pre)
3727
+ # TODO: add check for segments
3728
+ # _check_segments(segments=segments)
3486
3729
  _check_thresholds(thresholds=thresholds)
3487
3730
  _check_boolean_input(param=na_pass, param_name="na_pass")
3488
3731
  _check_boolean_input(param=active, param_name="active")
@@ -3515,6 +3758,7 @@ class Validate:
3515
3758
  values=value,
3516
3759
  na_pass=na_pass,
3517
3760
  pre=pre,
3761
+ segments=segments,
3518
3762
  thresholds=thresholds,
3519
3763
  actions=actions,
3520
3764
  brief=brief,
@@ -3531,6 +3775,7 @@ class Validate:
3531
3775
  value: float | int | Column,
3532
3776
  na_pass: bool = False,
3533
3777
  pre: Callable | None = None,
3778
+ segments: SegmentSpec | None = None,
3534
3779
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3535
3780
  actions: Actions | None = None,
3536
3781
  brief: str | bool | None = None,
@@ -3562,10 +3807,15 @@ class Validate:
3562
3807
  Should any encountered None, NA, or Null values be considered as passing test units? By
3563
3808
  default, this is `False`. Set to `True` to pass test units with missing values.
3564
3809
  pre
3565
- A optional preprocessing function or lambda to apply to the data table during
3810
+ An optional preprocessing function or lambda to apply to the data table during
3566
3811
  interrogation. This function should take a table as input and return a modified table.
3567
3812
  Have a look at the *Preprocessing* section for more information on how to use this
3568
3813
  argument.
3814
+ segments
3815
+ An optional directive on segmentation, which serves to split a validation step into
3816
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3817
+ column name and its corresponding values to segment on, or a combination of both
3818
+ (provided as a list). Read the *Segmentation* section for usage information.
3569
3819
  thresholds
3570
3820
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3571
3821
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3628,6 +3878,42 @@ class Validate:
3628
3878
  lifetime of the transformed table, it only exists during the validation step and is not
3629
3879
  stored in the `Validate` object or used in subsequent validation steps.
3630
3880
 
3881
+ Segmentation
3882
+ ------------
3883
+ The `segments=` argument allows for the segmentation of a validation step into multiple
3884
+ segments. This is useful for applying the same validation step to different subsets of the
3885
+ data. The segmentation can be done based on a single column or specific fields within a
3886
+ column.
3887
+
3888
+ Providing a single column name will result in a separate validation step for each unique
3889
+ value in that column. For example, if you have a column called `"region"` with values
3890
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
3891
+ region.
3892
+
3893
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
3894
+ values to segment on. For example, if you have a column called `"date"` and you want to
3895
+ segment on only specific dates, you can provide a tuple like
3896
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
3897
+ (i.e., no validation steps will be created for them).
3898
+
3899
+ A list with a combination of column names and tuples can be provided as well. This allows
3900
+ for more complex segmentation scenarios. The following inputs are all valid:
3901
+
3902
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3903
+ in the `"region"` column and specific dates in the `"date"` column
3904
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3905
+ columns
3906
+
3907
+ The segmentation is performed during interrogation, and the resulting validation steps will
3908
+ be numbered sequentially. Each segment will have its own validation step, and the results
3909
+ will be reported separately. This allows for a more granular analysis of the data and helps
3910
+ identify issues within specific segments.
3911
+
3912
+ Importantly, the segmentation process will be performed after any preprocessing of the data
3913
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
3914
+ that can be used for segmentation. For example, you could create a new column called
3915
+ `"segment"` through use of `pre=` and then use that column for segmentation.
3916
+
3631
3917
  Thresholds
3632
3918
  ----------
3633
3919
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3726,6 +4012,8 @@ class Validate:
3726
4012
  _check_column(column=columns)
3727
4013
  # _check_value_float_int(value=value)
3728
4014
  _check_pre(pre=pre)
4015
+ # TODO: add check for segments
4016
+ # _check_segments(segments=segments)
3729
4017
  _check_thresholds(thresholds=thresholds)
3730
4018
  _check_boolean_input(param=na_pass, param_name="na_pass")
3731
4019
  _check_boolean_input(param=active, param_name="active")
@@ -3758,6 +4046,7 @@ class Validate:
3758
4046
  values=value,
3759
4047
  na_pass=na_pass,
3760
4048
  pre=pre,
4049
+ segments=segments,
3761
4050
  thresholds=thresholds,
3762
4051
  actions=actions,
3763
4052
  brief=brief,
@@ -3776,6 +4065,7 @@ class Validate:
3776
4065
  inclusive: tuple[bool, bool] = (True, True),
3777
4066
  na_pass: bool = False,
3778
4067
  pre: Callable | None = None,
4068
+ segments: SegmentSpec | None = None,
3779
4069
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
3780
4070
  actions: Actions | None = None,
3781
4071
  brief: str | bool | None = None,
@@ -3817,10 +4107,15 @@ class Validate:
3817
4107
  Should any encountered None, NA, or Null values be considered as passing test units? By
3818
4108
  default, this is `False`. Set to `True` to pass test units with missing values.
3819
4109
  pre
3820
- A optional preprocessing function or lambda to apply to the data table during
4110
+ An optional preprocessing function or lambda to apply to the data table during
3821
4111
  interrogation. This function should take a table as input and return a modified table.
3822
4112
  Have a look at the *Preprocessing* section for more information on how to use this
3823
4113
  argument.
4114
+ segments
4115
+ An optional directive on segmentation, which serves to split a validation step into
4116
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4117
+ column name and its corresponding values to segment on, or a combination of both
4118
+ (provided as a list). Read the *Segmentation* section for usage information.
3824
4119
  thresholds
3825
4120
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
3826
4121
  The thresholds are set at the step level and will override any global thresholds set in
@@ -3885,6 +4180,42 @@ class Validate:
3885
4180
  lifetime of the transformed table, it only exists during the validation step and is not
3886
4181
  stored in the `Validate` object or used in subsequent validation steps.
3887
4182
 
4183
+ Segmentation
4184
+ ------------
4185
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4186
+ segments. This is useful for applying the same validation step to different subsets of the
4187
+ data. The segmentation can be done based on a single column or specific fields within a
4188
+ column.
4189
+
4190
+ Providing a single column name will result in a separate validation step for each unique
4191
+ value in that column. For example, if you have a column called `"region"` with values
4192
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4193
+ region.
4194
+
4195
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4196
+ values to segment on. For example, if you have a column called `"date"` and you want to
4197
+ segment on only specific dates, you can provide a tuple like
4198
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4199
+ (i.e., no validation steps will be created for them).
4200
+
4201
+ A list with a combination of column names and tuples can be provided as well. This allows
4202
+ for more complex segmentation scenarios. The following inputs are all valid:
4203
+
4204
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4205
+ in the `"region"` column and specific dates in the `"date"` column
4206
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4207
+ columns
4208
+
4209
+ The segmentation is performed during interrogation, and the resulting validation steps will
4210
+ be numbered sequentially. Each segment will have its own validation step, and the results
4211
+ will be reported separately. This allows for a more granular analysis of the data and helps
4212
+ identify issues within specific segments.
4213
+
4214
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4215
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4216
+ that can be used for segmentation. For example, you could create a new column called
4217
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4218
+
3888
4219
  Thresholds
3889
4220
  ----------
3890
4221
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3992,6 +4323,8 @@ class Validate:
3992
4323
  # _check_value_float_int(value=left)
3993
4324
  # _check_value_float_int(value=right)
3994
4325
  _check_pre(pre=pre)
4326
+ # TODO: add check for segments
4327
+ # _check_segments(segments=segments)
3995
4328
  _check_thresholds(thresholds=thresholds)
3996
4329
  _check_boolean_input(param=na_pass, param_name="na_pass")
3997
4330
  _check_boolean_input(param=active, param_name="active")
@@ -4029,6 +4362,7 @@ class Validate:
4029
4362
  inclusive=inclusive,
4030
4363
  na_pass=na_pass,
4031
4364
  pre=pre,
4365
+ segments=segments,
4032
4366
  thresholds=thresholds,
4033
4367
  actions=actions,
4034
4368
  brief=brief,
@@ -4047,6 +4381,7 @@ class Validate:
4047
4381
  inclusive: tuple[bool, bool] = (True, True),
4048
4382
  na_pass: bool = False,
4049
4383
  pre: Callable | None = None,
4384
+ segments: SegmentSpec | None = None,
4050
4385
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4051
4386
  actions: Actions | None = None,
4052
4387
  brief: str | bool | None = None,
@@ -4088,10 +4423,15 @@ class Validate:
4088
4423
  Should any encountered None, NA, or Null values be considered as passing test units? By
4089
4424
  default, this is `False`. Set to `True` to pass test units with missing values.
4090
4425
  pre
4091
- A optional preprocessing function or lambda to apply to the data table during
4426
+ An optional preprocessing function or lambda to apply to the data table during
4092
4427
  interrogation. This function should take a table as input and return a modified table.
4093
4428
  Have a look at the *Preprocessing* section for more information on how to use this
4094
4429
  argument.
4430
+ segments
4431
+ An optional directive on segmentation, which serves to split a validation step into
4432
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4433
+ column name and its corresponding values to segment on, or a combination of both
4434
+ (provided as a list). Read the *Segmentation* section for usage information.
4095
4435
  thresholds
4096
4436
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4097
4437
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4156,6 +4496,42 @@ class Validate:
4156
4496
  lifetime of the transformed table, it only exists during the validation step and is not
4157
4497
  stored in the `Validate` object or used in subsequent validation steps.
4158
4498
 
4499
+ Segmentation
4500
+ ------------
4501
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4502
+ segments. This is useful for applying the same validation step to different subsets of the
4503
+ data. The segmentation can be done based on a single column or specific fields within a
4504
+ column.
4505
+
4506
+ Providing a single column name will result in a separate validation step for each unique
4507
+ value in that column. For example, if you have a column called `"region"` with values
4508
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4509
+ region.
4510
+
4511
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4512
+ values to segment on. For example, if you have a column called `"date"` and you want to
4513
+ segment on only specific dates, you can provide a tuple like
4514
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4515
+ (i.e., no validation steps will be created for them).
4516
+
4517
+ A list with a combination of column names and tuples can be provided as well. This allows
4518
+ for more complex segmentation scenarios. The following inputs are all valid:
4519
+
4520
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4521
+ in the `"region"` column and specific dates in the `"date"` column
4522
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4523
+ columns
4524
+
4525
+ The segmentation is performed during interrogation, and the resulting validation steps will
4526
+ be numbered sequentially. Each segment will have its own validation step, and the results
4527
+ will be reported separately. This allows for a more granular analysis of the data and helps
4528
+ identify issues within specific segments.
4529
+
4530
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4531
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4532
+ that can be used for segmentation. For example, you could create a new column called
4533
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4534
+
4159
4535
  Thresholds
4160
4536
  ----------
4161
4537
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4263,6 +4639,8 @@ class Validate:
4263
4639
  # _check_value_float_int(value=left)
4264
4640
  # _check_value_float_int(value=right)
4265
4641
  _check_pre(pre=pre)
4642
+ # TODO: add check for segments
4643
+ # _check_segments(segments=segments)
4266
4644
  _check_thresholds(thresholds=thresholds)
4267
4645
  _check_boolean_input(param=na_pass, param_name="na_pass")
4268
4646
  _check_boolean_input(param=active, param_name="active")
@@ -4300,6 +4678,7 @@ class Validate:
4300
4678
  inclusive=inclusive,
4301
4679
  na_pass=na_pass,
4302
4680
  pre=pre,
4681
+ segments=segments,
4303
4682
  thresholds=thresholds,
4304
4683
  actions=actions,
4305
4684
  brief=brief,
@@ -4315,6 +4694,7 @@ class Validate:
4315
4694
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4316
4695
  set: Collection[Any],
4317
4696
  pre: Callable | None = None,
4697
+ segments: SegmentSpec | None = None,
4318
4698
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4319
4699
  actions: Actions | None = None,
4320
4700
  brief: str | bool | None = None,
@@ -4338,10 +4718,15 @@ class Validate:
4338
4718
  set
4339
4719
  A list of values to compare against.
4340
4720
  pre
4341
- A optional preprocessing function or lambda to apply to the data table during
4721
+ An optional preprocessing function or lambda to apply to the data table during
4342
4722
  interrogation. This function should take a table as input and return a modified table.
4343
4723
  Have a look at the *Preprocessing* section for more information on how to use this
4344
4724
  argument.
4725
+ segments
4726
+ An optional directive on segmentation, which serves to split a validation step into
4727
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4728
+ column name and its corresponding values to segment on, or a combination of both
4729
+ (provided as a list). Read the *Segmentation* section for usage information.
4345
4730
  thresholds
4346
4731
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4347
4732
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4383,6 +4768,42 @@ class Validate:
4383
4768
  only exists during the validation step and is not stored in the `Validate` object or used in
4384
4769
  subsequent validation steps.
4385
4770
 
4771
+ Segmentation
4772
+ ------------
4773
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4774
+ segments. This is useful for applying the same validation step to different subsets of the
4775
+ data. The segmentation can be done based on a single column or specific fields within a
4776
+ column.
4777
+
4778
+ Providing a single column name will result in a separate validation step for each unique
4779
+ value in that column. For example, if you have a column called `"region"` with values
4780
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4781
+ region.
4782
+
4783
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4784
+ values to segment on. For example, if you have a column called `"date"` and you want to
4785
+ segment on only specific dates, you can provide a tuple like
4786
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4787
+ (i.e., no validation steps will be created for them).
4788
+
4789
+ A list with a combination of column names and tuples can be provided as well. This allows
4790
+ for more complex segmentation scenarios. The following inputs are all valid:
4791
+
4792
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4793
+ in the `"region"` column and specific dates in the `"date"` column
4794
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4795
+ columns
4796
+
4797
+ The segmentation is performed during interrogation, and the resulting validation steps will
4798
+ be numbered sequentially. Each segment will have its own validation step, and the results
4799
+ will be reported separately. This allows for a more granular analysis of the data and helps
4800
+ identify issues within specific segments.
4801
+
4802
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4803
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4804
+ that can be used for segmentation. For example, you could create a new column called
4805
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4806
+
4386
4807
  Thresholds
4387
4808
  ----------
4388
4809
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4481,6 +4902,8 @@ class Validate:
4481
4902
  raise ValueError("`set=` must be a list of floats, integers, or strings.")
4482
4903
 
4483
4904
  _check_pre(pre=pre)
4905
+ # TODO: add check for segments
4906
+ # _check_segments(segments=segments)
4484
4907
  _check_thresholds(thresholds=thresholds)
4485
4908
  _check_boolean_input(param=active, param_name="active")
4486
4909
 
@@ -4508,6 +4931,7 @@ class Validate:
4508
4931
  column=column,
4509
4932
  values=set,
4510
4933
  pre=pre,
4934
+ segments=segments,
4511
4935
  thresholds=thresholds,
4512
4936
  actions=actions,
4513
4937
  brief=brief,
@@ -4523,6 +4947,7 @@ class Validate:
4523
4947
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4524
4948
  set: list[float | int],
4525
4949
  pre: Callable | None = None,
4950
+ segments: SegmentSpec | None = None,
4526
4951
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4527
4952
  actions: Actions | None = None,
4528
4953
  brief: str | bool | None = None,
@@ -4546,10 +4971,15 @@ class Validate:
4546
4971
  set
4547
4972
  A list of values to compare against.
4548
4973
  pre
4549
- A optional preprocessing function or lambda to apply to the data table during
4974
+ An optional preprocessing function or lambda to apply to the data table during
4550
4975
  interrogation. This function should take a table as input and return a modified table.
4551
4976
  Have a look at the *Preprocessing* section for more information on how to use this
4552
4977
  argument.
4978
+ segments
4979
+ An optional directive on segmentation, which serves to split a validation step into
4980
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4981
+ column name and its corresponding values to segment on, or a combination of both
4982
+ (provided as a list). Read the *Segmentation* section for usage information.
4553
4983
  thresholds
4554
4984
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4555
4985
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4591,6 +5021,42 @@ class Validate:
4591
5021
  only exists during the validation step and is not stored in the `Validate` object or used in
4592
5022
  subsequent validation steps.
4593
5023
 
5024
+ Segmentation
5025
+ ------------
5026
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5027
+ segments. This is useful for applying the same validation step to different subsets of the
5028
+ data. The segmentation can be done based on a single column or specific fields within a
5029
+ column.
5030
+
5031
+ Providing a single column name will result in a separate validation step for each unique
5032
+ value in that column. For example, if you have a column called `"region"` with values
5033
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5034
+ region.
5035
+
5036
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5037
+ values to segment on. For example, if you have a column called `"date"` and you want to
5038
+ segment on only specific dates, you can provide a tuple like
5039
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5040
+ (i.e., no validation steps will be created for them).
5041
+
5042
+ A list with a combination of column names and tuples can be provided as well. This allows
5043
+ for more complex segmentation scenarios. The following inputs are all valid:
5044
+
5045
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5046
+ in the `"region"` column and specific dates in the `"date"` column
5047
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5048
+ columns
5049
+
5050
+ The segmentation is performed during interrogation, and the resulting validation steps will
5051
+ be numbered sequentially. Each segment will have its own validation step, and the results
5052
+ will be reported separately. This allows for a more granular analysis of the data and helps
5053
+ identify issues within specific segments.
5054
+
5055
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5056
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5057
+ that can be used for segmentation. For example, you could create a new column called
5058
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5059
+
4594
5060
  Thresholds
4595
5061
  ----------
4596
5062
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4684,6 +5150,8 @@ class Validate:
4684
5150
  _check_column(column=columns)
4685
5151
  _check_set_types(set=set)
4686
5152
  _check_pre(pre=pre)
5153
+ # TODO: add check for segments
5154
+ # _check_segments(segments=segments)
4687
5155
  _check_thresholds(thresholds=thresholds)
4688
5156
  _check_boolean_input(param=active, param_name="active")
4689
5157
 
@@ -4711,6 +5179,7 @@ class Validate:
4711
5179
  column=column,
4712
5180
  values=set,
4713
5181
  pre=pre,
5182
+ segments=segments,
4714
5183
  thresholds=thresholds,
4715
5184
  actions=actions,
4716
5185
  brief=brief,
@@ -4725,6 +5194,7 @@ class Validate:
4725
5194
  self,
4726
5195
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4727
5196
  pre: Callable | None = None,
5197
+ segments: SegmentSpec | None = None,
4728
5198
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4729
5199
  actions: Actions | None = None,
4730
5200
  brief: str | bool | None = None,
@@ -4745,10 +5215,15 @@ class Validate:
4745
5215
  multiple columns are supplied or resolved, there will be a separate validation step
4746
5216
  generated for each column.
4747
5217
  pre
4748
- A optional preprocessing function or lambda to apply to the data table during
5218
+ An optional preprocessing function or lambda to apply to the data table during
4749
5219
  interrogation. This function should take a table as input and return a modified table.
4750
5220
  Have a look at the *Preprocessing* section for more information on how to use this
4751
5221
  argument.
5222
+ segments
5223
+ An optional directive on segmentation, which serves to split a validation step into
5224
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5225
+ column name and its corresponding values to segment on, or a combination of both
5226
+ (provided as a list). Read the *Segmentation* section for usage information.
4752
5227
  thresholds
4753
5228
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4754
5229
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4790,6 +5265,42 @@ class Validate:
4790
5265
  only exists during the validation step and is not stored in the `Validate` object or used in
4791
5266
  subsequent validation steps.
4792
5267
 
5268
+ Segmentation
5269
+ ------------
5270
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5271
+ segments. This is useful for applying the same validation step to different subsets of the
5272
+ data. The segmentation can be done based on a single column or specific fields within a
5273
+ column.
5274
+
5275
+ Providing a single column name will result in a separate validation step for each unique
5276
+ value in that column. For example, if you have a column called `"region"` with values
5277
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5278
+ region.
5279
+
5280
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5281
+ values to segment on. For example, if you have a column called `"date"` and you want to
5282
+ segment on only specific dates, you can provide a tuple like
5283
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5284
+ (i.e., no validation steps will be created for them).
5285
+
5286
+ A list with a combination of column names and tuples can be provided as well. This allows
5287
+ for more complex segmentation scenarios. The following inputs are all valid:
5288
+
5289
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5290
+ in the `"region"` column and specific dates in the `"date"` column
5291
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5292
+ columns
5293
+
5294
+ The segmentation is performed during interrogation, and the resulting validation steps will
5295
+ be numbered sequentially. Each segment will have its own validation step, and the results
5296
+ will be reported separately. This allows for a more granular analysis of the data and helps
5297
+ identify issues within specific segments.
5298
+
5299
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5300
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5301
+ that can be used for segmentation. For example, you could create a new column called
5302
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5303
+
4793
5304
  Thresholds
4794
5305
  ----------
4795
5306
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4879,6 +5390,8 @@ class Validate:
4879
5390
 
4880
5391
  _check_column(column=columns)
4881
5392
  _check_pre(pre=pre)
5393
+ # TODO: add check for segments
5394
+ # _check_segments(segments=segments)
4882
5395
  _check_thresholds(thresholds=thresholds)
4883
5396
  _check_boolean_input(param=active, param_name="active")
4884
5397
 
@@ -4905,6 +5418,7 @@ class Validate:
4905
5418
  assertion_type=assertion_type,
4906
5419
  column=column,
4907
5420
  pre=pre,
5421
+ segments=segments,
4908
5422
  thresholds=thresholds,
4909
5423
  actions=actions,
4910
5424
  brief=brief,
@@ -4919,6 +5433,7 @@ class Validate:
4919
5433
  self,
4920
5434
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
4921
5435
  pre: Callable | None = None,
5436
+ segments: SegmentSpec | None = None,
4922
5437
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
4923
5438
  actions: Actions | None = None,
4924
5439
  brief: str | bool | None = None,
@@ -4939,10 +5454,15 @@ class Validate:
4939
5454
  multiple columns are supplied or resolved, there will be a separate validation step
4940
5455
  generated for each column.
4941
5456
  pre
4942
- A optional preprocessing function or lambda to apply to the data table during
5457
+ An optional preprocessing function or lambda to apply to the data table during
4943
5458
  interrogation. This function should take a table as input and return a modified table.
4944
5459
  Have a look at the *Preprocessing* section for more information on how to use this
4945
5460
  argument.
5461
+ segments
5462
+ An optional directive on segmentation, which serves to split a validation step into
5463
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5464
+ column name and its corresponding values to segment on, or a combination of both
5465
+ (provided as a list). Read the *Segmentation* section for usage information.
4946
5466
  thresholds
4947
5467
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4948
5468
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4984,6 +5504,42 @@ class Validate:
4984
5504
  only exists during the validation step and is not stored in the `Validate` object or used in
4985
5505
  subsequent validation steps.
4986
5506
 
5507
+ Segmentation
5508
+ ------------
5509
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5510
+ segments. This is useful for applying the same validation step to different subsets of the
5511
+ data. The segmentation can be done based on a single column or specific fields within a
5512
+ column.
5513
+
5514
+ Providing a single column name will result in a separate validation step for each unique
5515
+ value in that column. For example, if you have a column called `"region"` with values
5516
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5517
+ region.
5518
+
5519
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5520
+ values to segment on. For example, if you have a column called `"date"` and you want to
5521
+ segment on only specific dates, you can provide a tuple like
5522
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5523
+ (i.e., no validation steps will be created for them).
5524
+
5525
+ A list with a combination of column names and tuples can be provided as well. This allows
5526
+ for more complex segmentation scenarios. The following inputs are all valid:
5527
+
5528
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5529
+ in the `"region"` column and specific dates in the `"date"` column
5530
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5531
+ columns
5532
+
5533
+ The segmentation is performed during interrogation, and the resulting validation steps will
5534
+ be numbered sequentially. Each segment will have its own validation step, and the results
5535
+ will be reported separately. This allows for a more granular analysis of the data and helps
5536
+ identify issues within specific segments.
5537
+
5538
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5539
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5540
+ that can be used for segmentation. For example, you could create a new column called
5541
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5542
+
4987
5543
  Thresholds
4988
5544
  ----------
4989
5545
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5073,6 +5629,8 @@ class Validate:
5073
5629
 
5074
5630
  _check_column(column=columns)
5075
5631
  _check_pre(pre=pre)
5632
+ # TODO: add check for segments
5633
+ # _check_segments(segments=segments)
5076
5634
  _check_thresholds(thresholds=thresholds)
5077
5635
  _check_boolean_input(param=active, param_name="active")
5078
5636
 
@@ -5099,6 +5657,7 @@ class Validate:
5099
5657
  assertion_type=assertion_type,
5100
5658
  column=column,
5101
5659
  pre=pre,
5660
+ segments=segments,
5102
5661
  thresholds=thresholds,
5103
5662
  actions=actions,
5104
5663
  brief=brief,
@@ -5115,6 +5674,7 @@ class Validate:
5115
5674
  pattern: str,
5116
5675
  na_pass: bool = False,
5117
5676
  pre: Callable | None = None,
5677
+ segments: SegmentSpec | None = None,
5118
5678
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5119
5679
  actions: Actions | None = None,
5120
5680
  brief: str | bool | None = None,
@@ -5141,10 +5701,15 @@ class Validate:
5141
5701
  Should any encountered None, NA, or Null values be considered as passing test units? By
5142
5702
  default, this is `False`. Set to `True` to pass test units with missing values.
5143
5703
  pre
5144
- A optional preprocessing function or lambda to apply to the data table during
5704
+ An optional preprocessing function or lambda to apply to the data table during
5145
5705
  interrogation. This function should take a table as input and return a modified table.
5146
5706
  Have a look at the *Preprocessing* section for more information on how to use this
5147
5707
  argument.
5708
+ segments
5709
+ An optional directive on segmentation, which serves to split a validation step into
5710
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5711
+ column name and its corresponding values to segment on, or a combination of both
5712
+ (provided as a list). Read the *Segmentation* section for usage information.
5148
5713
  thresholds
5149
5714
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5150
5715
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5186,6 +5751,42 @@ class Validate:
5186
5751
  only exists during the validation step and is not stored in the `Validate` object or used in
5187
5752
  subsequent validation steps.
5188
5753
 
5754
+ Segmentation
5755
+ ------------
5756
+ The `segments=` argument allows for the segmentation of a validation step into multiple
5757
+ segments. This is useful for applying the same validation step to different subsets of the
5758
+ data. The segmentation can be done based on a single column or specific fields within a
5759
+ column.
5760
+
5761
+ Providing a single column name will result in a separate validation step for each unique
5762
+ value in that column. For example, if you have a column called `"region"` with values
5763
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
5764
+ region.
5765
+
5766
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
5767
+ values to segment on. For example, if you have a column called `"date"` and you want to
5768
+ segment on only specific dates, you can provide a tuple like
5769
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
5770
+ (i.e., no validation steps will be created for them).
5771
+
5772
+ A list with a combination of column names and tuples can be provided as well. This allows
5773
+ for more complex segmentation scenarios. The following inputs are all valid:
5774
+
5775
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5776
+ in the `"region"` column and specific dates in the `"date"` column
5777
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5778
+ columns
5779
+
5780
+ The segmentation is performed during interrogation, and the resulting validation steps will
5781
+ be numbered sequentially. Each segment will have its own validation step, and the results
5782
+ will be reported separately. This allows for a more granular analysis of the data and helps
5783
+ identify issues within specific segments.
5784
+
5785
+ Importantly, the segmentation process will be performed after any preprocessing of the data
5786
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
5787
+ that can be used for segmentation. For example, you could create a new column called
5788
+ `"segment"` through use of `pre=` and then use that column for segmentation.
5789
+
5189
5790
  Thresholds
5190
5791
  ----------
5191
5792
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5277,6 +5878,8 @@ class Validate:
5277
5878
 
5278
5879
  _check_column(column=columns)
5279
5880
  _check_pre(pre=pre)
5881
+ # TODO: add check for segments
5882
+ # _check_segments(segments=segments)
5280
5883
  _check_thresholds(thresholds=thresholds)
5281
5884
  _check_boolean_input(param=na_pass, param_name="na_pass")
5282
5885
  _check_boolean_input(param=active, param_name="active")
@@ -5306,6 +5909,7 @@ class Validate:
5306
5909
  values=pattern,
5307
5910
  na_pass=na_pass,
5308
5911
  pre=pre,
5912
+ segments=segments,
5309
5913
  thresholds=thresholds,
5310
5914
  actions=actions,
5311
5915
  brief=brief,
@@ -5320,6 +5924,7 @@ class Validate:
5320
5924
  self,
5321
5925
  expr: any,
5322
5926
  pre: Callable | None = None,
5927
+ segments: SegmentSpec | None = None,
5323
5928
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5324
5929
  actions: Actions | None = None,
5325
5930
  brief: str | bool | None = None,
@@ -5341,10 +5946,15 @@ class Validate:
5341
5946
  be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
5342
5947
  should either be a lambda expression or a Narwhals column expression.
5343
5948
  pre
5344
- A optional preprocessing function or lambda to apply to the data table during
5949
+ An optional preprocessing function or lambda to apply to the data table during
5345
5950
  interrogation. This function should take a table as input and return a modified table.
5346
5951
  Have a look at the *Preprocessing* section for more information on how to use this
5347
5952
  argument.
5953
+ segments
5954
+ An optional directive on segmentation, which serves to split a validation step into
5955
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
5956
+ column name and its corresponding values to segment on, or a combination of both
5957
+ (provided as a list). Read the *Segmentation* section for usage information.
5348
5958
  thresholds
5349
5959
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5350
5960
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5384,6 +5994,42 @@ class Validate:
5384
5994
  transformed table, it only exists during the validation step and is not stored in the
5385
5995
  `Validate` object or used in subsequent validation steps.
5386
5996
 
5997
+ Segmentation
5998
+ ------------
5999
+ The `segments=` argument allows for the segmentation of a validation step into multiple
6000
+ segments. This is useful for applying the same validation step to different subsets of the
6001
+ data. The segmentation can be done based on a single column or specific fields within a
6002
+ column.
6003
+
6004
+ Providing a single column name will result in a separate validation step for each unique
6005
+ value in that column. For example, if you have a column called `"region"` with values
6006
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
6007
+ region.
6008
+
6009
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
6010
+ values to segment on. For example, if you have a column called `"date"` and you want to
6011
+ segment on only specific dates, you can provide a tuple like
6012
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
6013
+ (i.e., no validation steps will be created for them).
6014
+
6015
+ A list with a combination of column names and tuples can be provided as well. This allows
6016
+ for more complex segmentation scenarios. The following inputs are all valid:
6017
+
6018
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6019
+ in the `"region"` column and specific dates in the `"date"` column
6020
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6021
+ columns
6022
+
6023
+ The segmentation is performed during interrogation, and the resulting validation steps will
6024
+ be numbered sequentially. Each segment will have its own validation step, and the results
6025
+ will be reported separately. This allows for a more granular analysis of the data and helps
6026
+ identify issues within specific segments.
6027
+
6028
+ Importantly, the segmentation process will be performed after any preprocessing of the data
6029
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
6030
+ that can be used for segmentation. For example, you could create a new column called
6031
+ `"segment"` through use of `pre=` and then use that column for segmentation.
6032
+
5387
6033
  Thresholds
5388
6034
  ----------
5389
6035
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5461,6 +6107,8 @@ class Validate:
5461
6107
  # TODO: Add a check for the expression to ensure it's a valid expression object
5462
6108
  # _check_expr(expr=expr)
5463
6109
  _check_pre(pre=pre)
6110
+ # TODO: add check for segments
6111
+ # _check_segments(segments=segments)
5464
6112
  _check_thresholds(thresholds=thresholds)
5465
6113
  _check_boolean_input(param=active, param_name="active")
5466
6114
 
@@ -5477,6 +6125,7 @@ class Validate:
5477
6125
  column=None,
5478
6126
  values=expr,
5479
6127
  pre=pre,
6128
+ segments=segments,
5480
6129
  thresholds=thresholds,
5481
6130
  actions=actions,
5482
6131
  brief=brief,
@@ -5665,6 +6314,7 @@ class Validate:
5665
6314
  self,
5666
6315
  columns_subset: str | list[str] | None = None,
5667
6316
  pre: Callable | None = None,
6317
+ segments: SegmentSpec | None = None,
5668
6318
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
5669
6319
  actions: Actions | None = None,
5670
6320
  brief: str | bool | None = None,
@@ -5685,10 +6335,15 @@ class Validate:
5685
6335
  columns are supplied, the distinct comparison will be made over the combination of
5686
6336
  values in those columns.
5687
6337
  pre
5688
- A optional preprocessing function or lambda to apply to the data table during
6338
+ An optional preprocessing function or lambda to apply to the data table during
5689
6339
  interrogation. This function should take a table as input and return a modified table.
5690
6340
  Have a look at the *Preprocessing* section for more information on how to use this
5691
6341
  argument.
6342
+ segments
6343
+ An optional directive on segmentation, which serves to split a validation step into
6344
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
6345
+ column name and its corresponding values to segment on, or a combination of both
6346
+ (provided as a list). Read the *Segmentation* section for usage information.
5692
6347
  thresholds
5693
6348
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
5694
6349
  The thresholds are set at the step level and will override any global thresholds set in
@@ -5730,6 +6385,42 @@ class Validate:
5730
6385
  table, it only exists during the validation step and is not stored in the `Validate` object
5731
6386
  or used in subsequent validation steps.
5732
6387
 
6388
+ Segmentation
6389
+ ------------
6390
+ The `segments=` argument allows for the segmentation of a validation step into multiple
6391
+ segments. This is useful for applying the same validation step to different subsets of the
6392
+ data. The segmentation can be done based on a single column or specific fields within a
6393
+ column.
6394
+
6395
+ Providing a single column name will result in a separate validation step for each unique
6396
+ value in that column. For example, if you have a column called `"region"` with values
6397
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
6398
+ region.
6399
+
6400
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
6401
+ values to segment on. For example, if you have a column called `"date"` and you want to
6402
+ segment on only specific dates, you can provide a tuple like
6403
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
6404
+ (i.e., no validation steps will be created for them).
6405
+
6406
+ A list with a combination of column names and tuples can be provided as well. This allows
6407
+ for more complex segmentation scenarios. The following inputs are all valid:
6408
+
6409
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6410
+ in the `"region"` column and specific dates in the `"date"` column
6411
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6412
+ columns
6413
+
6414
+ The segmentation is performed during interrogation, and the resulting validation steps will
6415
+ be numbered sequentially. Each segment will have its own validation step, and the results
6416
+ will be reported separately. This allows for a more granular analysis of the data and helps
6417
+ identify issues within specific segments.
6418
+
6419
+ Importantly, the segmentation process will be performed after any preprocessing of the data
6420
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
6421
+ that can be used for segmentation. For example, you could create a new column called
6422
+ `"segment"` through use of `pre=` and then use that column for segmentation.
6423
+
5733
6424
  Thresholds
5734
6425
  ----------
5735
6426
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5823,6 +6514,8 @@ class Validate:
5823
6514
  assertion_type = _get_fn_name()
5824
6515
 
5825
6516
  _check_pre(pre=pre)
6517
+ # TODO: add check for segments
6518
+ # _check_segments(segments=segments)
5826
6519
  _check_thresholds(thresholds=thresholds)
5827
6520
  _check_boolean_input(param=active, param_name="active")
5828
6521
 
@@ -5843,6 +6536,244 @@ class Validate:
5843
6536
  assertion_type=assertion_type,
5844
6537
  column=columns_subset,
5845
6538
  pre=pre,
6539
+ segments=segments,
6540
+ thresholds=thresholds,
6541
+ actions=actions,
6542
+ brief=brief,
6543
+ active=active,
6544
+ )
6545
+
6546
+ self._add_validation(validation_info=val_info)
6547
+
6548
+ return self
6549
+
6550
+ def rows_complete(
6551
+ self,
6552
+ columns_subset: str | list[str] | None = None,
6553
+ pre: Callable | None = None,
6554
+ segments: SegmentSpec | None = None,
6555
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
6556
+ actions: Actions | None = None,
6557
+ brief: str | bool | None = None,
6558
+ active: bool = True,
6559
+ ) -> Validate:
6560
+ """
6561
+ Validate whether row data are complete by having no missing values.
6562
+
6563
+ The `rows_complete()` method checks whether rows in the table are complete. Completeness
6564
+ of a row means that there are no missing values within the row. This validation will operate
6565
+ over the number of test units that is equal to the number of rows in the table (determined
6566
+ after any `pre=` mutation has been applied). A subset of columns can be specified for the
6567
+ completeness check. If no subset is provided, all columns in the table will be used.
6568
+
6569
+ Parameters
6570
+ ----------
6571
+ columns_subset
6572
+ A single column or a list of columns to use as a subset for the completeness check. If
6573
+ `None` (the default), then all columns in the table will be used.
6574
+ pre
6575
+ An optional preprocessing function or lambda to apply to the data table during
6576
+ interrogation. This function should take a table as input and return a modified table.
6577
+ Have a look at the *Preprocessing* section for more information on how to use this
6578
+ argument.
6579
+ segments
6580
+ An optional directive on segmentation, which serves to split a validation step into
6581
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
6582
+ column name and its corresponding values to segment on, or a combination of both
6583
+ (provided as a list). Read the *Segmentation* section for usage information.
6584
+ thresholds
6585
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
6586
+ The thresholds are set at the step level and will override any global thresholds set in
6587
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
6588
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
6589
+ section for information on how to set threshold levels.
6590
+ actions
6591
+ Optional actions to take when the validation step meets or exceeds any set threshold
6592
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6593
+ define the actions.
6594
+ brief
6595
+ An optional brief description of the validation step that will be displayed in the
6596
+ reporting table. You can use the templating elements like `"{step}"` to insert
6597
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6598
+ the entire brief will be automatically generated. If `None` (the default) then there
6599
+ won't be a brief.
6600
+ active
6601
+ A boolean value indicating whether the validation step should be active. Using `False`
6602
+ will make the validation step inactive (still reporting its presence and keeping indexes
6603
+ for the steps unchanged).
6604
+
6605
+ Returns
6606
+ -------
6607
+ Validate
6608
+ The `Validate` object with the added validation step.
6609
+
6610
+ Preprocessing
6611
+ -------------
6612
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
6613
+ table during interrogation. This function should take a table as input and return a modified
6614
+ table. This is useful for performing any necessary transformations or filtering on the data
6615
+ before the validation step is applied.
6616
+
6617
+ The preprocessing function can be any callable that takes a table as input and returns a
6618
+ modified table. For example, you could use a lambda function to filter the table based on
6619
+ certain criteria or to apply a transformation to the data. Note that you can refer to
6620
+ columns via `columns_subset=` that are expected to be present in the transformed table, but
6621
+ may not exist in the table before preprocessing. Regarding the lifetime of the transformed
6622
+ table, it only exists during the validation step and is not stored in the `Validate` object
6623
+ or used in subsequent validation steps.
6624
+
6625
+ Segmentation
6626
+ ------------
6627
+ The `segments=` argument allows for the segmentation of a validation step into multiple
6628
+ segments. This is useful for applying the same validation step to different subsets of the
6629
+ data. The segmentation can be done based on a single column or specific fields within a
6630
+ column.
6631
+
6632
+ Providing a single column name will result in a separate validation step for each unique
6633
+ value in that column. For example, if you have a column called `"region"` with values
6634
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
6635
+ region.
6636
+
6637
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
6638
+ values to segment on. For example, if you have a column called `"date"` and you want to
6639
+ segment on only specific dates, you can provide a tuple like
6640
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
6641
+ (i.e., no validation steps will be created for them).
6642
+
6643
+ A list with a combination of column names and tuples can be provided as well. This allows
6644
+ for more complex segmentation scenarios. The following inputs are all valid:
6645
+
6646
+ - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6647
+ in the `"region"` column and specific dates in the `"date"` column
6648
+ - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6649
+ columns
6650
+
6651
+ The segmentation is performed during interrogation, and the resulting validation steps will
6652
+ be numbered sequentially. Each segment will have its own validation step, and the results
6653
+ will be reported separately. This allows for a more granular analysis of the data and helps
6654
+ identify issues within specific segments.
6655
+
6656
+ Importantly, the segmentation process will be performed after any preprocessing of the data
6657
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
6658
+ that can be used for segmentation. For example, you could create a new column called
6659
+ `"segment"` through use of `pre=` and then use that column for segmentation.
6660
+
6661
+ Thresholds
6662
+ ----------
6663
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
6664
+ step. If they are set here at the step level, these thresholds will override any thresholds
6665
+ set at the global level in `Validate(thresholds=...)`.
6666
+
6667
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
6668
+ can either be set as a proportion failing of all test units (a value between `0` to `1`),
6669
+ or, the absolute number of failing test units (as integer that's `1` or greater).
6670
+
6671
+ Thresholds can be defined using one of these input schemes:
6672
+
6673
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
6674
+ thresholds)
6675
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
6676
+ the 'error' level, and position `2` is the 'critical' level
6677
+ 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
6678
+ 'critical'
6679
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
6680
+ for the 'warning' level only
6681
+
6682
+ If the number of failing test units exceeds set thresholds, the validation step will be
6683
+ marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
6684
+ set, you're free to set any combination of them.
6685
+
6686
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
6687
+ take for each level of failure (using the `actions=` parameter).
6688
+
6689
+ Examples
6690
+ --------
6691
+ ```{python}
6692
+ #| echo: false
6693
+ #| output: false
6694
+ import pointblank as pb
6695
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
6696
+ ```
6697
+ For the examples here, we'll use a simple Polars DataFrame with three string columns
6698
+ (`col_1`, `col_2`, and `col_3`). The table is shown below:
6699
+
6700
+ ```{python}
6701
+ import pointblank as pb
6702
+ import polars as pl
6703
+
6704
+ tbl = pl.DataFrame(
6705
+ {
6706
+ "col_1": ["a", None, "c", "d"],
6707
+ "col_2": ["a", "a", "c", None],
6708
+ "col_3": ["a", "a", "d", None],
6709
+ }
6710
+ )
6711
+
6712
+ pb.preview(tbl)
6713
+ ```
6714
+
6715
+ Let's validate that the rows in the table are complete with `rows_complete()`. We'll
6716
+ determine if this validation had any failing test units (there are four test units, one for
6717
+ each row). A failing test units means that a given row is not complete (i.e., has at least
6718
+ one missing value).
6719
+
6720
+ ```{python}
6721
+ validation = (
6722
+ pb.Validate(data=tbl)
6723
+ .rows_complete()
6724
+ .interrogate()
6725
+ )
6726
+
6727
+ validation
6728
+ ```
6729
+
6730
+ From this validation table we see that there are two failing test units. This is because
6731
+ two rows in the table have at least one missing value (the second row and the last row).
6732
+
6733
+ We can also use a subset of columns to determine completeness. Let's specify the subset
6734
+ using columns `col_2` and `col_3` for the next validation.
6735
+
6736
+ ```{python}
6737
+ validation = (
6738
+ pb.Validate(data=tbl)
6739
+ .rows_complete(columns_subset=["col_2", "col_3"])
6740
+ .interrogate()
6741
+ )
6742
+
6743
+ validation
6744
+ ```
6745
+
6746
+ The validation table reports a single failing test units. The last row contains missing
6747
+ values in both the `col_2` and `col_3` columns.
6748
+ others.
6749
+ """
6750
+
6751
+ assertion_type = _get_fn_name()
6752
+
6753
+ _check_pre(pre=pre)
6754
+ # TODO: add check for segments
6755
+ # _check_segments(segments=segments)
6756
+ _check_thresholds(thresholds=thresholds)
6757
+ _check_boolean_input(param=active, param_name="active")
6758
+
6759
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
6760
+ thresholds = (
6761
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
6762
+ )
6763
+
6764
+ if columns_subset is not None and isinstance(columns_subset, str):
6765
+ columns_subset = [columns_subset]
6766
+
6767
+ # TODO: incorporate Column object
6768
+
6769
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
6770
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
6771
+
6772
+ val_info = _ValidationInfo(
6773
+ assertion_type=assertion_type,
6774
+ column=columns_subset,
6775
+ pre=pre,
6776
+ segments=segments,
5846
6777
  thresholds=thresholds,
5847
6778
  actions=actions,
5848
6779
  brief=brief,
@@ -5903,7 +6834,7 @@ class Validate:
5903
6834
  substring matches are allowed, so a schema data type of `Int` would match a target table
5904
6835
  data type of `Int64`.
5905
6836
  pre
5906
- A optional preprocessing function or lambda to apply to the data table during
6837
+ An optional preprocessing function or lambda to apply to the data table during
5907
6838
  interrogation. This function should take a table as input and return a modified table.
5908
6839
  Have a look at the *Preprocessing* section for more information on how to use this
5909
6840
  argument.
@@ -6116,7 +7047,7 @@ class Validate:
6116
7047
  Should the validation step be inverted? If `True`, then the expectation is that the row
6117
7048
  count of the target table should not match the specified `count=` value.
6118
7049
  pre
6119
- A optional preprocessing function or lambda to apply to the data table during
7050
+ An optional preprocessing function or lambda to apply to the data table during
6120
7051
  interrogation. This function should take a table as input and return a modified table.
6121
7052
  Have a look at the *Preprocessing* section for more information on how to use this
6122
7053
  argument.
@@ -6326,7 +7257,7 @@ class Validate:
6326
7257
  Should the validation step be inverted? If `True`, then the expectation is that the
6327
7258
  column count of the target table should not match the specified `count=` value.
6328
7259
  pre
6329
- A optional preprocessing function or lambda to apply to the data table during
7260
+ An optional preprocessing function or lambda to apply to the data table during
6330
7261
  interrogation. This function should take a table as input and return a modified table.
6331
7262
  Have a look at the *Preprocessing* section for more information on how to use this
6332
7263
  argument.
@@ -6844,10 +7775,14 @@ class Validate:
6844
7775
 
6845
7776
  self.time_start = datetime.datetime.now(datetime.timezone.utc)
6846
7777
 
6847
- # Expand `validation_info` by evaluating any column expressions in `column`
7778
+ # Expand `validation_info` by evaluating any column expressions in `columns=`
6848
7779
  # (the `_evaluate_column_exprs()` method will eval and expand as needed)
6849
7780
  self._evaluate_column_exprs(validation_info=self.validation_info)
6850
7781
 
7782
+ # Expand `validation_info` by evaluating for any segmentation directives
7783
+ # provided in `segments=` (the `_evaluate_segments()` method will eval and expand as needed)
7784
+ self._evaluate_segments(validation_info=self.validation_info)
7785
+
6851
7786
  for validation in self.validation_info:
6852
7787
  # Set the `i` value for the validation step (this is 1-indexed)
6853
7788
  index_value = self.validation_info.index(validation) + 1
@@ -6883,6 +7818,10 @@ class Validate:
6883
7818
 
6884
7819
  validation.autobrief = autobrief
6885
7820
 
7821
+ # ------------------------------------------------
7822
+ # Bypassing the validation step if conditions met
7823
+ # ------------------------------------------------
7824
+
6886
7825
  # Skip the validation step if it is not active but still record the time of processing
6887
7826
  if not validation.active:
6888
7827
  end_time = datetime.datetime.now(datetime.timezone.utc)
@@ -6939,6 +7878,17 @@ class Validate:
6939
7878
  elif isinstance(validation.pre, Callable):
6940
7879
  data_tbl_step = validation.pre(data_tbl_step)
6941
7880
 
7881
+ # ------------------------------------------------
7882
+ # Segmentation stage
7883
+ # ------------------------------------------------
7884
+
7885
+ # Determine whether any segmentation directives are to be applied to the table
7886
+
7887
+ if validation.segments is not None:
7888
+ data_tbl_step = _apply_segments(
7889
+ data_tbl=data_tbl_step, segments_expr=validation.segments
7890
+ )
7891
+
6942
7892
  validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
6943
7893
  tbl_type=tbl_type
6944
7894
  )
@@ -7012,6 +7962,14 @@ class Validate:
7012
7962
  tbl_type=tbl_type,
7013
7963
  ).get_test_results()
7014
7964
 
7965
+ if assertion_category == "ROWS_COMPLETE":
7966
+ results_tbl = RowsComplete(
7967
+ data_tbl=data_tbl_step,
7968
+ columns_subset=column,
7969
+ threshold=threshold,
7970
+ tbl_type=tbl_type,
7971
+ ).get_test_results()
7972
+
7015
7973
  if assertion_category == "COL_EXISTS_HAS_TYPE":
7016
7974
  result_bool = ColExistsHasType(
7017
7975
  data_tbl=data_tbl_step,
@@ -7282,7 +8240,8 @@ class Validate:
7282
8240
  # TODO: Add support for extraction of rows for Ibis backends
7283
8241
  if (
7284
8242
  collect_extracts
7285
- and assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_distinct"]
8243
+ and assertion_type
8244
+ in ROW_BASED_VALIDATION_TYPES + ["rows_distinct", "rows_complete"]
7286
8245
  and tbl_type not in IBIS_BACKENDS
7287
8246
  ):
7288
8247
  # Add row numbers to the results table
@@ -8364,19 +9323,134 @@ class Validate:
8364
9323
  """
8365
9324
  Get a report of the validation results as a JSON-formatted string.
8366
9325
 
9326
+ The `get_json_report()` method provides a machine-readable report of validation results in
9327
+ JSON format. This is particularly useful for programmatic processing, storing validation
9328
+ results, or integrating with other systems. The report includes detailed information about
9329
+ each validation step, such as assertion type, columns validated, threshold values, test
9330
+ results, and more.
9331
+
9332
+ By default, all available validation information fields are included in the report. However,
9333
+ you can customize the fields to include or exclude using the `use_fields=` and
9334
+ `exclude_fields=` parameters.
9335
+
8367
9336
  Parameters
8368
9337
  ----------
8369
9338
  use_fields
8370
- A list of fields to include in the report. If `None`, all fields are included.
9339
+ An optional list of specific fields to include in the report. If provided, only these
9340
+ fields will be included in the JSON output. If `None` (the default), all standard
9341
+ validation report fields are included. Have a look at the *Available Report Fields*
9342
+ section below for a list of fields that can be included in the report.
8371
9343
  exclude_fields
8372
- A list of fields to exclude from the report. If `None`, no fields are excluded.
9344
+ An optional list of fields to exclude from the report. If provided, these fields will
9345
+ be omitted from the JSON output. If `None` (the default), no fields are excluded.
9346
+ This parameter cannot be used together with `use_fields=`. The *Available Report Fields*
9347
+ provides a listing of fields that can be excluded from the report.
8373
9348
 
8374
9349
  Returns
8375
9350
  -------
8376
9351
  str
8377
- A JSON-formatted string representing the validation report.
8378
- """
9352
+ A JSON-formatted string representing the validation report, with each validation step
9353
+ as an object in the report array.
9354
+
9355
+ Available Report Fields
9356
+ -----------------------
9357
+ The JSON report can include any of the standard validation report fields, including:
9358
+
9359
+ - `i`: the step number (1-indexed)
9360
+ - `i_o`: the original step index from the validation plan (pre-expansion)
9361
+ - `assertion_type`: the type of validation assertion (e.g., `"col_vals_gt"`, etc.)
9362
+ - `column`: the column being validated (or columns used in certain validations)
9363
+ - `values`: the comparison values or parameters used in the validation
9364
+ - `inclusive`: whether the comparison is inclusive (for range-based validations)
9365
+ - `na_pass`: whether `NA`/`Null` values are considered passing (for certain validations)
9366
+ - `pre`: preprocessing function applied before validation
9367
+ - `segments`: data segments to which the validation was applied
9368
+ - `thresholds`: threshold level statement that was used for the validation step
9369
+ - `label`: custom label for the validation step
9370
+ - `brief`: a brief description of the validation step
9371
+ - `active`: whether the validation step is active
9372
+ - `all_passed`: whether all test units passed in the step
9373
+ - `n`: total number of test units
9374
+ - `n_passed`, `n_failed`: number of test units that passed and failed
9375
+ - `f_passed`, `f_failed`: Fraction of test units that passed and failed
9376
+ - `warning`, `error`, `critical`: whether the namesake threshold level was exceeded (is
9377
+ `null` if threshold not set)
9378
+ - `time_processed`: when the validation step was processed (ISO 8601 format)
9379
+ - `proc_duration_s`: the processing duration in seconds
9380
+
9381
+ Examples
9382
+ --------
9383
+ Let's create a validation plan with a few validation steps and generate a JSON report of the
9384
+ results:
9385
+
9386
+ ```{python}
9387
+ import pointblank as pb
9388
+ import polars as pl
9389
+
9390
+ # Create a sample DataFrame
9391
+ tbl = pl.DataFrame({
9392
+ "a": [5, 7, 8, 9],
9393
+ "b": [3, 4, 2, 1]
9394
+ })
8379
9395
 
9396
+ # Create and execute a validation plan
9397
+ validation = (
9398
+ pb.Validate(data=tbl)
9399
+ .col_vals_gt(columns="a", value=6)
9400
+ .col_vals_lt(columns="b", value=4)
9401
+ .interrogate()
9402
+ )
9403
+
9404
+ # Get the full JSON report
9405
+ json_report = validation.get_json_report()
9406
+
9407
+ print(json_report)
9408
+ ```
9409
+
9410
+ You can also customize which fields to include:
9411
+
9412
+ ```{python}
9413
+ json_report = validation.get_json_report(
9414
+ use_fields=["i", "assertion_type", "column", "n_passed", "n_failed"]
9415
+ )
9416
+
9417
+ print(json_report)
9418
+ ```
9419
+
9420
+ Or which fields to exclude:
9421
+
9422
+ ```{python}
9423
+ json_report = validation.get_json_report(
9424
+ exclude_fields=[
9425
+ "i_o", "thresholds", "pre", "segments", "values",
9426
+ "na_pass", "inclusive", "label", "brief", "active",
9427
+ "time_processed", "proc_duration_s"
9428
+ ]
9429
+ )
9430
+
9431
+ print(json_report)
9432
+ ```
9433
+
9434
+ The JSON output can be further processed or analyzed programmatically:
9435
+
9436
+ ```{python}
9437
+ import json
9438
+
9439
+ # Parse the JSON report
9440
+ report_data = json.loads(validation.get_json_report())
9441
+
9442
+ # Extract and analyze validation results
9443
+ failing_steps = [step for step in report_data if step["n_failed"] > 0]
9444
+ print(f"Number of failing validation steps: {len(failing_steps)}")
9445
+ ```
9446
+
9447
+ See Also
9448
+ --------
9449
+ - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`): Get a formatted HTML
9450
+ report as a GT table
9451
+ - [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`): Get rows that
9452
+ failed validation
9453
+ """
8380
9454
  if use_fields is not None and exclude_fields is not None:
8381
9455
  raise ValueError("Cannot specify both `use_fields=` and `exclude_fields=`.")
8382
9456
 
@@ -8840,6 +9914,13 @@ class Validate:
8840
9914
  # will be made blank if the validation has not been performed
8841
9915
  interrogation_performed = validation_info_dict.get("proc_duration_s", [None])[0] is not None
8842
9916
 
9917
+ # Determine which steps are those using segmented data
9918
+ segmented_steps = [
9919
+ i + 1
9920
+ for i, segment in enumerate(validation_info_dict["segments"])
9921
+ if segment is not None
9922
+ ]
9923
+
8843
9924
  # ------------------------------------------------
8844
9925
  # Process the `type_upd` entry
8845
9926
  # ------------------------------------------------
@@ -8849,6 +9930,7 @@ class Validate:
8849
9930
  assertion_str=validation_info_dict["assertion_type"],
8850
9931
  brief_str=validation_info_dict["brief"],
8851
9932
  autobrief_str=validation_info_dict["autobrief"],
9933
+ segmentation_str=validation_info_dict["segments"],
8852
9934
  lang=lang,
8853
9935
  )
8854
9936
 
@@ -8877,7 +9959,7 @@ class Validate:
8877
9959
  "col_vals_expr",
8878
9960
  ]:
8879
9961
  columns_upd.append("—")
8880
- elif assertion_type[i] in ["rows_distinct"]:
9962
+ elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
8881
9963
  if not column:
8882
9964
  # If there is no column subset, then all columns are used
8883
9965
  columns_upd.append("ALL COLUMNS")
@@ -8940,6 +10022,7 @@ class Validate:
8940
10022
  "col_vals_not_null",
8941
10023
  "col_exists",
8942
10024
  "rows_distinct",
10025
+ "rows_complete",
8943
10026
  ]:
8944
10027
  values_upd.append("—")
8945
10028
 
@@ -8980,11 +10063,14 @@ class Validate:
8980
10063
  # Add the `tbl` entry
8981
10064
  # ------------------------------------------------
8982
10065
 
8983
- # Depending on if there was some preprocessing done, get the appropriate icon
8984
- # for the table processing status to be displayed in the report under the `tbl` column
10066
+ # Depending on if there was some preprocessing done, get the appropriate icon for
10067
+ # the table processing status to be displayed in the report under the `tbl` column
10068
+ # TODO: add the icon for the segmented data option when the step is segmented
8985
10069
 
8986
10070
  validation_info_dict["tbl"] = _transform_tbl_preprocessed(
8987
- pre=validation_info_dict["pre"], interrogation_performed=interrogation_performed
10071
+ pre=validation_info_dict["pre"],
10072
+ seg=validation_info_dict["segments"],
10073
+ interrogation_performed=interrogation_performed,
8988
10074
  )
8989
10075
 
8990
10076
  # ------------------------------------------------
@@ -9019,8 +10105,9 @@ class Validate:
9019
10105
  # Process `pass` and `fail` entries
9020
10106
  # ------------------------------------------------
9021
10107
 
9022
- # Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries (the length
9023
- # of the `pass` entry should be equal to the length of the `n_passed` and `n_failed` entries)
10108
+ # Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
10109
+ # (the length of the `pass` entry should be equal to the length of the
10110
+ # `n_passed` and `n_failed` entries)
9024
10111
 
9025
10112
  validation_info_dict["pass"] = _transform_passed_failed(
9026
10113
  n_passed_failed=validation_info_dict["n_passed"],
@@ -9173,6 +10260,9 @@ class Validate:
9173
10260
  # Remove the `pre` entry from the dictionary
9174
10261
  validation_info_dict.pop("pre")
9175
10262
 
10263
+ # Remove the `segments` entry from the dictionary
10264
+ validation_info_dict.pop("segments")
10265
+
9176
10266
  # Remove the `proc_duration_s` entry from the dictionary
9177
10267
  validation_info_dict.pop("proc_duration_s")
9178
10268
 
@@ -9255,6 +10345,10 @@ class Validate:
9255
10345
  columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"]
9256
10346
  ),
9257
10347
  )
10348
+ .tab_style(
10349
+ style=style.css("overflow-x: visible; white-space: nowrap;"),
10350
+ locations=loc.body(columns="type_upd", rows=segmented_steps),
10351
+ )
9258
10352
  .tab_style(
9259
10353
  style=style.fill(color="#FCFCFC" if interrogation_performed else "white"),
9260
10354
  locations=loc.body(columns=["w_upd", "e_upd", "c_upd"]),
@@ -9429,8 +10523,8 @@ class Validate:
9429
10523
  table object, which can be displayed in a notebook or exported to an HTML file.
9430
10524
 
9431
10525
  :::{.callout-warning}
9432
- The `get_step_report()` is still experimental. Please report any issues you encounter in the
9433
- [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
10526
+ The `get_step_report()` method is still experimental. Please report any issues you encounter
10527
+ in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
9434
10528
  :::
9435
10529
 
9436
10530
  Parameters
@@ -9463,6 +10557,36 @@ class Validate:
9463
10557
  GT
9464
10558
  A GT table object that represents the detailed report for the validation step.
9465
10559
 
10560
+ Types of Step Reports
10561
+ ---------------------
10562
+ The `get_step_report()` method produces a report based on the *type* of validation step.
10563
+ The following row-based validation methods will produce a report that shows the rows of the
10564
+ data that failed because of failing test units within one or more columns failed:
10565
+
10566
+ - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
10567
+ - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
10568
+ - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
10569
+ - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
10570
+ - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
10571
+ - [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
10572
+ - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
10573
+ - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
10574
+ - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
10575
+ - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
10576
+ - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
10577
+ - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
10578
+ - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
10579
+ - [`rows_complete()`](`pointblank.Validate.rows_complete`)
10580
+ - [`conjointly()`](`pointblank.Validate.conjointly`)
10581
+
10582
+ The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
10583
+ report that shows duplicate rows (or duplicate values in one or a set of columns as defined
10584
+ in that method's `columns_subset=` parameter.
10585
+
10586
+ The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
10587
+ produce a report that shows the schema of the data table and the schema of the validation
10588
+ step. The report will indicate whether the schemas match or not.
10589
+
9466
10590
  Examples
9467
10591
  --------
9468
10592
  ```{python}
@@ -9488,7 +10612,7 @@ class Validate:
9488
10612
  .col_vals_lt(columns="d", value=3500)
9489
10613
  .col_vals_between(columns="c", left=1, right=8)
9490
10614
  .col_vals_gt(columns="a", value=3)
9491
- .col_vals_regex(columns="b", pattern=r"\d-[a-z]{3}-\d{3}")
10615
+ .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
9492
10616
  .interrogate()
9493
10617
  )
9494
10618
 
@@ -9612,7 +10736,7 @@ class Validate:
9612
10736
  # if get_row_count(extract) == 0:
9613
10737
  # return "No rows were extracted."
9614
10738
 
9615
- if assertion_type in ROW_BASED_VALIDATION_TYPES:
10739
+ if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete"]:
9616
10740
  # Get the extracted data for the step
9617
10741
  extract = self.get_data_extracts(i=i, frame=True)
9618
10742
 
@@ -9776,6 +10900,95 @@ class Validate:
9776
10900
 
9777
10901
  return self
9778
10902
 
10903
+ def _evaluate_segments(self, validation_info):
10904
+ """
10905
+ Evaluate any segmentation expressions stored in the `segments` attribute and expand each
10906
+ validation step with such directives into multiple. This is done by evaluating the
10907
+ segmentation expression and creating a new validation step for each segment. Errors in
10908
+ evaluation (such as no segments matched) will be caught and recorded in the `eval_error`
10909
+ attribute.
10910
+
10911
+ Parameters
10912
+ ----------
10913
+ validation_info
10914
+ Information about the validation to evaluate and expand.
10915
+ """
10916
+
10917
+ # Create a list to store the expanded validation steps
10918
+ expanded_validation_info = []
10919
+
10920
+ # Iterate over the validation steps
10921
+ for i, validation in enumerate(validation_info):
10922
+ # Get the segments expression
10923
+ segments_expr = validation.segments
10924
+
10925
+ # If the value is None, then skip the evaluation and append the validation step to the
10926
+ # list of expanded validation steps
10927
+ if segments_expr is None:
10928
+ expanded_validation_info.append(validation)
10929
+ continue
10930
+
10931
+ # Evaluate the segments expression
10932
+ try:
10933
+ # Get the table for this step, it can either be:
10934
+ # 1. the target table itself
10935
+ # 2. the target table modified by a `pre` attribute
10936
+
10937
+ if validation.pre is None:
10938
+ table = self.data
10939
+ else:
10940
+ table = validation.pre(self.data)
10941
+
10942
+ # If the `segments` expression is a string, that string is taken as a column name
10943
+ # for which segmentation should occur across unique values in the column
10944
+ if isinstance(segments_expr, str):
10945
+ seg_tuples = _seg_expr_from_string(data_tbl=table, segments_expr=segments_expr)
10946
+
10947
+ # If the 'segments' expression is a tuple, then normalize it to a list of tuples
10948
+ # - ("col", "value") -> [("col", "value")]
10949
+ # - ("col", ["value1", "value2"]) -> [("col", "value1"), ("col", "value2")]
10950
+ elif isinstance(segments_expr, tuple):
10951
+ seg_tuples = _seg_expr_from_tuple(segments_expr=segments_expr)
10952
+
10953
+ # If the 'segments' expression is a list of strings or tuples (can be mixed) then
10954
+ # normalize it to a list of tuples following the rules above
10955
+ elif isinstance(segments_expr, list):
10956
+ seg_tuples = []
10957
+ for seg in segments_expr:
10958
+ if isinstance(seg, str):
10959
+ # Use the utility function for string items
10960
+ str_seg_tuples = _seg_expr_from_string(
10961
+ data_tbl=table, segments_expr=seg
10962
+ )
10963
+ seg_tuples.extend(str_seg_tuples)
10964
+ elif isinstance(seg, tuple):
10965
+ # Use the utility function for tuple items
10966
+ tuple_seg_tuples = _seg_expr_from_tuple(segments_expr=seg)
10967
+ seg_tuples.extend(tuple_seg_tuples)
10968
+ else: # pragma: no cover
10969
+ # Handle invalid segment type
10970
+ raise ValueError(
10971
+ f"Invalid segment expression item type: {type(seg)}. "
10972
+ "Must be either string or tuple."
10973
+ )
10974
+
10975
+ except Exception: # pragma: no cover
10976
+ validation.eval_error = True
10977
+
10978
+ # For each segmentation resolved, create a new validation step and add it to the list of
10979
+ # expanded validation steps
10980
+ for seg in seg_tuples:
10981
+ new_validation = copy.deepcopy(validation)
10982
+
10983
+ new_validation.segments = seg
10984
+
10985
+ expanded_validation_info.append(new_validation)
10986
+
10987
+ # Replace the `validation_info` attribute with the expanded version
10988
+ self.validation_info = expanded_validation_info
10989
+
10990
+ return self
10991
+
9779
10992
  def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]:
9780
10993
  """
9781
10994
  Utility function to get a dictionary of validation attributes for each validation step.
@@ -10233,6 +11446,13 @@ def _create_autobrief_or_failure_text(
10233
11446
  for_failure=for_failure,
10234
11447
  )
10235
11448
 
11449
+ if assertion_type == "rows_complete":
11450
+ return _create_text_rows_complete(
11451
+ lang=lang,
11452
+ columns_subset=column,
11453
+ for_failure=for_failure,
11454
+ )
11455
+
10236
11456
  if assertion_type == "row_count_match":
10237
11457
  return _create_text_row_count_match(
10238
11458
  lang=lang,
@@ -10408,6 +11628,24 @@ def _create_text_rows_distinct(
10408
11628
  return text
10409
11629
 
10410
11630
 
11631
+ def _create_text_rows_complete(
11632
+ lang: str, columns_subset: list[str] | None, for_failure: bool = False
11633
+ ) -> str:
11634
+ type_ = _expect_failure_type(for_failure=for_failure)
11635
+
11636
+ if columns_subset is None:
11637
+ text = EXPECT_FAIL_TEXT[f"all_row_complete_{type_}_text"][lang]
11638
+
11639
+ else:
11640
+ column_text = _prep_values_text(values=columns_subset, lang=lang, limit=3)
11641
+
11642
+ text = EXPECT_FAIL_TEXT[f"across_row_complete_{type_}_text"][lang].format(
11643
+ column_text=column_text
11644
+ )
11645
+
11646
+ return text
11647
+
11648
+
10411
11649
  def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str:
10412
11650
  type_ = _expect_failure_type(for_failure=for_failure)
10413
11651
 
@@ -10493,6 +11731,143 @@ def _prep_values_text(
10493
11731
  return values_str
10494
11732
 
10495
11733
 
11734
+ def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> tuple[str, str]:
11735
+ """
11736
+ Obtain the segmentation categories from a table column.
11737
+
11738
+ The `segments_expr` value will have been checked to be a string, so there's no need to check for
11739
+ that here. The function will return a list of tuples representing pairings of a column name and
11740
+ a value. The task is to obtain the unique values in the column (handling different table types)
11741
+ and produce a normalized list of tuples of the form: `(column, value)`.
11742
+
11743
+ This function is used to create a list of segments for the validation step. And since there will
11744
+ usually be more than one segment, the validation step will be expanded into multiple during
11745
+ interrogation (where this function is called).
11746
+
11747
+ Parameters
11748
+ ----------
11749
+ data_tbl
11750
+ The table from which to obtain the segmentation categories.
11751
+ segments_expr
11752
+ The column name for which segmentation should occur across unique values in the column.
11753
+
11754
+ Returns
11755
+ -------
11756
+ list[tuple[str, str]]
11757
+ A list of tuples representing pairings of a column name and a value in the column.
11758
+ """
11759
+ # Determine if the table is a DataFrame or a DB table
11760
+ tbl_type = _get_tbl_type(data=data_tbl)
11761
+
11762
+ # Obtain the segmentation categories from the table column given as `segments_expr`
11763
+ if tbl_type == "polars":
11764
+ seg_categories = data_tbl[segments_expr].unique().to_list()
11765
+ elif tbl_type == "pandas":
11766
+ seg_categories = data_tbl[segments_expr].unique().tolist()
11767
+ elif tbl_type in IBIS_BACKENDS:
11768
+ distinct_col_vals = data_tbl.select(segments_expr).distinct()
11769
+ seg_categories = distinct_col_vals[segments_expr].to_list()
11770
+ else: # pragma: no cover
11771
+ raise ValueError(f"Unsupported table type: {tbl_type}")
11772
+
11773
+ # Ensure that the categories are sorted
11774
+ seg_categories.sort()
11775
+
11776
+ # Place each category and each value in a list of tuples as: `(column, value)`
11777
+ seg_tuples = [(segments_expr, category) for category in seg_categories]
11778
+
11779
+ return seg_tuples
11780
+
11781
+
11782
+ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
11783
+ """
11784
+ Normalize the segments expression to a list of tuples, given a single tuple.
11785
+
11786
+ The `segments_expr` value will have been checked to be a tuple, so there's no need to check for
11787
+ that here. The function will return a list of tuples representing pairings of a column name and
11788
+ a value. The task is to normalize the tuple into a list of tuples of the form:
11789
+ `(column, value)`.
11790
+
11791
+ The following examples show how this normalzation works:
11792
+ - `("col", "value")` -> `[("col", "value")]` (single tuple, upgraded to a list of tuples)
11793
+ - `("col", ["value1", "value2"])` -> `[("col", "value1"), ("col", "value2")]` (tuple with a list
11794
+ of values, expanded into multiple tuples within a list)
11795
+
11796
+ This function is used to create a list of segments for the validation step. And since there will
11797
+ usually be more than one segment, the validation step will be expanded into multiple during
11798
+ interrogation (where this function is called).
11799
+
11800
+ Parameters
11801
+ ----------
11802
+ segments_expr
11803
+ The segments expression to normalize. It can be a tuple of the form
11804
+ `(column, value)` or `(column, [value1, value2])`.
11805
+
11806
+ Returns
11807
+ -------
11808
+ list[tuple[str, str]]
11809
+ A list of tuples representing pairings of a column name and a value in the column.
11810
+ """
11811
+ # Check if the first element is a string
11812
+ if isinstance(segments_expr[0], str):
11813
+ # If the second element is a list, create a list of tuples
11814
+ if isinstance(segments_expr[1], list):
11815
+ seg_tuples = [(segments_expr[0], value) for value in segments_expr[1]]
11816
+ # If the second element is not a list, create a single tuple
11817
+ else:
11818
+ seg_tuples = [(segments_expr[0], segments_expr[1])]
11819
+ # If the first element is not a string, raise an error
11820
+ else: # pragma: no cover
11821
+ raise ValueError("The first element of the segments expression must be a string.")
11822
+
11823
+ return seg_tuples
11824
+
11825
+
11826
+ def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any:
11827
+ """
11828
+ Apply the segments expression to the data table.
11829
+
11830
+ Filter the data table based on the `segments_expr=` value, where the first element is the
11831
+ column name and the second element is the value to filter by.
11832
+
11833
+ Parameters
11834
+ ----------
11835
+ data_tbl
11836
+ The data table to filter. It can be a Pandas DataFrame, Polars DataFrame, or an Ibis
11837
+ backend table.
11838
+ segments_expr
11839
+ The segments expression to apply. It is a tuple of the form `(column, value)`.
11840
+
11841
+ Returns
11842
+ -------
11843
+ any
11844
+ The filtered data table. It will be of the same type as the input table.
11845
+ """
11846
+ # Get the table type
11847
+ tbl_type = _get_tbl_type(data=data_tbl)
11848
+
11849
+ if tbl_type in ["pandas", "polars"]:
11850
+ # If the table is a Pandas or Polars DataFrame, transforming to a Narwhals table
11851
+ # and perform the filtering operation
11852
+
11853
+ # Transform to Narwhals table if a DataFrame
11854
+ data_tbl_nw = nw.from_native(data_tbl)
11855
+
11856
+ # Filter the data table based on the column name and value
11857
+ data_tbl_nw = data_tbl_nw.filter(nw.col(segments_expr[0]) == segments_expr[1])
11858
+
11859
+ # Transform back to the original table type
11860
+ data_tbl = data_tbl_nw.to_native()
11861
+
11862
+ elif tbl_type in IBIS_BACKENDS:
11863
+ # If the table is an Ibis backend table, perform the filtering operation directly
11864
+
11865
+ # Filter the data table based on the column name and value
11866
+ data_tbl = data_tbl[data_tbl[segments_expr[0]] == segments_expr[1]]
11867
+
11868
+ return data_tbl
11869
+
11870
+
10496
11871
  def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
10497
11872
  """
10498
11873
  Convert a `_ValidationInfo` object to a dictionary.
@@ -10517,6 +11892,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
10517
11892
  "inclusive",
10518
11893
  "na_pass",
10519
11894
  "pre",
11895
+ "segments",
10520
11896
  "label",
10521
11897
  "brief",
10522
11898
  "autobrief",
@@ -10631,7 +12007,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
10631
12007
  return title_text
10632
12008
 
10633
12009
 
10634
- def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list[str]:
12010
+ def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
10635
12011
  # If no interrogation was performed, return a list of empty strings
10636
12012
  if not interrogation_performed:
10637
12013
  return ["" for _ in range(len(pre))]
@@ -10640,11 +12016,13 @@ def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list
10640
12016
  # (either 'unchanged' (None) or 'modified' (not None))
10641
12017
  status_list = []
10642
12018
 
10643
- for status in pre:
10644
- if status is None:
10645
- status_list.append("unchanged")
10646
- else:
12019
+ for i in range(len(pre)):
12020
+ if seg[i] is not None:
12021
+ status_list.append("segmented")
12022
+ elif pre[i] is not None:
10647
12023
  status_list.append("modified")
12024
+ else:
12025
+ status_list.append("unchanged")
10648
12026
 
10649
12027
  return _get_preprocessed_table_icon(icon=status_list)
10650
12028
 
@@ -10752,7 +12130,11 @@ def _transform_w_e_c(values, color, interrogation_performed):
10752
12130
 
10753
12131
 
10754
12132
  def _transform_assertion_str(
10755
- assertion_str: list[str], brief_str: list[str | None], autobrief_str: list[str], lang: str
12133
+ assertion_str: list[str],
12134
+ brief_str: list[str | None],
12135
+ autobrief_str: list[str],
12136
+ segmentation_str: list[tuple | None],
12137
+ lang: str,
10756
12138
  ) -> list[str]:
10757
12139
  # Get the SVG icons for the assertion types
10758
12140
  svg_icon = _get_assertion_icon(icon=assertion_str)
@@ -10813,6 +12195,26 @@ def _transform_assertion_str(
10813
12195
  for assertion, svg, size, brief_div in zip(assertion_str, svg_icon, text_size, brief_divs)
10814
12196
  ]
10815
12197
 
12198
+ # If the `segments` list is not empty, prepend a segmentation div to the `type_upd` strings
12199
+ if segmentation_str:
12200
+ for i in range(len(type_upd)):
12201
+ if segmentation_str[i] is not None:
12202
+ # Get the column name and value from the segmentation expression
12203
+ column_name = segmentation_str[i][0]
12204
+ column_value = segmentation_str[i][1]
12205
+ # Create the segmentation div
12206
+ segmentation_div = (
12207
+ "<div style='margin-top: 0px; margin-bottom: 0px; "
12208
+ "white-space: pre; font-size: 8px; color: darkblue; padding-bottom: 4px; "
12209
+ "'>"
12210
+ "<strong><span style='font-family: Helvetica, arial, sans-serif;'>"
12211
+ f"SEGMENT&nbsp;&nbsp;</span></strong><span>{column_name} / {column_value}"
12212
+ "</span>"
12213
+ "</div>"
12214
+ )
12215
+ # Prepend the segmentation div to the type_upd string
12216
+ type_upd[i] = f"{segmentation_div} {type_upd[i]}"
12217
+
10816
12218
  return type_upd
10817
12219
 
10818
12220
 
@@ -11044,6 +12446,11 @@ def _step_report_row_based(
11044
12446
  text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
11045
12447
  elif assertion_type == "col_vals_not_null":
11046
12448
  text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
12449
+ elif assertion_type == "rows_complete":
12450
+ if column is None:
12451
+ text = STEP_REPORT_TEXT["rows_complete_all"][lang]
12452
+ else:
12453
+ text = STEP_REPORT_TEXT["rows_complete_subset"][lang]
11047
12454
 
11048
12455
  # Wrap assertion text in a <code> tag
11049
12456
  text = (