PyPI - pointblank - Versions diffs - 0.8.6__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

pointblank 0.8.6py3-none-any.whl → 0.9.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

pointblank/_constants.py +11 -10
pointblank/_interrogation.py +10 -4
pointblank/_typing.py +19 -3
pointblank/data/api-docs.txt +716 -49
pointblank/datascan.py +4 -4
pointblank/draft.py +1 -1
pointblank/thresholds.py +10 -0
pointblank/validate.py +1071 -50
{pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/METADATA +19 -4
{pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/RECORD +13 -13
{pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/WHEEL +1 -1
{pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/licenses/LICENSE +0 -0
{pointblank-0.8.6.dist-info → pointblank-0.9.0.dist-info}/top_level.txt +0 -0

pointblank/validate.py CHANGED Viewed

@@ -7,6 +7,7 @@ import datetime
 import inspect
 import json
 import re
+import tempfile
 import threading
 from dataclasses import dataclass
 from importlib.metadata import version
@@ -57,6 +58,7 @@ from pointblank._interrogation import (
     RowCountMatch,
     RowsDistinct,
 )
+from pointblank._typing import SegmentSpec
 from pointblank._utils import (
     _check_any_df_lib,
     _check_invalid_fields,
@@ -87,6 +89,8 @@ from pointblank.thresholds import (
 )
 if TYPE_CHECKING:
+    from collections.abc import Collection
     from pointblank._typing import AbsoluteBounds, Tolerance
 __all__ = [
@@ -117,16 +121,18 @@ def _action_context_manager(metadata):
             delattr(_action_context, "metadata")
-def get_action_metadata():
+def get_action_metadata() -> dict | None:
     """Access step-level metadata when authoring custom actions.
     Get the metadata for the validation step where an action was triggered. This can be called by
-    user functions to get the metadata for the current action.
+    user functions to get the metadata for the current action. This function can only be used within
+    callables crafted for the [`Actions`](`pointblank.Actions`) class.
     Returns
     -------
-    dict
-        A dictionary containing the metadata for the current step.
+    dict | None
+        A dictionary containing the metadata for the current step. If called outside of an action
+        (i.e., when no action is being executed), this function will return `None`.
     Description of the Metadata Fields
     ----------------------------------
@@ -161,7 +167,7 @@ def get_action_metadata():
             thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
             actions=pb.Actions(warning=log_issue),
         )
-        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
+        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
         .col_vals_gt(columns="item_revenue", value=0.05)
         .col_vals_gt(
             columns="session_duration",
@@ -179,6 +185,11 @@ def get_action_metadata():
     - the `metadata` is a dictionary that is used to craft the log message
     - the action is passed as a bare function to the `Actions` object within the `Validate` object
     (placing it within `Validate(actions=)` ensures it's set as an action for every validation step)
+    See Also
+    --------
+    Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom
+    actions for validation steps that exceed a set threshold value.
     """
     if hasattr(_action_context, "metadata"):  # pragma: no cover
         return _action_context.metadata  # pragma: no cover
@@ -202,17 +213,19 @@ def _final_action_context_manager(summary):
             delattr(_final_action_context, "summary")
-def get_validation_summary():
+def get_validation_summary() -> dict | None:
     """Access validation summary information when authoring final actions.
     This function provides a convenient way to access summary information about the validation
     process within a final action. It returns a dictionary with key metrics from the validation
-    process.
+    process. This function can only be used within callables crafted for the
+    [`FinalActions`](`pointblank.FinalActions`) class.
     Returns
     -------
     dict | None
-        A dictionary containing validation metrics, or None if called outside a final action.
+        A dictionary containing validation metrics. If called outside of a final action context,
+        this function will return `None`.
     Description of the Summary Fields
     --------------------------------
@@ -302,6 +315,11 @@ def get_validation_summary():
     Final actions work well with both simple logging and more complex notification systems, allowing
     you to integrate validation results into your broader data quality workflows.
+    See Also
+    --------
+    Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create
+    custom actions that are executed after all validation steps have been completed.
     """
     if hasattr(_final_action_context, "summary"):
         return _final_action_context.summary
@@ -514,10 +532,10 @@ def load_dataset(
         data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
         # Unzip the DuckDB dataset to a temporary directory
-        with ZipFile(data_path, "r") as z:
-            z.extractall(path="datasets")
+        with tempfile.TemporaryDirectory() as tmp, ZipFile(data_path, "r") as z:
+            z.extractall(path=tmp)
-            data_path = f"datasets/{dataset}.ddb"
+            data_path = f"{tmp}/{dataset}.ddb"
             dataset = ibis.connect(f"duckdb://{data_path}").table(dataset)
@@ -1781,14 +1799,15 @@ class _ValidationInfo:
     assertion_type
         The type of assertion. This is the method name of the validation (e.g., `"col_vals_gt"`).
     column
-        The column to validate. Currently we don't allow for column expressions (which may map to
-        multiple columns).
+        The column(s) to validate.
     values
         The value or values to compare against.
     na_pass
         Whether to pass test units that hold missing values.
     pre
         A preprocessing function or lambda to apply to the data table for the validation step.
+    segments
+        The segments to use for the validation step.
     thresholds
         The threshold values for the validation.
     actions
@@ -1839,11 +1858,12 @@ class _ValidationInfo:
     step_id: str | None = None
     sha1: str | None = None
     assertion_type: str | None = None
-    column: str | None = None
+    column: any | None = None
     values: any | list[any] | tuple | None = None
     inclusive: tuple[bool, bool] | None = None
     na_pass: bool | None = None
     pre: Callable | None = None
+    segments: any | None = None
     thresholds: Thresholds | None = None
     actions: Actions | None = None
     label: str | None = None
@@ -1907,7 +1927,7 @@ class Validate:
         The table to validate, which could be a DataFrame object or an Ibis table object. Read the
         *Supported Input Table Types* section for details on the supported table types.
     tbl_name
-        A optional name to assign to the input table object. If no value is provided, a name will
+        An optional name to assign to the input table object. If no value is provided, a name will
         be generated based on whatever information is available. This table name will be displayed
         in the header area of the tabular report.
     label
@@ -2321,6 +2341,7 @@ class Validate:
         value: float | int | Column,
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -2352,10 +2373,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -2418,6 +2444,42 @@ class Validate:
         lifetime of the transformed table, it only exists during the validation step and is not
         stored in the `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2516,6 +2578,8 @@ class Validate:
         _check_column(column=columns)
         # _check_value_float_int(value=value)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -2548,6 +2612,7 @@ class Validate:
                 values=value,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -2564,6 +2629,7 @@ class Validate:
         value: float | int | Column,
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -2595,10 +2661,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -2661,6 +2732,42 @@ class Validate:
         lifetime of the transformed table, it only exists during the validation step and is not
         stored in the `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2758,6 +2865,8 @@ class Validate:
         _check_column(column=columns)
         # _check_value_float_int(value=value)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -2790,6 +2899,7 @@ class Validate:
                 values=value,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -2806,6 +2916,7 @@ class Validate:
         value: float | int | Column,
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -2837,10 +2948,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -2903,6 +3019,42 @@ class Validate:
         lifetime of the transformed table, it only exists during the validation step and is not
         stored in the `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -2999,6 +3151,8 @@ class Validate:
         _check_column(column=columns)
         # _check_value_float_int(value=value)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -3031,6 +3185,7 @@ class Validate:
                 values=value,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -3047,6 +3202,7 @@ class Validate:
         value: float | int | Column,
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -3078,10 +3234,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -3144,6 +3305,42 @@ class Validate:
         lifetime of the transformed table, it only exists during the validation step and is not
         stored in the `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3238,6 +3435,8 @@ class Validate:
         _check_column(column=columns)
         # _check_value_float_int(value=value)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -3270,6 +3469,7 @@ class Validate:
                 values=value,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -3286,6 +3486,7 @@ class Validate:
         value: float | int | Column,
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -3317,10 +3518,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -3383,6 +3589,42 @@ class Validate:
         lifetime of the transformed table, it only exists during the validation step and is not
         stored in the `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3481,6 +3723,8 @@ class Validate:
         _check_column(column=columns)
         # _check_value_float_int(value=value)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -3513,6 +3757,7 @@ class Validate:
                 values=value,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -3529,6 +3774,7 @@ class Validate:
         value: float | int | Column,
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -3560,10 +3806,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -3626,6 +3877,42 @@ class Validate:
         lifetime of the transformed table, it only exists during the validation step and is not
         stored in the `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3724,6 +4011,8 @@ class Validate:
         _check_column(column=columns)
         # _check_value_float_int(value=value)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -3756,6 +4045,7 @@ class Validate:
                 values=value,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -3774,6 +4064,7 @@ class Validate:
         inclusive: tuple[bool, bool] = (True, True),
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -3815,10 +4106,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -3883,6 +4179,42 @@ class Validate:
         lifetime of the transformed table, it only exists during the validation step and is not
         stored in the `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -3990,6 +4322,8 @@ class Validate:
         # _check_value_float_int(value=left)
         # _check_value_float_int(value=right)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -4027,6 +4361,7 @@ class Validate:
                 inclusive=inclusive,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -4045,6 +4380,7 @@ class Validate:
         inclusive: tuple[bool, bool] = (True, True),
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -4086,10 +4422,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -4154,6 +4495,42 @@ class Validate:
         lifetime of the transformed table, it only exists during the validation step and is not
         stored in the `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4261,6 +4638,8 @@ class Validate:
         # _check_value_float_int(value=left)
         # _check_value_float_int(value=right)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -4298,6 +4677,7 @@ class Validate:
                 inclusive=inclusive,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -4311,8 +4691,9 @@ class Validate:
     def col_vals_in_set(
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
-        set: list[float | int],
+        set: Collection[Any],
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -4336,10 +4717,15 @@ class Validate:
         set
             A list of values to compare against.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -4381,6 +4767,42 @@ class Validate:
         only exists during the validation step and is not stored in the `Validate` object or used in
         subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4471,8 +4893,16 @@ class Validate:
         assertion_type = _get_fn_name()
         _check_column(column=columns)
-        _check_set_types(set=set)
+        for val in set:
+            if val is None:
+                continue
+            if not isinstance(val, (float, int, str)):
+                raise ValueError("`set=` must be a list of floats, integers, or strings.")
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")
@@ -4500,6 +4930,7 @@ class Validate:
                 column=column,
                 values=set,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -4515,6 +4946,7 @@ class Validate:
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
         set: list[float | int],
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -4538,10 +4970,15 @@ class Validate:
         set
             A list of values to compare against.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -4583,6 +5020,42 @@ class Validate:
         only exists during the validation step and is not stored in the `Validate` object or used in
         subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4676,6 +5149,8 @@ class Validate:
         _check_column(column=columns)
         _check_set_types(set=set)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")
@@ -4703,6 +5178,7 @@ class Validate:
                 column=column,
                 values=set,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -4717,6 +5193,7 @@ class Validate:
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -4737,10 +5214,15 @@ class Validate:
             multiple columns are supplied or resolved, there will be a separate validation step
             generated for each column.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -4782,6 +5264,42 @@ class Validate:
         only exists during the validation step and is not stored in the `Validate` object or used in
         subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4871,6 +5389,8 @@ class Validate:
         _check_column(column=columns)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")
@@ -4897,6 +5417,7 @@ class Validate:
                 assertion_type=assertion_type,
                 column=column,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -4911,6 +5432,7 @@ class Validate:
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -4931,10 +5453,15 @@ class Validate:
             multiple columns are supplied or resolved, there will be a separate validation step
             generated for each column.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple steps (one per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedances of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -4976,6 +5503,42 @@ class Validate:
         only exists during the validation step and is not stored in the `Validate` object or used in
         subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5065,6 +5628,8 @@ class Validate:
         _check_column(column=columns)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")
@@ -5091,6 +5656,7 @@ class Validate:
                 assertion_type=assertion_type,
                 column=column,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -5107,6 +5673,7 @@ class Validate:
         pattern: str,
         na_pass: bool = False,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -5133,10 +5700,15 @@ class Validate:
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedences of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -5178,6 +5750,42 @@ class Validate:
         only exists during the validation step and is not stored in the `Validate` object or used in
         subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5269,6 +5877,8 @@ class Validate:
         _check_column(column=columns)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
         _check_boolean_input(param=active, param_name="active")
@@ -5298,6 +5908,7 @@ class Validate:
                 values=pattern,
                 na_pass=na_pass,
                 pre=pre,
+                segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
@@ -5312,6 +5923,7 @@ class Validate:
         self,
         expr: any,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -5333,10 +5945,15 @@ class Validate:
             be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
             should either be a lambda expression or a Narwhals column expression.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedences of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -5376,6 +5993,42 @@ class Validate:
         transformed table, it only exists during the validation step and is not stored in the
         `Validate` object or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5453,6 +6106,8 @@ class Validate:
         # TODO: Add a check for the expression to ensure it's a valid expression object
         # _check_expr(expr=expr)
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")
@@ -5469,6 +6124,7 @@ class Validate:
             column=None,
             values=expr,
             pre=pre,
+            segments=segments,
             thresholds=thresholds,
             actions=actions,
             brief=brief,
@@ -5657,6 +6313,7 @@ class Validate:
         self,
         columns_subset: str | list[str] | None = None,
         pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
@@ -5677,10 +6334,15 @@ class Validate:
             columns are supplied, the distinct comparison will be made over the combination of
             values in those columns.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedences of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -5722,6 +6384,42 @@ class Validate:
         table, it only exists during the validation step and is not stored in the `Validate` object
         or used in subsequent validation steps.
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific values within a
+        column.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
+        in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+        columns
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
         Thresholds
         ----------
         The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -5815,6 +6513,8 @@ class Validate:
         assertion_type = _get_fn_name()
         _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")
@@ -5835,6 +6535,7 @@ class Validate:
             assertion_type=assertion_type,
             column=columns_subset,
             pre=pre,
+            segments=segments,
             thresholds=thresholds,
             actions=actions,
             brief=brief,
@@ -5895,7 +6596,7 @@ class Validate:
             substring matches are allowed, so a schema data type of `Int` would match a target table
             data type of `Int64`.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
@@ -6108,7 +6809,7 @@ class Validate:
             Should the validation step be inverted? If `True`, then the expectation is that the row
             count of the target table should not match the specified `count=` value.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
@@ -6318,7 +7019,7 @@ class Validate:
             Should the validation step be inverted? If `True`, then the expectation is that the
             column count of the target table should not match the specified `count=` value.
         pre
-            A optional preprocessing function or lambda to apply to the data table during
+            An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
             Have a look at the *Preprocessing* section for more information on how to use this
             argument.
@@ -6836,10 +7537,14 @@ class Validate:
         self.time_start = datetime.datetime.now(datetime.timezone.utc)
-        # Expand `validation_info` by evaluating any column expressions in `column`
+        # Expand `validation_info` by evaluating any column expressions in `columns=`
         # (the `_evaluate_column_exprs()` method will eval and expand as needed)
         self._evaluate_column_exprs(validation_info=self.validation_info)
+        # Expand `validation_info` by evaluating for any segmentation directives
+        # provided in `segments=` (the `_evaluate_segments()` method will eval and expand as needed)
+        self._evaluate_segments(validation_info=self.validation_info)
         for validation in self.validation_info:
             # Set the `i` value for the validation step (this is 1-indexed)
             index_value = self.validation_info.index(validation) + 1
@@ -6875,6 +7580,10 @@ class Validate:
             validation.autobrief = autobrief
+            # ------------------------------------------------
+            # Bypassing the validation step if conditions met
+            # ------------------------------------------------
             # Skip the validation step if it is not active but still record the time of processing
             if not validation.active:
                 end_time = datetime.datetime.now(datetime.timezone.utc)
@@ -6931,6 +7640,17 @@ class Validate:
                 elif isinstance(validation.pre, Callable):
                     data_tbl_step = validation.pre(data_tbl_step)
+            # ------------------------------------------------
+            # Segmentation stage
+            # ------------------------------------------------
+            # Determine whether any segmentation directives are to be applied to the table
+            if validation.segments is not None:
+                data_tbl_step = _apply_segments(
+                    data_tbl=data_tbl_step, segments_expr=validation.segments
+                )
             validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
                 tbl_type=tbl_type
             )
@@ -8832,6 +9552,13 @@ class Validate:
         # will be made blank if the validation has not been performed
         interrogation_performed = validation_info_dict.get("proc_duration_s", [None])[0] is not None
+        # Determine which steps are those using segmented data
+        segmented_steps = [
+            i + 1
+            for i, segment in enumerate(validation_info_dict["segments"])
+            if segment is not None
+        ]
         # ------------------------------------------------
         # Process the `type_upd` entry
         # ------------------------------------------------
@@ -8841,6 +9568,7 @@ class Validate:
             assertion_str=validation_info_dict["assertion_type"],
             brief_str=validation_info_dict["brief"],
             autobrief_str=validation_info_dict["autobrief"],
+            segmentation_str=validation_info_dict["segments"],
             lang=lang,
         )
@@ -8972,11 +9700,14 @@ class Validate:
         # Add the `tbl` entry
         # ------------------------------------------------
-        # Depending on if there was some preprocessing done, get the appropriate icon
-        # for the table processing status to be displayed in the report under the `tbl` column
+        # Depending on if there was some preprocessing done, get the appropriate icon for
+        # the table processing status to be displayed in the report under the `tbl` column
+        # TODO: add the icon for the segmented data option when the step is segmented
         validation_info_dict["tbl"] = _transform_tbl_preprocessed(
-            pre=validation_info_dict["pre"], interrogation_performed=interrogation_performed
+            pre=validation_info_dict["pre"],
+            seg=validation_info_dict["segments"],
+            interrogation_performed=interrogation_performed,
         )
         # ------------------------------------------------
@@ -9011,8 +9742,9 @@ class Validate:
         # Process `pass` and `fail` entries
         # ------------------------------------------------
-        # Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries (the length
-        # of the `pass` entry should be equal to the length of the `n_passed` and `n_failed` entries)
+        # Create a `pass` entry that concatenates the `n_passed` and `n_failed` entries
+        # (the length of the `pass` entry should be equal to the length of the
+        # `n_passed` and `n_failed` entries)
         validation_info_dict["pass"] = _transform_passed_failed(
             n_passed_failed=validation_info_dict["n_passed"],
@@ -9165,6 +9897,9 @@ class Validate:
         # Remove the `pre` entry from the dictionary
         validation_info_dict.pop("pre")
+        # Remove the `segments` entry from the dictionary
+        validation_info_dict.pop("segments")
         # Remove the `proc_duration_s` entry from the dictionary
         validation_info_dict.pop("proc_duration_s")
@@ -9247,6 +9982,10 @@ class Validate:
                     columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"]
                 ),
             )
+            .tab_style(
+                style=style.css("overflow-x: visible; white-space: nowrap;"),
+                locations=loc.body(columns="type_upd", rows=segmented_steps),
+            )
             .tab_style(
                 style=style.fill(color="#FCFCFC" if interrogation_performed else "white"),
                 locations=loc.body(columns=["w_upd", "e_upd", "c_upd"]),
@@ -9421,8 +10160,8 @@ class Validate:
         table object, which can be displayed in a notebook or exported to an HTML file.
         :::{.callout-warning}
-        The `get_step_report()` is still experimental. Please report any issues you encounter in the
-        [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
+        The `get_step_report()` method is still experimental. Please report any issues you encounter
+        in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
         :::
         Parameters
@@ -9455,6 +10194,35 @@ class Validate:
         GT
             A GT table object that represents the detailed report for the validation step.
+        Types of Step Reports
+        ---------------------
+        The `get_step_report()` method produces a report based on the *type* of validation step.
+        The following row-based validation methods will produce a report that shows the rows of the
+        data that failed because of failing test units within one or more columns:
+        - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
+        - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
+        - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
+        - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
+        - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
+        - [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
+        - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
+        - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
+        - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
+        - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
+        - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+        - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
+        - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
+        - [`conjointly()`](`pointblank.Validate.conjointly`)
+        The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
+        report that shows duplicate rows (or duplicate values in one or a set of columns as defined
+        in that method's `columns_subset=` parameter).
+        The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will
+        produce a report that shows the schema of the data table and the schema of the validation
+        step. The report will indicate whether the schemas match or not.
         Examples
         --------
         ```{python}
@@ -9480,7 +10248,7 @@ class Validate:
             .col_vals_lt(columns="d", value=3500)
             .col_vals_between(columns="c", left=1, right=8)
             .col_vals_gt(columns="a", value=3)
-            .col_vals_regex(columns="b", pattern=r"\d-[a-z]{3}-\d{3}")
+            .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
             .interrogate()
         )
@@ -9768,6 +10536,95 @@ class Validate:
         return self
+    def _evaluate_segments(self, validation_info):
+        """
+        Evaluate any segmentation expressions stored in the `segments` attribute and expand each
+        validation step with such directives into multiple. This is done by evaluating the
+        segmentation expression and creating a new validation step for each segment. Errors in
+        evaluation (such as no segments matched) will be caught and recorded in the `eval_error`
+        attribute.
+        Parameters
+        ----------
+        validation_info
+            Information about the validation to evaluate and expand.
+        """
+        # Create a list to store the expanded validation steps
+        expanded_validation_info = []
+        # Iterate over the validation steps
+        for i, validation in enumerate(validation_info):
+            # Get the segments expression
+            segments_expr = validation.segments
+            # If the value is None, then skip the evaluation and append the validation step to the
+            # list of expanded validation steps
+            if segments_expr is None:
+                expanded_validation_info.append(validation)
+                continue
+            # Evaluate the segments expression
+            try:
+                # Get the table for this step, it can either be:
+                # 1. the target table itself
+                # 2. the target table modified by a `pre` attribute
+                if validation.pre is None:
+                    table = self.data
+                else:
+                    table = validation.pre(self.data)
+                # If the `segments` expression is a string, that string is taken as a column name
+                # for which segmentation should occur across unique values in the column
+                if isinstance(segments_expr, str):
+                    seg_tuples = _seg_expr_from_string(data_tbl=table, segments_expr=segments_expr)
+                # If the 'segments' expression is a tuple, then normalize it to a list of tuples
+                # - ("col", "value") -> [("col", "value")]
+                # - ("col", ["value1", "value2"]) -> [("col", "value1"), ("col", "value2")]
+                elif isinstance(segments_expr, tuple):
+                    seg_tuples = _seg_expr_from_tuple(segments_expr=segments_expr)
+                # If the 'segments' expression is a list of strings or tuples (can be mixed) then
+                # normalize it to a list of tuples following the rules above
+                elif isinstance(segments_expr, list):
+                    seg_tuples = []
+                    for seg in segments_expr:
+                        if isinstance(seg, str):
+                            # Use the utility function for string items
+                            str_seg_tuples = _seg_expr_from_string(
+                                data_tbl=table, segments_expr=seg
+                            )
+                            seg_tuples.extend(str_seg_tuples)
+                        elif isinstance(seg, tuple):
+                            # Use the utility function for tuple items
+                            tuple_seg_tuples = _seg_expr_from_tuple(segments_expr=seg)
+                            seg_tuples.extend(tuple_seg_tuples)
+                        else:  # pragma: no cover
+                            # Handle invalid segment type
+                            raise ValueError(
+                                f"Invalid segment expression item type: {type(seg)}. "
+                                "Must be either string or tuple."
+                            )
+            except Exception:  # pragma: no cover
+                validation.eval_error = True
+            # For each segmentation resolved, create a new validation step and add it to the list of
+            # expanded validation steps
+            for seg in seg_tuples:
+                new_validation = copy.deepcopy(validation)
+                new_validation.segments = seg
+                expanded_validation_info.append(new_validation)
+        # Replace the `validation_info` attribute with the expanded version
+        self.validation_info = expanded_validation_info
+        return self
     def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]:
         """
         Utility function to get a dictionary of validation attributes for each validation step.
@@ -10485,6 +11342,143 @@ def _prep_values_text(
     return values_str
+def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> list[tuple[str, str]]:
+    """
+    Obtain the segmentation categories from a table column.
+    The `segments_expr` value will have been checked to be a string, so there's no need to check for
+    that here. The unique values of that column are collected (handling the different table types),
+    sorted, and returned as a normalized list of tuples of the form: `(column, value)`.
+    A null value (`None`) in the column is kept as a category and is always sorted last, since
+    `None` cannot be ordered against other values.
+    This function is used to create a list of segments for the validation step. And since there will
+    usually be more than one segment, the validation step will be expanded into multiple during
+    interrogation (where this function is called).
+    Parameters
+    ----------
+    data_tbl
+        The table from which to obtain the segmentation categories.
+    segments_expr
+        The column name for which segmentation should occur across unique values in the column.
+    Returns
+    -------
+    list[tuple[str, str]]
+        A list of tuples representing pairings of a column name and a value in the column.
+    Raises
+    ------
+    ValueError
+        If the table type is not Polars, Pandas, or one of the Ibis backends.
+    """
+    # Determine if the table is a DataFrame or a DB table
+    tbl_type = _get_tbl_type(data=data_tbl)
+    # Obtain the distinct values of the table column given as `segments_expr`
+    if tbl_type == "polars":
+        seg_categories = data_tbl[segments_expr].unique().to_list()
+    elif tbl_type == "pandas":
+        seg_categories = data_tbl[segments_expr].unique().tolist()
+    elif tbl_type in IBIS_BACKENDS:
+        distinct_col_vals = data_tbl.select(segments_expr).distinct()
+        seg_categories = distinct_col_vals[segments_expr].to_list()
+    else:  # pragma: no cover
+        raise ValueError(f"Unsupported table type: {tbl_type}")
+    # Sort the categories; the key places `None` last and keeps it from ever being compared with
+    # another value (a plain sort raises `TypeError` when a null sits among e.g. strings)
+    seg_categories.sort(key=lambda category: (category is None, category))
+    # Pair the column name with each category as: `(column, value)`
+    return [(segments_expr, category) for category in seg_categories]
+def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]:
+    """
+    Normalize the segments expression to a list of tuples, given a single tuple.
+    The `segments_expr` value will have been checked to be a tuple, so there's no need to check for
+    that here. The function will return a list of tuples representing pairings of a column name and
+    a value. The task is to normalize the tuple into a list of tuples of the form:
+    `(column, value)`.
+    The following examples show how this normalization works:
+    - `("col", "value")` -> `[("col", "value")]` (single tuple, upgraded to a list of tuples)
+    - `("col", ["value1", "value2"])` -> `[("col", "value1"), ("col", "value2")]` (tuple with a list
+      of values, expanded into multiple tuples within a list)
+    This function is used to create a list of segments for the validation step. And since there will
+    usually be more than one segment, the validation step will be expanded into multiple during
+    interrogation (where this function is called).
+    Parameters
+    ----------
+    segments_expr
+        The segments expression to normalize. It can be a tuple of the form
+        `(column, value)` or `(column, [value1, value2])`.
+    Returns
+    -------
+    list[tuple[str, str]]
+        A list of tuples representing pairings of a column name and a value in the column.
+    Raises
+    ------
+    ValueError
+        If the tuple has fewer than two elements, or if its first element is not a string.
+    """
+    # A segment needs both a column name and a value; report a too-short tuple explicitly rather
+    # than letting the indexing below fail with an `IndexError`
+    if len(segments_expr) < 2:
+        raise ValueError(
+            "The segments expression must be a tuple of the form `(column, value)` or "
+            "`(column, [value1, value2])`."
+        )
+    column, values = segments_expr[0], segments_expr[1]
+    # If the first element is not a string, raise an error
+    if not isinstance(column, str):  # pragma: no cover
+        raise ValueError("The first element of the segments expression must be a string.")
+    # If the second element is a list, create one tuple per value in that list
+    if isinstance(values, list):
+        return [(column, value) for value in values]
+    # If the second element is not a list, create a single tuple
+    return [(column, values)]
+def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any:
+    """
+    Apply the segments expression to the data table.
+    Filter the data table based on the `segments_expr=` value, where the first element is the
+    column name and the second element is the value to filter by. A value of `None` selects the
+    rows where the column is null, since an equality comparison with `None` generally does not
+    match null rows.
+    Parameters
+    ----------
+    data_tbl
+        The data table to filter. It can be a Pandas DataFrame, Polars DataFrame, or an Ibis
+        backend table.
+    segments_expr
+        The segments expression to apply. It is a tuple of the form `(column, value)`.
+    Returns
+    -------
+    any
+        The filtered data table. It will be of the same type as the input table. A table whose type
+        is neither Pandas/Polars nor an Ibis backend is returned unfiltered.
+    """
+    column, value = segments_expr[0], segments_expr[1]
+    # Get the table type
+    tbl_type = _get_tbl_type(data=data_tbl)
+    if tbl_type in ["pandas", "polars"]:
+        # Transform the DataFrame to a Narwhals table and perform the filtering operation there
+        data_tbl_nw = nw.from_native(data_tbl)
+        # Keep only the rows of the segment; a null segment needs a null check, not `==`
+        if value is None:
+            data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_null())
+        else:
+            data_tbl_nw = data_tbl_nw.filter(nw.col(column) == value)
+        # Transform back to the original table type
+        data_tbl = data_tbl_nw.to_native()
+    elif tbl_type in IBIS_BACKENDS:
+        # If the table is an Ibis backend table, perform the filtering operation directly
+        if value is None:
+            data_tbl = data_tbl[data_tbl[column].isnull()]
+        else:
+            data_tbl = data_tbl[data_tbl[column] == value]
+    return data_tbl
 def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
     """
     Convert a `_ValidationInfo` object to a dictionary.
@@ -10509,6 +11503,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
         "inclusive",
         "na_pass",
         "pre",
+        "segments",
         "label",
         "brief",
         "autobrief",
@@ -10623,7 +11618,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
     return title_text
-def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list[str]:
+def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
     # If no interrogation was performed, return a list of empty strings
     if not interrogation_performed:
         return ["" for _ in range(len(pre))]
@@ -10632,11 +11627,13 @@ def _transform_tbl_preprocessed(pre: str, interrogation_performed: bool) -> list
     # (either 'unchanged' (None) or 'modified' (not None))
     status_list = []
-    for status in pre:
-        if status is None:
-            status_list.append("unchanged")
-        else:
+    for i in range(len(pre)):
+        if seg[i] is not None:
+            status_list.append("segmented")
+        elif pre[i] is not None:
             status_list.append("modified")
+        else:
+            status_list.append("unchanged")
     return _get_preprocessed_table_icon(icon=status_list)
@@ -10744,7 +11741,11 @@ def _transform_w_e_c(values, color, interrogation_performed):
 def _transform_assertion_str(
-    assertion_str: list[str], brief_str: list[str | None], autobrief_str: list[str], lang: str
+    assertion_str: list[str],
+    brief_str: list[str | None],
+    autobrief_str: list[str],
+    segmentation_str: list[tuple | None],
+    lang: str,
 ) -> list[str]:
     # Get the SVG icons for the assertion types
     svg_icon = _get_assertion_icon(icon=assertion_str)
@@ -10805,6 +11806,26 @@ def _transform_assertion_str(
         for assertion, svg, size, brief_div in zip(assertion_str, svg_icon, text_size, brief_divs)
     ]
+    # If the `segments` list is not empty, prepend a segmentation div to the `type_upd` strings
+    if segmentation_str:
+        for i in range(len(type_upd)):
+            if segmentation_str[i] is not None:
+                # Get the column name and value from the segmentation expression
+                column_name = segmentation_str[i][0]
+                column_value = segmentation_str[i][1]
+                # Create the segmentation div
+                segmentation_div = (
+                    "<div style='margin-top: 0px; margin-bottom: 0px; "
+                    "white-space: pre; font-size: 8px; color: darkblue; padding-bottom: 4px; "
+                    "'>"
+                    "<strong><span style='font-family: Helvetica, arial, sans-serif;'>"
+                    f"SEGMENT&nbsp;&nbsp;</span></strong><span>{column_name} / {column_value}"
+                    "</span>"
+                    "</div>"
+                )
+                # Prepend the segmentation div to the type_upd string
+                type_upd[i] = f"{segmentation_div} {type_upd[i]}"
     return type_upd

pointblank 0.8.6__py3-none-any.whl → 0.9.0__py3-none-any.whl

pointblank 0.8.6py3-none-any.whl → 0.9.0py3-none-any.whl