pointblank 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +6 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +128 -0
- pointblank/_utils_html.py +40 -0
- pointblank/actions.py +3 -3
- pointblank/assistant.py +1 -3
- pointblank/column.py +4 -4
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +769 -138
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/thresholds.py +2 -2
- pointblank/validate.py +1594 -207
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/METADATA +6 -3
- pointblank-0.10.0.dist-info/RECORD +37 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/WHEEL +1 -1
- pointblank-0.9.5.dist-info/RECORD +0 -33
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
@@ -42,8 +42,14 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 Parameters
 ----------
 data
-The table to validate, which could be a DataFrame object
-
+The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
+file path, a Parquet file path, or a database connection string. When providing a CSV or
+Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
+loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
+glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
+Connection strings enable direct database access via Ibis with optional table specification
+using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
+on the supported table types.
 tbl_name
 An optional name to assign to the input table object. If no value is provided, a name will
 be generated based on whatever information is available. This table name will be displayed
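The hunk above documents the expanded `data=` argument. As a minimal sketch of what those new input forms look like in practice (the file paths and table name here are hypothetical placeholders, not files shipped with the package):

```python
import pointblank as pb

# Each form below is a valid `data=` input in 0.10.0; paths are illustrative only
v_csv = pb.Validate(data="data/orders.csv")                      # CSV file path
v_parquet = pb.Validate(data="data/orders_*.parquet")            # Parquet glob pattern
v_db = pb.Validate(data="duckdb:///data/warehouse.ddb::orders")  # connection string with ::table_name
```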
@@ -107,13 +113,40 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
+- CSV files (string path or `pathlib.Path` object with `.csv` extension)
+- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+  extension, or partitioned dataset)
+- Database connection strings (URI format with optional table specification)

 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
 `ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
 the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas
 DataFrame, the Ibis library is not required.

+To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+provided. The file will be automatically detected and loaded using the best available DataFrame
+library. The loading preference is Polars first, then Pandas as a fallback.
+
+Connection strings follow database URL formats and must also specify a table using the
+`::table_name` suffix. Examples include:
+
+```
+"duckdb:///path/to/database.ddb::table_name"
+"sqlite:///path/to/database.db::table_name"
+"postgresql://user:password@localhost:5432/database::table_name"
+"mysql://user:password@localhost:3306/database::table_name"
+"bigquery://project/dataset::table_name"
+"snowflake://user:password@account/database/schema::table_name"
+```
+
+When using connection strings, the Ibis library with the appropriate backend driver is required.
+
 Thresholds
 ----------
 The `thresholds=` parameter is used to set the failure-condition levels for all validation
@@ -270,8 +303,8 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 ```python
 import pointblank as pb

-# Load the small_table dataset
-small_table = pb.load_dataset()
+# Load the `small_table` dataset
+small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")

 # Preview the table
 pb.preview(small_table)
@@ -337,7 +370,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 brief). Here's an example of a global setting for briefs:

 ```python
-
+validation_2 = (
 pb.Validate(
 data=pb.load_dataset(),
 tbl_name="small_table",
@@ -354,7 +387,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 .interrogate()
 )

-
+validation_2
 ```

 We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
@@ -372,7 +405,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 the data extracts for each validation step.

 ```python
-
+validation_2.get_data_extracts()
 ```

 We can also view step reports for each validation step using the
@@ -380,7 +413,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 type of validation step and shows the relevant information for a step's validation.

 ```python
-
+validation_2.get_step_report(i=2)
 ```

 The `Validate` class also has a method for getting the sundered data, which is the data that
@@ -388,11 +421,141 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.

 ```python
-pb.preview(
+pb.preview(validation_2.get_sundered_data())
 ```

 The sundered data is a DataFrame that contains the rows that passed or failed the validation.
 The default behavior is to return the rows that failed the validation, as shown above.
+
+### Working with CSV Files
+
+The `Validate` class can directly accept CSV file paths, making it easy to validate data stored
+in CSV files without manual loading:
+
+```python
+# Get a path to a CSV file from the package data
+csv_path = pb.get_data_path("global_sales", "csv")
+
+validation_3 = (
+    pb.Validate(
+        data=csv_path,
+        label="CSV validation example"
+    )
+    .col_exists(["customer_id", "product_id", "revenue"])
+    .col_vals_not_null(["customer_id", "product_id"])
+    .col_vals_gt(columns="revenue", value=0)
+    .interrogate()
+)
+
+validation_3
+```
+
+You can also use a Path object to specify the CSV file. Here's an example of how to do that:
+
+```python
+from pathlib import Path
+
+csv_file = Path(pb.get_data_path("game_revenue", "csv"))
+
+validation_4 = (
+    pb.Validate(data=csv_file, label="Game Revenue Validation")
+    .col_exists(["player_id", "session_id", "item_name"])
+    .col_vals_regex(
+        columns="session_id",
+        pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}"
+    )
+    .col_vals_gt(columns="item_revenue", value=0, na_pass=True)
+    .interrogate()
+)
+
+validation_4
+```
+
+The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided,
+Pointblank will automatically load the file using the best available DataFrame library (Polars
+preferred, Pandas as fallback). The loaded data can then be used with all validation methods
+just like any other supported table type.
+
+### Working with Parquet Files
+
+The `Validate` class can directly accept Parquet files and datasets in various formats. The
+following examples illustrate how to validate Parquet files:
+
+```python
+# Single Parquet file from package data
+parquet_path = pb.get_data_path("nycflights", "parquet")
+
+validation_5 = (
+    pb.Validate(
+        data=parquet_path,
+        tbl_name="NYC Flights Data"
+    )
+    .col_vals_not_null(["carrier", "origin", "dest"])
+    .col_vals_gt(columns="distance", value=0)
+    .interrogate()
+)
+
+validation_5
+```
+
+You can also use glob patterns and directories. Here are some examples for how to:
+
+1. load multiple Parquet files
+2. load a Parquet-containing directory
+3. load a partitioned Parquet dataset
+
+```python
+# Multiple Parquet files with glob patterns
+validation_6 = pb.Validate(data="data/sales_*.parquet")
+
+# Directory containing Parquet files
+validation_7 = pb.Validate(data="parquet_data/")
+
+# Partitioned Parquet dataset
+validation_8 = (
+    pb.Validate(data="sales_data/")  # Contains year=2023/quarter=Q1/region=US/sales.parquet
+    .col_exists(["transaction_id", "amount", "year", "quarter", "region"])
+    .interrogate()
+)
+```
+
+When you point to a directory that contains a partitioned Parquet dataset (with subdirectories
+like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically:
+
+- discover all Parquet files recursively
+- extract partition column values from directory paths
+- add partition columns to the final DataFrame
+- combine all partitions into a single table for validation
+
+Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
+either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
+
+### Working with Database Connection Strings
+
+The `Validate` class supports database connection strings for direct validation of database
+tables. Connection strings must specify a table using the `::table_name` suffix:
+
+```python
+# Get path to a DuckDB database file from package data
+duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+validation_9 = (
+    pb.Validate(
+        data=f"duckdb:///{duckdb_path}::game_revenue",
+        label="DuckDB Game Revenue Validation"
+    )
+    .col_exists(["player_id", "session_id", "item_revenue"])
+    .col_vals_gt(columns="item_revenue", value=0)
+    .interrogate()
+)
+
+validation_9
+```
+
+For comprehensive documentation on supported connection string formats, error handling, and
+installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
+function. This function handles all the connection logic and provides helpful error messages
+when table specifications are missing or backend dependencies are not installed.


 Thresholds(warning: 'int | float | bool | None' = None, error: 'int | float | bool | None' = None, critical: 'int | float | bool | None' = None) -> None
@@ -580,7 +743,7 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: '
 thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
 actions=pb.Actions(critical="Major data quality issue found in step {step}."),
 )
-.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
 .col_vals_gt(columns="item_revenue", value=0.05)
 .col_vals_gt(columns="session_duration", value=15)
 .interrogate()
@@ -610,7 +773,7 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: '
 data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"),
 thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
 )
-.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
 .col_vals_gt(columns="item_revenue", value=0.05)
 .col_vals_gt(
 columns="session_duration",
@@ -1282,12 +1445,16 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
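To make the new `segments=` examples above concrete, here is a minimal sketch of a segmented validation step; the table, column names, and values are invented purely for illustration:

```python
import polars as pl
import pointblank as pb

# Hypothetical table with `region` and `date` columns to segment on
tbl = pl.DataFrame({
    "region": ["US", "US", "EU", "EU"],
    "date": ["2023-01-01", "2023-01-02", "2023-01-01", "2023-01-02"],
    "revenue": [12.5, 8.0, 15.2, 3.1],
})

validation = (
    pb.Validate(data=tbl)
    # One validation step is generated per segment during interrogation
    .col_vals_gt(
        columns="revenue",
        value=5,
        segments=["region", ("date", ["2023-01-01", "2023-01-02"])],
    )
    .interrogate()
)
```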
@@ -1503,12 +1670,16 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -1724,12 +1895,16 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -1945,12 +2120,16 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2166,12 +2345,16 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2385,12 +2568,16 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2614,12 +2801,16 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2855,12 +3046,16 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3055,12 +3250,16 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3241,12 +3440,16 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3342,9 +3545,9 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |

 col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'

-Validate whether values in a column are
+Validate whether values in a column are Null.

-The `col_vals_null()` validation method checks whether column values in a table are
+The `col_vals_null()` validation method checks whether column values in a table are Null.
 This validation will operate over the number of test units that is equal to the number
 of rows in the table.

@@ -3425,12 +3628,16 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3524,10 +3731,10 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column

 col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'

-Validate whether values in a column are not
+Validate whether values in a column are not Null.

 The `col_vals_not_null()` validation method checks whether column values in a table are not
-
+Null. This validation will operate over the number of test units that is equal to the number
 of rows in the table.

 Parameters
@@ -3607,12 +3814,16 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3795,12 +4006,16 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3977,12 +4192,16 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4267,12 +4486,16 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4453,12 +4676,16 @@ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6231,7 +6458,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
 `[rev_01, rev_02, profit_01, profit_02, age]`

 and you want to validate columns that have two digits at the end of the name, you can use
-`columns=matches(r"
+`columns=matches(r"[0-9]{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
 `profit_02` columns.

 There will be a validation step created for every resolved column. Note that if there aren't any
@@ -6285,7 +6512,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
 [`col()`](`pointblank.col`) function, like this:

 ```python
-col(matches(r"
+col(matches(r"^[0-9]{5}") & ends_with("_id"))
 ```

 There are four operators that can be used to compose column selectors:
@@ -6324,7 +6551,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'

 validation = (
 pb.Validate(data=tbl)
-.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID
+.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID[0-9]{4}")
 .interrogate()
 )

@@ -6332,7 +6559,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
 ```

 From the results of the validation table we get two validation steps, one for `id_old` and one
-for `new_identifier`. The values in both columns all match the pattern `"ID
+for `new_identifier`. The values in both columns all match the pattern `"ID[0-9]{4}"`.

 We can also use the `matches()` function in combination with other column selectors (within
 [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select
@@ -6875,7 +7102,7 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =

 After interrogation is complete, the `Validate` object will have gathered information, and
 we can use methods like [`n_passed()`](`pointblank.Validate.n_passed`),
-[`f_failed()`](`pointblank.Validate.f_failed`)
+[`f_failed()`](`pointblank.Validate.f_failed`), etc., to understand how the table performed
 against the validation plan. A visual representation of the validation results can be viewed
 by printing the `Validate` object; this will display the validation table in an HTML viewing
 environment.
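To ground the interrogation workflow referenced in the hunk above, here is a minimal usage sketch; the dataset and column name are illustrative assumptions:

```python
import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars"))
    .col_vals_not_null(columns="date")  # column name assumed for illustration
    .interrogate()
)

# Summaries gathered during interrogation
validation.n_passed()   # passing test units per step
validation.f_failed()   # fraction of failing test units per step
```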
@@ -7085,24 +7312,25 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
 Types of Step Reports
 ---------------------
 The `get_step_report()` method produces a report based on the *type* of validation step.
-The following row-based validation methods will produce a
-
+The following column-value or row-based validation step validation methods will produce a
+report that shows the rows of the data that failed:

 - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
+- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
 - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
+- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
 - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
 - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
-- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
-- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
 - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
 - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
 - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
 - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
-- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
-- [`
+- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
 - [`conjointly()`](`pointblank.Validate.conjointly`)
+- [`rows_complete()`](`pointblank.Validate.rows_complete`)

 The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
 report that shows duplicate rows (or duplicate values in one or a set of columns as defined
@@ -7320,10 +7548,10 @@ get_sundered_data(self, type='pass') -> 'FrameT'
 Get the data that passed or failed the validation steps.

 Validation of the data is one thing but, sometimes, you want to use the best part of the
-input dataset for something else. The `get_sundered_data()` method works with a Validate
+input dataset for something else. The `get_sundered_data()` method works with a `Validate`
 object that has been interrogated (i.e., the
 [`interrogate()`](`pointblank.Validate.interrogate`) method was used). We can get either the
-'pass' data piece (rows with no failing test units across all
+'pass' data piece (rows with no failing test units across all column-value based validation
 functions), or, the 'fail' data piece (rows with at least one failing test unit across the
 same series of validations).

@@ -7332,7 +7560,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
 There are some caveats to sundering. The validation steps considered for this splitting will
 only involve steps where:

-- of certain check types, where test units are cells checked
+- of certain check types, where test units are cells checked down a column (e.g., the
 `col_vals_*()` methods)
 - `active=` is not set to `False`
 - `pre=` has not been given an expression for modifying the input table
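A small sketch of the sundering behaviour described above, assuming `validation` is an interrogated `Validate` object:

```python
# Rows with no failing test units across the column-value validation steps
passed_rows = validation.get_sundered_data(type="pass")

# Rows with at least one failing test unit across those same steps
failed_rows = validation.get_sundered_data(type="fail")
```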
@@ -7401,11 +7629,13 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 Get the rows that failed for each validation step.

 After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the
-`get_data_extracts()` method can be used to extract the rows that failed in each
-validation step (e.g.,
-
-
-
+`get_data_extracts()` method can be used to extract the rows that failed in each
+column-value or row-based validation step (e.g.,
+[`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
+[`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a
+dictionary of tables containing the rows that failed in every validation step. If
+`frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing
+the dictionary structure).

 Parameters
 ----------
@@ -7418,13 +7648,13 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 Returns
 -------
 dict[int, FrameT | None] | FrameT | None
-A dictionary of tables containing the rows that failed in every
-step
+A dictionary of tables containing the rows that failed in every compatible validation
+step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.

-Validation Methods
-
-The following validation methods
-failing test units.
+Compatible Validation Methods for Yielding Extracted Rows
+---------------------------------------------------------
+The following validation methods operate on column values and will have rows extracted when
+there are failing test units.

 - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
 - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
@@ -7439,11 +7669,20 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
 - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
+- [`conjointly()`](`pointblank.Validate.conjointly`)
+
+An extracted row for these validation methods means that a test unit failed for that row in
+the validation step.
+
+These row-based validation methods will also have rows extracted should there be failing
+rows:
+
 - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
+- [`rows_complete()`](`pointblank.Validate.rows_complete`)

-
-
-understanding the nature of the failing test units.
+The extracted rows are a subset of the original table and are useful for further analysis
+or for understanding the nature of the failing test units.

 Examples
 --------
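A brief usage sketch of the extract behaviour documented above, assuming `validation` is an interrogated `Validate` object with at least two steps:

```python
# All extracts as a dictionary keyed by step number
extracts = validation.get_data_extracts()

# A single step's failing rows returned directly as a table (no dictionary)
failed_step_2 = validation.get_data_extracts(i=2, frame=True)
```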
@@ -7578,6 +7817,10 @@ assert_passing(self) -> 'None'
 assertion made is printed in the `AssertionError` message if a failure occurs, ensuring
 some details are preserved.

+If the validation has not yet been interrogated, this method will automatically call
+[`interrogate()`](`pointblank.Validate.interrogate`) with default parameters before checking
+for passing tests.
+
 Raises
 -------
 AssertionError
@@ -7587,8 +7830,9 @@ assert_passing(self) -> 'None'
 --------
 In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
 `c`). There will be three validation steps, and the second step will have a failing test
-unit (the value `10` isn't less than `9`).
-
+unit (the value `10` isn't less than `9`). The `assert_passing()` method is used to assert
+that all validation steps passed perfectly, automatically performing the interrogation if
+needed.

 ```python
 #| error: True
@@ -7609,13 +7853,221 @@ assert_passing(self) -> 'None'
 .col_vals_gt(columns="a", value=0)
 .col_vals_lt(columns="b", value=9) # this assertion is false
 .col_vals_in_set(columns="c", set=["a", "b"])
-.interrogate()
 )

+# No need to call [`interrogate()`](`pointblank.Validate.interrogate`) explicitly
 validation.assert_passing()
 ```


+assert_below_threshold(self, level: 'str' = 'warning', i: 'int | None' = None, message: 'str | None' = None) -> 'None'
+
+Raise an `AssertionError` if validation steps exceed a specified threshold level.
+
+The `assert_below_threshold()` method checks whether validation steps' failure rates are
+below a given threshold level (`"warning"`, `"error"`, or `"critical"`). This is
+particularly useful in automated testing environments where you want to ensure your data
+quality meets minimum standards before proceeding.
+
+If any validation step exceeds the specified threshold level, an `AssertionError` will be
+raised with details about which steps failed. If the validation has not yet been
+interrogated, this method will automatically call
+[`interrogate()`](`pointblank.Validate.interrogate`) with default parameters.
+
+Parameters
+----------
+level
+The threshold level to check against, which could be any of `"warning"` (the default),
+`"error"`, or `"critical"`. An `AssertionError` will be raised if any validation step
+exceeds this level.
+i
+Specific validation step number(s) to check. Can be provided as a single integer or a
+list of integers. If `None` (the default), all steps are checked.
+message
+Custom error message to use if assertion fails. If `None`, a default message will be
+generated that lists the specific steps that exceeded the threshold.
+
+Returns
+-------
+None
+
+Raises
+------
+AssertionError
+If any specified validation step exceeds the given threshold level.
+ValueError
+If an invalid threshold level is provided.
+
+Examples
+--------
+Below are some examples of how to use the `assert_below_threshold()` method. First, we'll
+create a simple Polars DataFrame with two columns (`a` and `b`).
+
+```python
+import polars as pl
+
+tbl = pl.DataFrame({
+    "a": [7, 4, 9, 7, 12],
+    "b": [9, 8, 10, 5, 10]
+})
+```
+
+Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
+`critical=0.3`). After interrogating, we display the validation report table:
+
+```python
+import pointblank as pb
+
+validation = (
+    pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
+    .col_vals_gt(columns="a", value=5)   # 1 failing test unit
+    .col_vals_lt(columns="b", value=10)  # 2 failing test units
+    .interrogate()
+)
+
+validation
+```
+
+Using `assert_below_threshold(level="warning")` will raise an `AssertionError` if any step
+exceeds the 'warning' threshold:
+
+Check a specific step against the 'critical' threshold using the `i=` parameter:
+
+```python
+validation.assert_below_threshold(level="critical", i=1)  # Won't raise an error
+```
+
+As the first step is below the 'critical' threshold (it exceeds the 'warning' and 'error'
+thresholds), no error is raised and nothing is printed.
+
+We can also provide a custom error message with the `message=` parameter. Let's try that
+here:
+
+```python
+try:
+    validation.assert_below_threshold(
+        level="error",
+        message="Data quality too low for processing!"
+    )
+except AssertionError as e:
+    print(f"Custom error: {e}")
+```
+
+See Also
+--------
+- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
+step
+- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
+- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
+validation step
+- [`assert_passing()`](`pointblank.Validate.assert_passing`): assert all validations pass
+completely
+
+
+above_threshold(self, level: 'str' = 'warning', i: 'int | None' = None) -> 'bool'
+
+Check if any validation steps exceed a specified threshold level.
+
+The `above_threshold()` method checks whether validation steps exceed a given threshold
+level. This provides a non-exception-based alternative to
+[`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`) for conditional
+workflow control based on validation results.
+
+This method is useful in scenarios where you want to check if any validation steps failed
+beyond a certain threshold without raising an exception, allowing for more flexible
+programmatic responses to validation issues.
+
+Parameters
+----------
+level
+The threshold level to check against. Valid options are: `"warning"` (the least severe
+threshold level), `"error"` (the middle severity threshold level), and `"critical"` (the
+most severe threshold level). The default is `"warning"`.
+i
+Specific validation step number(s) to check. If a single integer, checks only that step.
+If a list of integers, checks all specified steps. If `None` (the default), checks all
+validation steps. Step numbers are 1-based (first step is `1`, not `0`).
+
+Returns
+-------
+bool
+`True` if any of the specified validation steps exceed the given threshold level,
+`False` otherwise.
+
+Raises
+------
+ValueError
+If an invalid threshold level is provided.
+
+Examples
+--------
+Below are some examples of how to use the `above_threshold()` method. First, we'll create a
+simple Polars DataFrame with a single column (`values`).
+
+Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
+`critical=0.3`). After interrogating, we display the validation report table:
+
+```python
+import pointblank as pb
+
+validation = (
+    pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
+    .col_vals_gt(columns="values", value=0)
+    .col_vals_lt(columns="values", value=10)
+    .col_vals_between(columns="values", left=0, right=5)
+    .interrogate()
+)
+
+validation
+```
+
+Let's check if any steps exceed the 'warning' threshold with the `above_threshold()` method.
+A message will be printed if that's the case:
+
+```python
+if validation.above_threshold(level="warning"):
+    print("Some steps have exceeded the warning threshold")
+```
+
+Check if only steps 2 and 3 exceed the 'error' threshold through use of the `i=` argument:
+
+```python
+if validation.above_threshold(level="error", i=[2, 3]):
+    print("Steps 2 and/or 3 have exceeded the error threshold")
+```
+
+You can use this in a workflow to conditionally trigger processes. Here's a snippet of how
+you might use this in a function:
+
+```python
+def process_data(validation_obj):
+    # Only continue processing if validation passes critical thresholds
+    if not validation_obj.above_threshold(level="critical"):
+        # Continue with processing
+        print("Data meets critical quality thresholds, proceeding...")
+        return True
+    else:
+        # Log failure and stop processing
+        print("Data fails critical quality checks, aborting...")
+        return False
+```
+
+Note that this is just a suggestion for how to implement conditional workflow processes. You
+should adapt this pattern to your specific requirements, which might include different
+threshold levels, custom logging mechanisms, or integration with your organization's data
+pipelines and notification systems.
+
+See Also
+--------
+- [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`): a similar
+method that raises an exception if thresholds are exceeded
+- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
+step
+- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
+- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
+validation step
+
+
 n(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, int] | int'

 Provides a dictionary of the number of test units for each validation step.
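A minimal sketch of how the new `assert_below_threshold()` method might be wired into a test suite (assuming a pytest setup; the dataset columns used here are illustrative assumptions):

```python
import pointblank as pb

def test_small_table_quality():
    validation = (
        pb.Validate(
            data=pb.load_dataset(dataset="small_table", tbl_type="polars"),
            thresholds=(0.1, 0.2, 0.3),
        )
        .col_vals_not_null(columns="date")  # column names assumed for illustration
        .col_vals_gt(columns="d", value=0)
    )
    # Interrogation runs automatically if it hasn't been performed yet
    validation.assert_below_threshold(level="error")
```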
@@ -8237,7 +8689,7 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
 Get the 'critical' level status for each validation step.
 
 The 'critical' status for a validation step is `True` if the fraction of failing test units
-meets or exceeds the threshold for the
+meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`.
 
 The ascribed name of 'critical' is semantic and is thus simply a status indicator that could
 be used to trigger some action to be take. Here's how it fits in with other status
@@ -8249,14 +8701,14 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
   severity
 - 'critical': the status obtained by calling `critical()`, most severe
 
-This method provides a dictionary of the
-
-
+This method provides a dictionary of the 'critical' status for each validation step. If the
+`scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar
+instead of a dictionary.
 
 Parameters
 ----------
 i
-    The validation step number(s) from which the
+    The validation step number(s) from which the 'critical' status is obtained. Can be
     provided as a list of integers or a single integer. If `None`, all steps are included.
 scalar
     If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary.
@@ -8264,7 +8716,7 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
 Returns
 -------
 dict[int, bool] | bool
-    A dictionary of the
+    A dictionary of the 'critical' status for each validation step or a scalar value.
 
 Examples
 --------
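
The hunk stops at the `Examples` heading, so the examples themselves aren't visible here. As a hedged sketch of how the `critical()` accessor described above is called (the dataset and steps are illustrative; only the `i=` and `scalar=` parameters come from the hunk):

```python
import pointblank as pb

# Illustrative validation whose thresholds include a 'critical' level
validation = (
    pb.Validate(data=pb.load_dataset("small_table"), thresholds=(0.1, 0.2, 0.3))
    .col_vals_gt(columns="d", value=0)
    .col_vals_lt(columns="c", value=10)
    .interrogate()
)

validation.critical()                  # e.g. {1: False, 2: False} -- one bool per step
validation.critical(i=2)               # restrict the dictionary to step 2
validation.critical(i=2, scalar=True)  # a single bool for step 2
```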
@@ -8344,7 +8796,7 @@ datasets included in the package can be accessed via the `load_dataset()` functi
 `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
 the `assistant()` function to get help with Pointblank.
 
-DataScan(data: '
+DataScan(data: 'IntoFrameT', tbl_name: 'str | None' = None) -> 'None'
 
 Get a summary of a dataset.
 
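
The hunk above only shows the updated `DataScan` signature, so the sketch below is limited to construction; how the resulting scan is rendered or exported isn't visible in this hunk and is therefore omitted. The `game_revenue` dataset and the `tbl_name` value are illustrative.

```python
import pointblank as pb

# Build a data scan from any supported table input; `tbl_name` is an optional label
scan = pb.DataScan(data=pb.load_dataset("game_revenue"), tbl_name="game_revenue")
```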
@@ -8458,8 +8910,14 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
 Parameters
 ----------
 data
-    The table to preview, which could be a DataFrame object
-
+    The table to preview, which could be a DataFrame object, an Ibis table object, a CSV
+    file path, a Parquet file path, or a database connection string. When providing a CSV or
+    Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
+    loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
+    glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
+    Connection strings enable direct database access via Ibis with optional table specification
+    using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
+    on the supported table types.
 columns_subset
     The columns to display in the table, by default `None` (all columns are shown). This can
     be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter
@@ -8504,13 +8962,40 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
+- CSV files (string path or `pathlib.Path` object with `.csv` extension)
+- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+  extension, or partitioned dataset)
+- Database connection strings (URI format with optional table specification)
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
 `ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables
 requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
 Pandas DataFrame, the availability of Ibis is not needed.
 
+To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+provided. The file will be automatically detected and loaded using the best available DataFrame
+library. The loading preference is Polars first, then Pandas as a fallback.
+
+Connection strings follow database URL formats and must also specify a table using the
+`::table_name` suffix. Examples include:
+
+```
+"duckdb:///path/to/database.ddb::table_name"
+"sqlite:///path/to/database.db::table_name"
+"postgresql://user:password@localhost:5432/database::table_name"
+"mysql://user:password@localhost:3306/database::table_name"
+"bigquery://project/dataset::table_name"
+"snowflake://user:password@account/database/schema::table_name"
+```
+
+When using connection strings, the Ibis library with the appropriate backend driver is required.
+
 Examples
 --------
 It's easy to preview a table using the `preview()` function. Here's an example using the
@@ -8569,6 +9054,39 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
     columns_subset=pb.col(pb.starts_with("item") | pb.matches("player"))
 )
 ```
+
+### Working with CSV Files
+
+The `preview()` function can directly accept CSV file paths, making it easy to preview data
+stored in CSV files without manual loading:
+
+You can also use a Path object to specify the CSV file:
+
+### Working with Parquet Files
+
+The `preview()` function can directly accept Parquet files and datasets in various formats:
+
+You can also use glob patterns and directories:
+
+```python
+# Multiple Parquet files with glob patterns
+pb.preview("data/sales_*.parquet")
+
+# Directory containing Parquet files
+pb.preview("parquet_data/")
+
+# Partitioned Parquet dataset
+pb.preview("sales_data/") # Auto-discovers partition columns
+```
+
+### Working with Database Connection Strings
+
+The `preview()` function supports database connection strings for direct preview of database
+tables. Connection strings must specify a table using the `::table_name` suffix:
+
+For comprehensive documentation on supported connection string formats, error handling, and
+installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
+function.
 
 
 col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
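
The new 'Working with CSV Files' and 'Working with Database Connection Strings' subsections above end with colons, but their snippets fall outside this hunk. A hedged sketch of the three input styles they describe, using the bundled dataset paths from `get_data_path()` (added later in this diff); the DuckDB URI is a placeholder, not a real file:

```python
import pointblank as pb
from pathlib import Path

# CSV file given as a string path (taken here from the package's bundled data)
csv_path = pb.get_data_path("small_table", "csv")
pb.preview(csv_path)

# The same CSV supplied as a pathlib.Path object
pb.preview(Path(csv_path))

# A database table via a connection string with the `::table_name` suffix;
# substitute your own database file and table name.
pb.preview("duckdb:///path/to/database.ddb::table_name")
```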
@@ -8672,6 +9190,11 @@ missing_vals_tbl(data: 'FrameT | Any') -> 'GT'
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -8932,6 +9455,104 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'glo
 regions: North America, Europe, or Asia.
 
 
+get_data_path(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', file_type: "Literal['csv', 'parquet', 'duckdb']" = 'csv') -> 'str'
+
+Get the file path to a dataset included with the Pointblank package.
+
+This function provides direct access to the file paths of datasets included with Pointblank.
+These paths can be used in examples and documentation to demonstrate file-based data loading
+without requiring the actual data files. The returned paths can be used with
+`Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities.
+
+Parameters
+----------
+dataset
+    The name of the dataset to get the path for. Current options are `"small_table"`,
+    `"game_revenue"`, `"nycflights"`, and `"global_sales"`.
+file_type
+    The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`.
+
+Returns
+-------
+str
+    The file path to the requested dataset file.
+
+Included Datasets
+-----------------
+The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`):
+
+- `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples.
+- `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company.
+- `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports.
+- `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions.
+
+File Types
+----------
+Each dataset is available in multiple formats:
+
+- `"csv"`: Comma-separated values file (`.csv`)
+- `"parquet"`: Parquet file (`.parquet`)
+- `"duckdb"`: DuckDB database file (`.ddb`)
+
+Examples
+--------
+Get the path to a CSV file and use it with `Validate`:
+
+```python
+import pointblank as pb
+
+# Get path to the small_table CSV file
+csv_path = pb.get_data_path("small_table", "csv")
+print(csv_path)
+
+# Use the path directly with Validate
+validation = (
+    pb.Validate(data=csv_path)
+    .col_exists(["a", "b", "c"])
+    .col_vals_gt(columns="d", value=0)
+    .interrogate()
+)
+
+validation
+```
+
+Get a Parquet file path for validation examples:
+
+```python
+# Get path to the game_revenue Parquet file
+parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet")
+
+# Validate the Parquet file directly
+validation = (
+    pb.Validate(data=parquet_path, label="Game Revenue Data Validation")
+    .col_vals_not_null(columns=["player_id", "session_id"])
+    .col_vals_gt(columns="item_revenue", value=0)
+    .interrogate()
+)
+
+validation
+```
+
+This is particularly useful for documentation examples where you want to demonstrate
+file-based workflows without requiring users to have specific data files:
+
+```python
+# Example showing CSV file validation
+sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv")
+
+validation = (
+    pb.Validate(data=sales_csv, label="Sales Data Validation")
+    .col_exists(["customer_id", "product_id", "amount"])
+    .col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}")
+    .interrogate()
+)
+```
+
+See Also
+--------
+[`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects.
+
+
 
 ## The Utility Functions family
 
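
One more hedged sketch for the `get_data_path()` entry above, combining the `"duckdb"` file type with the connection-string input described earlier in this diff. The table name inside the bundled `.ddb` file is assumed to be `small_table` here, and the exact slash count in the URI follows the backend's URL rules; verify both before relying on this.

```python
import pointblank as pb

# Path to the bundled DuckDB database file for the small_table dataset
ddb_path = pb.get_data_path(dataset="small_table", file_type="duckdb")
print(ddb_path)  # e.g. .../data/small_table.ddb

# Assumed table name inside the .ddb file: "small_table"
validation = (
    pb.Validate(data=f"duckdb://{ddb_path}::small_table")
    .col_exists(["a", "b", "c"])
    .interrogate()
)
```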
@@ -8971,6 +9592,11 @@ get_column_count(data: 'FrameT | Any') -> 'int'
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -9028,6 +9654,11 @@ get_row_count(data: 'FrameT | Any') -> 'int'
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -9467,7 +10098,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
         thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
         actions=pb.Actions(critical=notify_slack),
     )
-    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
     .col_vals_gt(columns="item_revenue", value=0.05)
     .col_vals_gt(columns="session_duration", value=15)
     .interrogate()
@@ -9499,7 +10130,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
         thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
         final_actions=pb.FinalActions(notify_slack),
     )
-    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
     .col_vals_gt(columns="item_revenue", value=0.05)
     .col_vals_gt(columns="session_duration", value=15)
     .interrogate()
@@ -9567,7 +10198,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
         actions=pb.Actions(default=notify_slack),
         final_actions=pb.FinalActions(notify_slack),
     )
-    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
     .col_vals_gt(columns="item_revenue", value=0.05)
     .col_vals_gt(columns="session_duration", value=15)
     .interrogate()
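
The three hunks above change the `col_vals_regex()` pattern in this example to `[A-Z]{12}[0-9]{3}` (the removed lines are cut off in this rendering, so the previous pattern isn't fully visible). A standalone sketch of what the completed pattern matches; the sample IDs are made up:

```python
import re

# Twelve uppercase letters followed by three digits
pattern = r"[A-Z]{12}[0-9]{3}"

print(bool(re.fullmatch(pattern, "ABCDEFGHIJKL123")))  # True
print(bool(re.fullmatch(pattern, "ABCDEF123")))        # False: only six letters
```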