pointblank 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +6 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +128 -0
- pointblank/_utils_html.py +40 -0
- pointblank/actions.py +3 -3
- pointblank/assistant.py +1 -3
- pointblank/column.py +4 -4
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +769 -138
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/thresholds.py +2 -2
- pointblank/validate.py +1594 -207
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/METADATA +6 -3
- pointblank-0.10.0.dist-info/RECORD +37 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/WHEEL +1 -1
- pointblank-0.9.5.dist-info/RECORD +0 -33
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
|
@@ -17,6 +17,7 @@ from zipfile import ZipFile
|
|
|
17
17
|
import commonmark
|
|
18
18
|
import narwhals as nw
|
|
19
19
|
from great_tables import GT, from_column, google_font, html, loc, md, style, vals
|
|
20
|
+
from great_tables.gt import _get_column_of_values
|
|
20
21
|
from great_tables.vals import fmt_integer, fmt_number
|
|
21
22
|
from importlib_resources import files
|
|
22
23
|
from narwhals.typing import FrameT
|
|
@@ -64,11 +65,15 @@ from pointblank._typing import SegmentSpec
|
|
|
64
65
|
from pointblank._utils import (
|
|
65
66
|
_check_any_df_lib,
|
|
66
67
|
_check_invalid_fields,
|
|
68
|
+
_count_null_values_in_column,
|
|
69
|
+
_count_true_values_in_column,
|
|
67
70
|
_derive_bounds,
|
|
68
71
|
_format_to_integer_value,
|
|
69
72
|
_get_fn_name,
|
|
70
73
|
_get_tbl_type,
|
|
74
|
+
_is_lazy_frame,
|
|
71
75
|
_is_lib_present,
|
|
76
|
+
_is_narwhals_table,
|
|
72
77
|
_is_value_a_df,
|
|
73
78
|
_select_df_lib,
|
|
74
79
|
)
|
|
@@ -99,11 +104,13 @@ __all__ = [
|
|
|
99
104
|
"Validate",
|
|
100
105
|
"load_dataset",
|
|
101
106
|
"config",
|
|
107
|
+
"connect_to_table",
|
|
102
108
|
"preview",
|
|
103
109
|
"missing_vals_tbl",
|
|
110
|
+
"get_action_metadata",
|
|
104
111
|
"get_column_count",
|
|
112
|
+
"get_data_path",
|
|
105
113
|
"get_row_count",
|
|
106
|
-
"get_action_metadata",
|
|
107
114
|
"get_validation_summary",
|
|
108
115
|
]
|
|
109
116
|
|
|
@@ -495,7 +502,9 @@ def load_dataset(
|
|
|
495
502
|
raise ValueError(
|
|
496
503
|
f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
|
|
497
504
|
"- `small_table`\n"
|
|
498
|
-
"- `game_revenue
|
|
505
|
+
"- `game_revenue`\n"
|
|
506
|
+
"- `nycflights`\n"
|
|
507
|
+
"- `global_sales`"
|
|
499
508
|
)
|
|
500
509
|
|
|
501
510
|
# Raise an error if the `tbl_type=` value is not of the supported types
|
|
@@ -560,6 +569,405 @@ def load_dataset(
|
|
|
560
569
|
return dataset
|
|
561
570
|
|
|
562
571
|
|
|
572
|
+
def get_data_path(
|
|
573
|
+
dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
|
|
574
|
+
file_type: Literal["csv", "parquet", "duckdb"] = "csv",
|
|
575
|
+
) -> str:
|
|
576
|
+
"""
|
|
577
|
+
Get the file path to a dataset included with the Pointblank package.
|
|
578
|
+
|
|
579
|
+
This function provides direct access to the file paths of datasets included with Pointblank.
|
|
580
|
+
These paths can be used in examples and documentation to demonstrate file-based data loading
|
|
581
|
+
without requiring the actual data files. The returned paths can be used with
|
|
582
|
+
`Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities.
|
|
583
|
+
|
|
584
|
+
Parameters
|
|
585
|
+
----------
|
|
586
|
+
dataset
|
|
587
|
+
The name of the dataset to get the path for. Current options are `"small_table"`,
|
|
588
|
+
`"game_revenue"`, `"nycflights"`, and `"global_sales"`.
|
|
589
|
+
file_type
|
|
590
|
+
The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`.
|
|
591
|
+
|
|
592
|
+
Returns
|
|
593
|
+
-------
|
|
594
|
+
str
|
|
595
|
+
The file path to the requested dataset file.
|
|
596
|
+
|
|
597
|
+
Included Datasets
|
|
598
|
+
-----------------
|
|
599
|
+
The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`):
|
|
600
|
+
|
|
601
|
+
- `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples.
|
|
602
|
+
- `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company.
|
|
603
|
+
- `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports.
|
|
604
|
+
- `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions.
|
|
605
|
+
|
|
606
|
+
File Types
|
|
607
|
+
----------
|
|
608
|
+
Each dataset is available in multiple formats:
|
|
609
|
+
|
|
610
|
+
- `"csv"`: Comma-separated values file (`.csv`)
|
|
611
|
+
- `"parquet"`: Parquet file (`.parquet`)
|
|
612
|
+
- `"duckdb"`: DuckDB database file (`.ddb`)
|
|
613
|
+
|
|
614
|
+
Examples
|
|
615
|
+
--------
|
|
616
|
+
Get the path to a CSV file and use it with `Validate`:
|
|
617
|
+
|
|
618
|
+
```{python}
|
|
619
|
+
import pointblank as pb
|
|
620
|
+
|
|
621
|
+
# Get path to the small_table CSV file
|
|
622
|
+
csv_path = pb.get_data_path("small_table", "csv")
|
|
623
|
+
print(csv_path)
|
|
624
|
+
|
|
625
|
+
# Use the path directly with Validate
|
|
626
|
+
validation = (
|
|
627
|
+
pb.Validate(data=csv_path)
|
|
628
|
+
.col_exists(["a", "b", "c"])
|
|
629
|
+
.col_vals_gt(columns="d", value=0)
|
|
630
|
+
.interrogate()
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
validation
|
|
634
|
+
```
|
|
635
|
+
|
|
636
|
+
Get a Parquet file path for validation examples:
|
|
637
|
+
|
|
638
|
+
```{python}
|
|
639
|
+
# Get path to the game_revenue Parquet file
|
|
640
|
+
parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet")
|
|
641
|
+
|
|
642
|
+
# Validate the Parquet file directly
|
|
643
|
+
validation = (
|
|
644
|
+
pb.Validate(data=parquet_path, label="Game Revenue Data Validation")
|
|
645
|
+
.col_vals_not_null(columns=["player_id", "session_id"])
|
|
646
|
+
.col_vals_gt(columns="item_revenue", value=0)
|
|
647
|
+
.interrogate()
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
validation
|
|
651
|
+
```
|
|
652
|
+
|
|
653
|
+
This is particularly useful for documentation examples where you want to demonstrate
|
|
654
|
+
file-based workflows without requiring users to have specific data files:
|
|
655
|
+
|
|
656
|
+
```{python}
|
|
657
|
+
# Example showing CSV file validation
|
|
658
|
+
sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv")
|
|
659
|
+
|
|
660
|
+
validation = (
|
|
661
|
+
pb.Validate(data=sales_csv, label="Sales Data Validation")
|
|
662
|
+
.col_exists(["customer_id", "product_id", "amount"])
|
|
663
|
+
.col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}")
|
|
664
|
+
.interrogate()
|
|
665
|
+
)
|
|
666
|
+
```
|
|
667
|
+
|
|
668
|
+
See Also
|
|
669
|
+
--------
|
|
670
|
+
[`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects.
|
|
671
|
+
"""
|
|
672
|
+
|
|
673
|
+
# Validate inputs
|
|
674
|
+
if dataset not in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
675
|
+
raise ValueError(
|
|
676
|
+
f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
|
|
677
|
+
"- `small_table`\n"
|
|
678
|
+
"- `game_revenue`\n"
|
|
679
|
+
"- `nycflights`\n"
|
|
680
|
+
"- `global_sales`"
|
|
681
|
+
)
|
|
682
|
+
|
|
683
|
+
if file_type not in ["csv", "parquet", "duckdb"]:
|
|
684
|
+
raise ValueError(
|
|
685
|
+
f"The file type `{file_type}` is not valid. Choose one of the following:\n"
|
|
686
|
+
"- `csv`\n"
|
|
687
|
+
"- `parquet`\n"
|
|
688
|
+
"- `duckdb`"
|
|
689
|
+
)
|
|
690
|
+
|
|
691
|
+
if file_type == "csv":
|
|
692
|
+
# Return path to CSV file inside the zip
|
|
693
|
+
data_path = files("pointblank.data") / f"{dataset}.zip"
|
|
694
|
+
|
|
695
|
+
# For CSV files, we need to extract from zip to a temporary location
|
|
696
|
+
# since most libraries expect actual file paths, not zip contents
|
|
697
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as tmp_file:
|
|
698
|
+
with ZipFile(data_path) as zip_file:
|
|
699
|
+
csv_content = zip_file.read(f"{dataset}.csv")
|
|
700
|
+
tmp_file.write(csv_content)
|
|
701
|
+
return tmp_file.name
|
|
702
|
+
|
|
703
|
+
elif file_type == "parquet":
|
|
704
|
+
# Create a temporary parquet file from the CSV data
|
|
705
|
+
data_path = files("pointblank.data") / f"{dataset}.zip"
|
|
706
|
+
|
|
707
|
+
# We'll need to convert CSV to Parquet temporarily
|
|
708
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".parquet", delete=False) as tmp_file:
|
|
709
|
+
# Load CSV data and save as Parquet
|
|
710
|
+
if _is_lib_present(lib_name="polars"):
|
|
711
|
+
import polars as pl
|
|
712
|
+
|
|
713
|
+
df = pl.read_csv(ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True)
|
|
714
|
+
df.write_parquet(tmp_file.name)
|
|
715
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
716
|
+
import pandas as pd
|
|
717
|
+
|
|
718
|
+
df = pd.read_csv(data_path)
|
|
719
|
+
df.to_parquet(tmp_file.name, index=False)
|
|
720
|
+
else:
|
|
721
|
+
raise ImportError(
|
|
722
|
+
"Either Polars or Pandas is required to create temporary Parquet files."
|
|
723
|
+
)
|
|
724
|
+
return tmp_file.name
|
|
725
|
+
|
|
726
|
+
elif file_type == "duckdb":
|
|
727
|
+
# Return path to DuckDB file
|
|
728
|
+
data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
|
|
729
|
+
|
|
730
|
+
# Extract DuckDB file to temporary location
|
|
731
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".ddb", delete=False) as tmp_file:
|
|
732
|
+
with ZipFile(data_path) as zip_file:
|
|
733
|
+
ddb_content = zip_file.read(f"{dataset}.ddb")
|
|
734
|
+
tmp_file.write(ddb_content)
|
|
735
|
+
return tmp_file.name
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
# =============================================================================
|
|
739
|
+
# Utility functions for processing input data (shared by preview() and Validate class)
|
|
740
|
+
# =============================================================================
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
|
|
744
|
+
"""
|
|
745
|
+
Process data parameter to handle database connection strings.
|
|
746
|
+
|
|
747
|
+
Uses the `connect_to_table()` utility function to handle URI-formatted connection strings with
|
|
748
|
+
table specifications. Returns the original data if it's not a connection string.
|
|
749
|
+
|
|
750
|
+
For more details on supported connection string formats, see the documentation
|
|
751
|
+
for `connect_to_table()`.
|
|
752
|
+
"""
|
|
753
|
+
# Check if data is a string that looks like a connection string
|
|
754
|
+
if not isinstance(data, str):
|
|
755
|
+
return data
|
|
756
|
+
|
|
757
|
+
# Basic connection string patterns
|
|
758
|
+
connection_patterns = [
|
|
759
|
+
"://", # General URL-like pattern
|
|
760
|
+
]
|
|
761
|
+
|
|
762
|
+
# Check if it looks like a connection string
|
|
763
|
+
if not any(pattern in data for pattern in connection_patterns):
|
|
764
|
+
return data
|
|
765
|
+
|
|
766
|
+
# Use the utility function to connect to the table
|
|
767
|
+
return connect_to_table(data)
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
|
|
771
|
+
"""
|
|
772
|
+
Process data parameter to handle CSV file inputs.
|
|
773
|
+
|
|
774
|
+
If data is a string or Path with .csv extension, reads the CSV file
|
|
775
|
+
using available libraries (Polars preferred, then Pandas).
|
|
776
|
+
|
|
777
|
+
Returns the original data if it's not a CSV file path.
|
|
778
|
+
"""
|
|
779
|
+
from pathlib import Path
|
|
780
|
+
|
|
781
|
+
# Check if data is a string or Path-like object with .csv extension
|
|
782
|
+
csv_path = None
|
|
783
|
+
|
|
784
|
+
if isinstance(data, (str, Path)):
|
|
785
|
+
path_obj = Path(data)
|
|
786
|
+
if path_obj.suffix.lower() == ".csv":
|
|
787
|
+
csv_path = path_obj
|
|
788
|
+
|
|
789
|
+
# If it's not a CSV file path, return the original data
|
|
790
|
+
if csv_path is None:
|
|
791
|
+
return data
|
|
792
|
+
|
|
793
|
+
# Check if the CSV file exists
|
|
794
|
+
if not csv_path.exists():
|
|
795
|
+
raise FileNotFoundError(f"CSV file not found: {csv_path}")
|
|
796
|
+
|
|
797
|
+
# Determine which library to use for reading CSV
|
|
798
|
+
# Prefer Polars, fallback to Pandas
|
|
799
|
+
if _is_lib_present(lib_name="polars"):
|
|
800
|
+
try:
|
|
801
|
+
import polars as pl
|
|
802
|
+
|
|
803
|
+
return pl.read_csv(csv_path, try_parse_dates=True)
|
|
804
|
+
except Exception as e:
|
|
805
|
+
# If Polars fails, try Pandas if available
|
|
806
|
+
if _is_lib_present(lib_name="pandas"):
|
|
807
|
+
import pandas as pd
|
|
808
|
+
|
|
809
|
+
return pd.read_csv(csv_path)
|
|
810
|
+
else:
|
|
811
|
+
raise RuntimeError(
|
|
812
|
+
f"Failed to read CSV file with Polars: {e}. "
|
|
813
|
+
"Pandas is not available as fallback."
|
|
814
|
+
) from e
|
|
815
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
816
|
+
try:
|
|
817
|
+
import pandas as pd
|
|
818
|
+
|
|
819
|
+
return pd.read_csv(csv_path)
|
|
820
|
+
except Exception as e:
|
|
821
|
+
raise RuntimeError(f"Failed to read CSV file with Pandas: {e}") from e
|
|
822
|
+
else:
|
|
823
|
+
raise ImportError(
|
|
824
|
+
"Neither Polars nor Pandas is available for reading CSV files. "
|
|
825
|
+
"Please install either 'polars' or 'pandas' to use CSV file inputs."
|
|
826
|
+
)
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
|
|
830
|
+
"""
|
|
831
|
+
Process data parameter to handle Parquet file inputs.
|
|
832
|
+
|
|
833
|
+
Supports:
|
|
834
|
+
- single .parquet file (string or Path)
|
|
835
|
+
- glob patterns for multiple .parquet files (e.g., "data/*.parquet")
|
|
836
|
+
- directory containing .parquet files
|
|
837
|
+
- partitioned Parquet datasets with automatic partition column inference
|
|
838
|
+
- list/sequence of .parquet file paths
|
|
839
|
+
|
|
840
|
+
Returns the original data if it's not a Parquet file input.
|
|
841
|
+
"""
|
|
842
|
+
import glob
|
|
843
|
+
from pathlib import Path
|
|
844
|
+
|
|
845
|
+
parquet_paths = []
|
|
846
|
+
|
|
847
|
+
# Handle different input types
|
|
848
|
+
if isinstance(data, (str, Path)):
|
|
849
|
+
data_str = str(data)
|
|
850
|
+
path_obj = Path(data)
|
|
851
|
+
|
|
852
|
+
# Check if it's a glob pattern containing .parquet first; look for glob
|
|
853
|
+
# characters: `*`, `?`, `[`, `]`
|
|
854
|
+
if ".parquet" in data_str.lower() and any(
|
|
855
|
+
char in data_str for char in ["*", "?", "[", "]"]
|
|
856
|
+
):
|
|
857
|
+
parquet_files = glob.glob(data_str)
|
|
858
|
+
if parquet_files:
|
|
859
|
+
parquet_paths = sorted([Path(f) for f in parquet_files])
|
|
860
|
+
else:
|
|
861
|
+
raise FileNotFoundError(f"No files found matching pattern: {data}")
|
|
862
|
+
|
|
863
|
+
# Check if it's a single .parquet file
|
|
864
|
+
elif path_obj.suffix.lower() == ".parquet":
|
|
865
|
+
if path_obj.exists():
|
|
866
|
+
parquet_paths = [path_obj]
|
|
867
|
+
else:
|
|
868
|
+
raise FileNotFoundError(f"Parquet file not found: {path_obj}")
|
|
869
|
+
|
|
870
|
+
# Check if it's a directory
|
|
871
|
+
elif path_obj.is_dir():
|
|
872
|
+
# First, try to read as a partitioned parquet dataset; This handles datasets where
|
|
873
|
+
# Parquet files are in subdirectories with partition columns encoded in paths
|
|
874
|
+
try:
|
|
875
|
+
# Both Polars and Pandas can handle partitioned datasets natively
|
|
876
|
+
if _is_lib_present(lib_name="polars"):
|
|
877
|
+
import polars as pl
|
|
878
|
+
|
|
879
|
+
# Try reading as partitioned dataset first
|
|
880
|
+
df = pl.read_parquet(str(path_obj))
|
|
881
|
+
return df
|
|
882
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
883
|
+
import pandas as pd
|
|
884
|
+
|
|
885
|
+
# Try reading as partitioned dataset first
|
|
886
|
+
df = pd.read_parquet(str(path_obj))
|
|
887
|
+
return df
|
|
888
|
+
except Exception:
|
|
889
|
+
# If partitioned read fails, fall back to simple directory scan
|
|
890
|
+
pass
|
|
891
|
+
|
|
892
|
+
# Fallback: Look for .parquet files directly in the directory
|
|
893
|
+
parquet_files = list(path_obj.glob("*.parquet"))
|
|
894
|
+
if parquet_files:
|
|
895
|
+
parquet_paths = sorted(parquet_files)
|
|
896
|
+
else:
|
|
897
|
+
raise FileNotFoundError(
|
|
898
|
+
f"No .parquet files found in directory: {path_obj}. "
|
|
899
|
+
f"This could be a non-partitioned directory without .parquet files, "
|
|
900
|
+
f"or a partitioned dataset that couldn't be read."
|
|
901
|
+
)
|
|
902
|
+
|
|
903
|
+
# If it's not a parquet file, directory, or glob pattern, return original data
|
|
904
|
+
else:
|
|
905
|
+
return data
|
|
906
|
+
|
|
907
|
+
# Handle list/sequence of paths
|
|
908
|
+
elif isinstance(data, (list, tuple)):
|
|
909
|
+
for item in data:
|
|
910
|
+
item_path = Path(item)
|
|
911
|
+
if item_path.suffix.lower() == ".parquet":
|
|
912
|
+
if item_path.exists():
|
|
913
|
+
parquet_paths.append(item_path)
|
|
914
|
+
else:
|
|
915
|
+
raise FileNotFoundError(f"Parquet file not found: {item_path}")
|
|
916
|
+
else:
|
|
917
|
+
# If any item is not a parquet file, return original data
|
|
918
|
+
return data
|
|
919
|
+
|
|
920
|
+
# If no parquet files found, return original data
|
|
921
|
+
if not parquet_paths:
|
|
922
|
+
return data
|
|
923
|
+
|
|
924
|
+
# Read the parquet file(s) using available libraries; prefer Polars, fallback to Pandas
|
|
925
|
+
if _is_lib_present(lib_name="polars"):
|
|
926
|
+
try:
|
|
927
|
+
import polars as pl
|
|
928
|
+
|
|
929
|
+
if len(parquet_paths) == 1:
|
|
930
|
+
# Single file
|
|
931
|
+
return pl.read_parquet(parquet_paths[0])
|
|
932
|
+
else:
|
|
933
|
+
# Multiple files: concatenate them
|
|
934
|
+
dfs = [pl.read_parquet(path) for path in parquet_paths]
|
|
935
|
+
return pl.concat(dfs, how="vertical_relaxed")
|
|
936
|
+
except Exception as e:
|
|
937
|
+
# If Polars fails, try Pandas if available
|
|
938
|
+
if _is_lib_present(lib_name="pandas"):
|
|
939
|
+
import pandas as pd
|
|
940
|
+
|
|
941
|
+
if len(parquet_paths) == 1:
|
|
942
|
+
return pd.read_parquet(parquet_paths[0])
|
|
943
|
+
else:
|
|
944
|
+
# Multiple files: concatenate them
|
|
945
|
+
dfs = [pd.read_parquet(path) for path in parquet_paths]
|
|
946
|
+
return pd.concat(dfs, ignore_index=True)
|
|
947
|
+
else:
|
|
948
|
+
raise RuntimeError(
|
|
949
|
+
f"Failed to read Parquet file(s) with Polars: {e}. "
|
|
950
|
+
"Pandas is not available as fallback."
|
|
951
|
+
) from e
|
|
952
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
953
|
+
try:
|
|
954
|
+
import pandas as pd
|
|
955
|
+
|
|
956
|
+
if len(parquet_paths) == 1:
|
|
957
|
+
return pd.read_parquet(parquet_paths[0])
|
|
958
|
+
else:
|
|
959
|
+
# Multiple files: concatenate them
|
|
960
|
+
dfs = [pd.read_parquet(path) for path in parquet_paths]
|
|
961
|
+
return pd.concat(dfs, ignore_index=True)
|
|
962
|
+
except Exception as e:
|
|
963
|
+
raise RuntimeError(f"Failed to read Parquet file(s) with Pandas: {e}") from e
|
|
964
|
+
else:
|
|
965
|
+
raise ImportError(
|
|
966
|
+
"Neither Polars nor Pandas is available for reading Parquet files. "
|
|
967
|
+
"Please install either 'polars' or 'pandas' to use Parquet file inputs."
|
|
968
|
+
)
|
|
969
|
+
|
|
970
|
+
|
|
563
971
|
def preview(
|
|
564
972
|
data: FrameT | Any,
|
|
565
973
|
columns_subset: str | list[str] | Column | None = None,
|
|
@@ -590,8 +998,14 @@ def preview(
|
|
|
590
998
|
Parameters
|
|
591
999
|
----------
|
|
592
1000
|
data
|
|
593
|
-
The table to preview, which could be a DataFrame object
|
|
594
|
-
|
|
1001
|
+
The table to preview, which could be a DataFrame object, an Ibis table object, a CSV
|
|
1002
|
+
file path, a Parquet file path, or a database connection string. When providing a CSV or
|
|
1003
|
+
Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
|
|
1004
|
+
loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
|
|
1005
|
+
glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
|
|
1006
|
+
Connection strings enable direct database access via Ibis with optional table specification
|
|
1007
|
+
using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
|
|
1008
|
+
on the supported table types.
|
|
595
1009
|
columns_subset
|
|
596
1010
|
The columns to display in the table, by default `None` (all columns are shown). This can
|
|
597
1011
|
be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter
|
|
@@ -636,13 +1050,40 @@ def preview(
|
|
|
636
1050
|
- MySQL table (`"mysql"`)*
|
|
637
1051
|
- PostgreSQL table (`"postgresql"`)*
|
|
638
1052
|
- SQLite table (`"sqlite"`)*
|
|
1053
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
1054
|
+
- Snowflake table (`"snowflake"`)*
|
|
1055
|
+
- Databricks table (`"databricks"`)*
|
|
1056
|
+
- PySpark table (`"pyspark"`)*
|
|
1057
|
+
- BigQuery table (`"bigquery"`)*
|
|
639
1058
|
- Parquet table (`"parquet"`)*
|
|
1059
|
+
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
|
|
1060
|
+
- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
|
|
1061
|
+
extension, or partitioned dataset)
|
|
1062
|
+
- Database connection strings (URI format with optional table specification)
|
|
640
1063
|
|
|
641
1064
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
642
1065
|
`ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables
|
|
643
1066
|
requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
|
|
644
1067
|
Pandas DataFrame, the availability of Ibis is not needed.
|
|
645
1068
|
|
|
1069
|
+
To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
|
|
1070
|
+
provided. The file will be automatically detected and loaded using the best available DataFrame
|
|
1071
|
+
library. The loading preference is Polars first, then Pandas as a fallback.
|
|
1072
|
+
|
|
1073
|
+
Connection strings follow database URL formats and must also specify a table using the
|
|
1074
|
+
`::table_name` suffix. Examples include:
|
|
1075
|
+
|
|
1076
|
+
```
|
|
1077
|
+
"duckdb:///path/to/database.ddb::table_name"
|
|
1078
|
+
"sqlite:///path/to/database.db::table_name"
|
|
1079
|
+
"postgresql://user:password@localhost:5432/database::table_name"
|
|
1080
|
+
"mysql://user:password@localhost:3306/database::table_name"
|
|
1081
|
+
"bigquery://project/dataset::table_name"
|
|
1082
|
+
"snowflake://user:password@account/database/schema::table_name"
|
|
1083
|
+
```
|
|
1084
|
+
|
|
1085
|
+
When using connection strings, the Ibis library with the appropriate backend driver is required.
|
|
1086
|
+
|
|
646
1087
|
Examples
|
|
647
1088
|
--------
|
|
648
1089
|
It's easy to preview a table using the `preview()` function. Here's an example using the
|
|
@@ -709,8 +1150,80 @@ def preview(
|
|
|
709
1150
|
columns_subset=pb.col(pb.starts_with("item") | pb.matches("player"))
|
|
710
1151
|
)
|
|
711
1152
|
```
|
|
1153
|
+
|
|
1154
|
+
### Working with CSV Files
|
|
1155
|
+
|
|
1156
|
+
The `preview()` function can directly accept CSV file paths, making it easy to preview data
|
|
1157
|
+
stored in CSV files without manual loading:
|
|
1158
|
+
|
|
1159
|
+
```{python}
|
|
1160
|
+
# Get a path to a CSV file from the package data
|
|
1161
|
+
csv_path = pb.get_data_path("global_sales", "csv")
|
|
1162
|
+
|
|
1163
|
+
pb.preview(csv_path)
|
|
1164
|
+
```
|
|
1165
|
+
|
|
1166
|
+
You can also use a Path object to specify the CSV file:
|
|
1167
|
+
|
|
1168
|
+
```{python}
|
|
1169
|
+
from pathlib import Path
|
|
1170
|
+
|
|
1171
|
+
csv_file = Path(pb.get_data_path("game_revenue", "csv"))
|
|
1172
|
+
|
|
1173
|
+
pb.preview(csv_file, n_head=3, n_tail=3)
|
|
1174
|
+
```
|
|
1175
|
+
|
|
1176
|
+
### Working with Parquet Files
|
|
1177
|
+
|
|
1178
|
+
The `preview()` function can directly accept Parquet files and datasets in various formats:
|
|
1179
|
+
|
|
1180
|
+
```{python}
|
|
1181
|
+
# Single Parquet file from package data
|
|
1182
|
+
parquet_path = pb.get_data_path("nycflights", "parquet")
|
|
1183
|
+
|
|
1184
|
+
pb.preview(parquet_path)
|
|
1185
|
+
```
|
|
1186
|
+
|
|
1187
|
+
You can also use glob patterns and directories:
|
|
1188
|
+
|
|
1189
|
+
```python
|
|
1190
|
+
# Multiple Parquet files with glob patterns
|
|
1191
|
+
pb.preview("data/sales_*.parquet")
|
|
1192
|
+
|
|
1193
|
+
# Directory containing Parquet files
|
|
1194
|
+
pb.preview("parquet_data/")
|
|
1195
|
+
|
|
1196
|
+
# Partitioned Parquet dataset
|
|
1197
|
+
pb.preview("sales_data/") # Auto-discovers partition columns
|
|
1198
|
+
```
|
|
1199
|
+
|
|
1200
|
+
### Working with Database Connection Strings
|
|
1201
|
+
|
|
1202
|
+
The `preview()` function supports database connection strings for direct preview of database
|
|
1203
|
+
tables. Connection strings must specify a table using the `::table_name` suffix:
|
|
1204
|
+
|
|
1205
|
+
```{python}
|
|
1206
|
+
# Get path to a DuckDB database file from package data
|
|
1207
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
1208
|
+
|
|
1209
|
+
pb.preview(f"duckdb:///{duckdb_path}::game_revenue")
|
|
1210
|
+
```
|
|
1211
|
+
|
|
1212
|
+
For comprehensive documentation on supported connection string formats, error handling, and
|
|
1213
|
+
installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
|
|
1214
|
+
function.
|
|
712
1215
|
"""
|
|
713
1216
|
|
|
1217
|
+
# Process input data to handle different data source types
|
|
1218
|
+
# Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
|
|
1219
|
+
data = _process_connection_string(data)
|
|
1220
|
+
|
|
1221
|
+
# Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
|
|
1222
|
+
data = _process_csv_input(data)
|
|
1223
|
+
|
|
1224
|
+
# Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
|
|
1225
|
+
data = _process_parquet_input(data)
|
|
1226
|
+
|
|
714
1227
|
if incl_header is None:
|
|
715
1228
|
incl_header = global_config.preview_incl_header
|
|
716
1229
|
|
|
@@ -908,7 +1421,7 @@ def _generate_display_table(
|
|
|
908
1421
|
k: v.split("(")[0] if "(" in v else v for k, v in col_dtype_dict.items()
|
|
909
1422
|
}
|
|
910
1423
|
|
|
911
|
-
# Create a dictionary of column and row positions where the value is None/NA/
|
|
1424
|
+
# Create a dictionary of column and row positions where the value is None/NA/Null
|
|
912
1425
|
# This is used to highlight these values in the table
|
|
913
1426
|
if df_lib_name_gt == "polars":
|
|
914
1427
|
none_values = {k: data[k].is_null().to_list() for k in col_names}
|
|
@@ -932,7 +1445,10 @@ def _generate_display_table(
|
|
|
932
1445
|
column_values = gt.gt._get_column_of_values(built_gt, column_name=column, context="html")
|
|
933
1446
|
|
|
934
1447
|
# Get the maximum number of characters in the column
|
|
935
|
-
|
|
1448
|
+
if column_values: # Check if column_values is not empty
|
|
1449
|
+
max_length_col_vals.append(max([len(str(val)) for val in column_values]))
|
|
1450
|
+
else:
|
|
1451
|
+
max_length_col_vals.append(0) # Use 0 for empty columns
|
|
936
1452
|
|
|
937
1453
|
length_col_names = [len(column) for column in col_dtype_dict.keys()]
|
|
938
1454
|
length_data_types = [len(dtype) for dtype in col_dtype_dict_short.values()]
|
|
@@ -1003,8 +1519,12 @@ def _generate_display_table(
|
|
|
1003
1519
|
|
|
1004
1520
|
# Get the highest number in the `row_number_list` and calculate a width that will
|
|
1005
1521
|
# safely fit a number of that magnitude
|
|
1006
|
-
|
|
1007
|
-
|
|
1522
|
+
if row_number_list: # Check if list is not empty
|
|
1523
|
+
max_row_num = max(row_number_list)
|
|
1524
|
+
max_row_num_width = len(str(max_row_num)) * 7.8 + 10
|
|
1525
|
+
else:
|
|
1526
|
+
# If row_number_list is empty, use a default width
|
|
1527
|
+
max_row_num_width = 7.8 * 2 + 10 # Width for 2-digit numbers
|
|
1008
1528
|
|
|
1009
1529
|
# Update the col_width_dict to include the row number column
|
|
1010
1530
|
col_width_dict = {"_row_num_": f"{max_row_num_width}px"} | col_width_dict
|
|
@@ -1134,6 +1654,11 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
|
|
|
1134
1654
|
- MySQL table (`"mysql"`)*
|
|
1135
1655
|
- PostgreSQL table (`"postgresql"`)*
|
|
1136
1656
|
- SQLite table (`"sqlite"`)*
|
|
1657
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
1658
|
+
- Snowflake table (`"snowflake"`)*
|
|
1659
|
+
- Databricks table (`"databricks"`)*
|
|
1660
|
+
- PySpark table (`"pyspark"`)*
|
|
1661
|
+
- BigQuery table (`"bigquery"`)*
|
|
1137
1662
|
- Parquet table (`"parquet"`)*
|
|
1138
1663
|
|
|
1139
1664
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1663,6 +2188,11 @@ def get_column_count(data: FrameT | Any) -> int:
|
|
|
1663
2188
|
- MySQL table (`"mysql"`)*
|
|
1664
2189
|
- PostgreSQL table (`"postgresql"`)*
|
|
1665
2190
|
- SQLite table (`"sqlite"`)*
|
|
2191
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
2192
|
+
- Snowflake table (`"snowflake"`)*
|
|
2193
|
+
- Databricks table (`"databricks"`)*
|
|
2194
|
+
- PySpark table (`"pyspark"`)*
|
|
2195
|
+
- BigQuery table (`"bigquery"`)*
|
|
1666
2196
|
- Parquet table (`"parquet"`)*
|
|
1667
2197
|
|
|
1668
2198
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1707,6 +2237,9 @@ def get_column_count(data: FrameT | Any) -> int:
|
|
|
1707
2237
|
elif "pandas" in str(type(data)):
|
|
1708
2238
|
return data.shape[1]
|
|
1709
2239
|
|
|
2240
|
+
elif "narwhals" in str(type(data)):
|
|
2241
|
+
return len(data.columns)
|
|
2242
|
+
|
|
1710
2243
|
else:
|
|
1711
2244
|
raise ValueError("The input table type supplied in `data=` is not supported.")
|
|
1712
2245
|
|
|
@@ -1741,6 +2274,11 @@ def get_row_count(data: FrameT | Any) -> int:
|
|
|
1741
2274
|
- MySQL table (`"mysql"`)*
|
|
1742
2275
|
- PostgreSQL table (`"postgresql"`)*
|
|
1743
2276
|
- SQLite table (`"sqlite"`)*
|
|
2277
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
2278
|
+
- Snowflake table (`"snowflake"`)*
|
|
2279
|
+
- Databricks table (`"databricks"`)*
|
|
2280
|
+
- PySpark table (`"pyspark"`)*
|
|
2281
|
+
- BigQuery table (`"bigquery"`)*
|
|
1744
2282
|
- Parquet table (`"parquet"`)*
|
|
1745
2283
|
|
|
1746
2284
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1795,6 +2333,9 @@ def get_row_count(data: FrameT | Any) -> int:
|
|
|
1795
2333
|
elif "pandas" in str(type(data)):
|
|
1796
2334
|
return data.shape[0]
|
|
1797
2335
|
|
|
2336
|
+
elif "narwhals" in str(type(data)):
|
|
2337
|
+
return data.shape[0]
|
|
2338
|
+
|
|
1798
2339
|
else:
|
|
1799
2340
|
raise ValueError("The input table type supplied in `data=` is not supported.")
|
|
1800
2341
|
|
|
@@ -1910,6 +2451,239 @@ class _ValidationInfo:
|
|
|
1910
2451
|
return self.val_info
|
|
1911
2452
|
|
|
1912
2453
|
|
|
2454
|
+
def connect_to_table(connection_string: str) -> Any:
|
|
2455
|
+
"""
|
|
2456
|
+
Connect to a database table using a connection string.
|
|
2457
|
+
|
|
2458
|
+
This utility function tests whether a connection string leads to a valid table and returns
|
|
2459
|
+
the table object if successful. It provides helpful error messages when no table is specified
|
|
2460
|
+
or when backend dependencies are missing.
|
|
2461
|
+
|
|
2462
|
+
Parameters
|
|
2463
|
+
----------
|
|
2464
|
+
connection_string
|
|
2465
|
+
A database connection string with a required table specification using the `::table_name`
|
|
2466
|
+
suffix. Supported formats are outlined in the *Supported Connection String Formats* section.
|
|
2467
|
+
|
|
2468
|
+
Returns
|
|
2469
|
+
-------
|
|
2470
|
+
Any
|
|
2471
|
+
An Ibis table object for the specified database table.
|
|
2472
|
+
|
|
2473
|
+
Supported Connection String Formats
|
|
2474
|
+
-----------------------------------
|
|
2475
|
+
The `connection_string` parameter must include a valid connection string with a table name
|
|
2476
|
+
specified using the `::` syntax. Here are some examples on how to format connection strings
|
|
2477
|
+
for various backends:
|
|
2478
|
+
|
|
2479
|
+
```
|
|
2480
|
+
DuckDB: "duckdb:///path/to/database.ddb::table_name"
|
|
2481
|
+
SQLite: "sqlite:///path/to/database.db::table_name"
|
|
2482
|
+
PostgreSQL: "postgresql://user:password@localhost:5432/database::table_name"
|
|
2483
|
+
MySQL: "mysql://user:password@localhost:3306/database::table_name"
|
|
2484
|
+
BigQuery: "bigquery://project/dataset::table_name"
|
|
2485
|
+
Snowflake: "snowflake://user:password@account/database/schema::table_name"
|
|
2486
|
+
```
|
|
2487
|
+
|
|
2488
|
+
If the connection string does not include a table name, the function will attempt to connect to
|
|
2489
|
+
the database and list available tables, providing guidance on how to specify a table.
|
|
2490
|
+
|
|
2491
|
+
Examples
|
|
2492
|
+
--------
|
|
2493
|
+
Connect to a DuckDB table:
|
|
2494
|
+
|
|
2495
|
+
```{python}
|
|
2496
|
+
import pointblank as pb
|
|
2497
|
+
|
|
2498
|
+
# Get path to a DuckDB database file from package data
|
|
2499
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
2500
|
+
|
|
2501
|
+
# Connect to the `game_revenue` table in the DuckDB database
|
|
2502
|
+
game_revenue = pb.connect_to_table(f"duckdb:///{duckdb_path}::game_revenue")
|
|
2503
|
+
|
|
2504
|
+
# Use with the `preview()` function
|
|
2505
|
+
pb.preview(game_revenue)
|
|
2506
|
+
```
|
|
2507
|
+
|
|
2508
|
+
Here are some backend-specific connection examples:
|
|
2509
|
+
|
|
2510
|
+
```python
|
|
2511
|
+
# PostgreSQL
|
|
2512
|
+
pg_table = pb.connect_to_table(
|
|
2513
|
+
"postgresql://user:password@localhost:5432/warehouse::customer_data"
|
|
2514
|
+
)
|
|
2515
|
+
|
|
2516
|
+
# SQLite
|
|
2517
|
+
sqlite_table = pb.connect_to_table("sqlite:///local_data.db::products")
|
|
2518
|
+
|
|
2519
|
+
# BigQuery
|
|
2520
|
+
bq_table = pb.connect_to_table("bigquery://my-project/analytics::daily_metrics")
|
|
2521
|
+
```
|
|
2522
|
+
|
|
2523
|
+
This function requires the Ibis library with appropriate backend drivers:
|
|
2524
|
+
|
|
2525
|
+
```bash
|
|
2526
|
+
# You can install a set of common backends:
|
|
2527
|
+
pip install 'ibis-framework[duckdb,postgres,mysql,sqlite]'
|
|
2528
|
+
|
|
2529
|
+
# ...or specific backends as needed:
|
|
2530
|
+
pip install 'ibis-framework[duckdb]' # for DuckDB
|
|
2531
|
+
pip install 'ibis-framework[postgres]' # for PostgreSQL
|
|
2532
|
+
```
|
|
2533
|
+
"""
|
|
2534
|
+
# Check if Ibis is available
|
|
2535
|
+
if not _is_lib_present(lib_name="ibis"):
|
|
2536
|
+
raise ImportError(
|
|
2537
|
+
"The Ibis library is not installed but is required for database connection strings.\n"
|
|
2538
|
+
"Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
|
|
2539
|
+
)
|
|
2540
|
+
|
|
2541
|
+
import ibis
|
|
2542
|
+
|
|
2543
|
+
# Check if connection string includes table specification
|
|
2544
|
+
if "::" not in connection_string:
|
|
2545
|
+
# Try to connect to get available tables for helpful error message
|
|
2546
|
+
try:
|
|
2547
|
+
# Extract the base connection string (without table name)
|
|
2548
|
+
base_connection = connection_string
|
|
2549
|
+
|
|
2550
|
+
# Connect to the database
|
|
2551
|
+
conn = ibis.connect(base_connection)
|
|
2552
|
+
|
|
2553
|
+
# Get list of available tables
|
|
2554
|
+
try:
|
|
2555
|
+
available_tables = conn.list_tables()
|
|
2556
|
+
except Exception:
|
|
2557
|
+
available_tables = []
|
|
2558
|
+
|
|
2559
|
+
conn.disconnect()
|
|
2560
|
+
|
|
2561
|
+
# Create helpful error message
|
|
2562
|
+
if available_tables:
|
|
2563
|
+
table_list = "\n".join(f" - {table}" for table in available_tables)
|
|
2564
|
+
error_msg = (
|
|
2565
|
+
f"No table specified in connection string: {connection_string}\n\n"
|
|
2566
|
+
f"Available tables in the database:\n{table_list}\n\n"
|
|
2567
|
+
f"To access a specific table, use the format:\n"
|
|
2568
|
+
f" {connection_string}::TABLE_NAME\n\n"
|
|
2569
|
+
f"Examples:\n"
|
|
2570
|
+
)
|
|
2571
|
+
# Add examples with first few table names
|
|
2572
|
+
for table in available_tables[:3]:
|
|
2573
|
+
error_msg += f" {connection_string}::{table}\n"
|
|
2574
|
+
else:
|
|
2575
|
+
error_msg = (
|
|
2576
|
+
f"No table specified in connection string: {connection_string}\n\n"
|
|
2577
|
+
f"No tables found in the database or unable to list tables.\n\n"
|
|
2578
|
+
f"To access a specific table, use the format:\n"
|
|
2579
|
+
f" {connection_string}::TABLE_NAME"
|
|
2580
|
+
)
|
|
2581
|
+
|
|
2582
|
+
raise ValueError(error_msg)
|
|
2583
|
+
|
|
2584
|
+
except Exception as e:
|
|
2585
|
+
if isinstance(e, ValueError):
|
|
2586
|
+
raise # Re-raise our custom ValueError
|
|
2587
|
+
|
|
2588
|
+
# Check for backend-specific errors and provide installation guidance
|
|
2589
|
+
error_str = str(e).lower()
|
|
2590
|
+
backend_install_map = {
|
|
2591
|
+
"duckdb": "pip install 'ibis-framework[duckdb]'",
|
|
2592
|
+
"postgresql": "pip install 'ibis-framework[postgres]'",
|
|
2593
|
+
"postgres": "pip install 'ibis-framework[postgres]'",
|
|
2594
|
+
"mysql": "pip install 'ibis-framework[mysql]'",
|
|
2595
|
+
"sqlite": "pip install 'ibis-framework[sqlite]'",
|
|
2596
|
+
"bigquery": "pip install 'ibis-framework[bigquery]'",
|
|
2597
|
+
"snowflake": "pip install 'ibis-framework[snowflake]'",
|
|
2598
|
+
}
|
|
2599
|
+
|
|
2600
|
+
# Check if this is a missing backend dependency
|
|
2601
|
+
for backend, install_cmd in backend_install_map.items():
|
|
2602
|
+
if backend in error_str and ("not found" in error_str or "no module" in error_str):
|
|
2603
|
+
raise ConnectionError(
|
|
2604
|
+
f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
|
|
2605
|
+
f" {install_cmd}\n\n"
|
|
2606
|
+
f"Original error: {e}\n\n"
|
|
2607
|
+
f"Supported connection string formats:\n"
|
|
2608
|
+
f"- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
|
|
2609
|
+
f"- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
|
|
2610
|
+
f"- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
|
|
2611
|
+
f"- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
|
|
2612
|
+
f"- BigQuery: 'bigquery://project/dataset::table_name'\n"
|
|
2613
|
+
f"- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
|
|
2614
|
+
f"\nNote: Use '::table_name' to specify the table within the database."
|
|
2615
|
+
) from e
|
|
2616
|
+
|
|
2617
|
+
# Generic connection error
|
|
2618
|
+
raise ConnectionError(
|
|
2619
|
+
f"Failed to connect to database using connection string: {connection_string}\n"
|
|
2620
|
+
f"Error: {e}\n\n"
|
|
2621
|
+
f"No table specified. Use the format: {connection_string}::TABLE_NAME"
|
|
2622
|
+
) from e
|
|
2623
|
+
|
|
2624
|
+
# Split connection string and table name
|
|
2625
|
+
try:
|
|
2626
|
+
base_connection, table_name = connection_string.rsplit("::", 1)
|
|
2627
|
+
except ValueError:
|
|
2628
|
+
raise ValueError(f"Invalid connection string format: {connection_string}")
|
|
2629
|
+
|
|
2630
|
+
# Connect to database and get table
|
|
2631
|
+
try:
|
|
2632
|
+
conn = ibis.connect(base_connection)
|
|
2633
|
+
table = conn.table(table_name)
|
|
2634
|
+
return table
|
|
2635
|
+
|
|
2636
|
+
except Exception as e:
|
|
2637
|
+
# Check for backend-specific errors and provide installation guidance
|
|
2638
|
+
error_str = str(e).lower()
|
|
2639
|
+
backend_install_map = {
|
|
2640
|
+
"duckdb": "pip install 'ibis-framework[duckdb]'",
|
|
2641
|
+
"postgresql": "pip install 'ibis-framework[postgres]'",
|
|
2642
|
+
"postgres": "pip install 'ibis-framework[postgres]'",
|
|
2643
|
+
"mysql": "pip install 'ibis-framework[mysql]'",
|
|
2644
|
+
"sqlite": "pip install 'ibis-framework[sqlite]'",
|
|
2645
|
+
"bigquery": "pip install 'ibis-framework[bigquery]'",
|
|
2646
|
+
"snowflake": "pip install 'ibis-framework[snowflake]'",
|
|
2647
|
+
}
|
|
2648
|
+
|
|
2649
|
+
# Check if this is a missing backend dependency
|
|
2650
|
+
for backend, install_cmd in backend_install_map.items():
|
|
2651
|
+
if backend in error_str and ("not found" in error_str or "no module" in error_str):
|
|
2652
|
+
raise ConnectionError(
|
|
2653
|
+
f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
|
|
2654
|
+
f" {install_cmd}\n\n"
|
|
2655
|
+
f"Original error: {e}"
|
|
2656
|
+
) from e
|
|
2657
|
+
|
|
2658
|
+
# Check if table doesn't exist
|
|
2659
|
+
if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
|
|
2660
|
+
# Try to get available tables for helpful message
|
|
2661
|
+
try:
|
|
2662
|
+
available_tables = conn.list_tables()
|
|
2663
|
+
if available_tables:
|
|
2664
|
+
table_list = "\n".join(f" - {table}" for table in available_tables)
|
|
2665
|
+
raise ValueError(
|
|
2666
|
+
f"Table '{table_name}' not found in database.\n\n"
|
|
2667
|
+
f"Available tables:\n{table_list}\n\n"
|
|
2668
|
+
f"Check the table name and try again with:\n"
|
|
2669
|
+
f" {base_connection}::CORRECT_TABLE_NAME"
|
|
2670
|
+
) from e
|
|
2671
|
+
else:
|
|
2672
|
+
raise ValueError(
|
|
2673
|
+
f"Table '{table_name}' not found and no tables available in database."
|
|
2674
|
+
) from e
|
|
2675
|
+
except Exception:
|
|
2676
|
+
raise ValueError(
|
|
2677
|
+
f"Table '{table_name}' not found in database. "
|
|
2678
|
+
f"Check the table name and connection string."
|
|
2679
|
+
) from e
|
|
2680
|
+
|
|
2681
|
+
# Generic connection error
|
|
2682
|
+
raise ConnectionError(
|
|
2683
|
+
f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
|
|
2684
|
+
) from e
|
|
2685
|
+
|
|
2686
|
+
|
|
1913
2687
|
@dataclass
|
|
1914
2688
|
class Validate:
|
|
1915
2689
|
"""
|
|
@@ -1942,8 +2716,14 @@ class Validate:
|
|
|
1942
2716
|
Parameters
|
|
1943
2717
|
----------
|
|
1944
2718
|
data
|
|
1945
|
-
The table to validate, which could be a DataFrame object
|
|
1946
|
-
|
|
2719
|
+
The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
|
|
2720
|
+
file path, a Parquet file path, or a database connection string. When providing a CSV or
|
|
2721
|
+
Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
|
|
2722
|
+
loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
|
|
2723
|
+
glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
|
|
2724
|
+
Connection strings enable direct database access via Ibis with optional table specification
|
|
2725
|
+
using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
|
|
2726
|
+
on the supported table types.
|
|
1947
2727
|
tbl_name
|
|
1948
2728
|
An optional name to assign to the input table object. If no value is provided, a name will
|
|
1949
2729
|
be generated based on whatever information is available. This table name will be displayed
|
|
@@ -2007,13 +2787,40 @@ class Validate:
|
|
|
2007
2787
|
- MySQL table (`"mysql"`)*
|
|
2008
2788
|
- PostgreSQL table (`"postgresql"`)*
|
|
2009
2789
|
- SQLite table (`"sqlite"`)*
|
|
2790
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
2791
|
+
- Snowflake table (`"snowflake"`)*
|
|
2792
|
+
- Databricks table (`"databricks"`)*
|
|
2793
|
+
- PySpark table (`"pyspark"`)*
|
|
2794
|
+
- BigQuery table (`"bigquery"`)*
|
|
2010
2795
|
- Parquet table (`"parquet"`)*
|
|
2796
|
+
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
|
|
2797
|
+
- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
|
|
2798
|
+
extension, or partitioned dataset)
|
|
2799
|
+
- Database connection strings (URI format with optional table specification)
|
|
2011
2800
|
|
|
2012
2801
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
2013
2802
|
`ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
|
|
2014
2803
|
the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas
|
|
2015
2804
|
DataFrame, the Ibis library is not required.
|
|
2016
2805
|
|
|
2806
|
+
To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
|
|
2807
|
+
provided. The file will be automatically detected and loaded using the best available DataFrame
|
|
2808
|
+
library. The loading preference is Polars first, then Pandas as a fallback.
|
|
2809
|
+
|
|
2810
|
+
Connection strings follow database URL formats and must also specify a table using the
|
|
2811
|
+
`::table_name` suffix. Examples include:
|
|
2812
|
+
|
|
2813
|
+
```
|
|
2814
|
+
"duckdb:///path/to/database.ddb::table_name"
|
|
2815
|
+
"sqlite:///path/to/database.db::table_name"
|
|
2816
|
+
"postgresql://user:password@localhost:5432/database::table_name"
|
|
2817
|
+
"mysql://user:password@localhost:3306/database::table_name"
|
|
2818
|
+
"bigquery://project/dataset::table_name"
|
|
2819
|
+
"snowflake://user:password@account/database/schema::table_name"
|
|
2820
|
+
```
|
|
2821
|
+
|
|
2822
|
+
When using connection strings, the Ibis library with the appropriate backend driver is required.
|
|
2823
|
+
|
|
2017
2824
|
Thresholds
|
|
2018
2825
|
----------
|
|
2019
2826
|
The `thresholds=` parameter is used to set the failure-condition levels for all validation
|
|
@@ -2170,8 +2977,8 @@ class Validate:
|
|
|
2170
2977
|
```{python}
|
|
2171
2978
|
import pointblank as pb
|
|
2172
2979
|
|
|
2173
|
-
# Load the small_table dataset
|
|
2174
|
-
small_table = pb.load_dataset()
|
|
2980
|
+
# Load the `small_table` dataset
|
|
2981
|
+
small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
|
|
2175
2982
|
|
|
2176
2983
|
# Preview the table
|
|
2177
2984
|
pb.preview(small_table)
|
|
@@ -2237,7 +3044,7 @@ class Validate:
|
|
|
2237
3044
|
brief). Here's an example of a global setting for briefs:
|
|
2238
3045
|
|
|
2239
3046
|
```{python}
|
|
2240
|
-
|
|
3047
|
+
validation_2 = (
|
|
2241
3048
|
pb.Validate(
|
|
2242
3049
|
data=pb.load_dataset(),
|
|
2243
3050
|
tbl_name="small_table",
|
|
@@ -2254,7 +3061,7 @@ class Validate:
|
|
|
2254
3061
|
.interrogate()
|
|
2255
3062
|
)
|
|
2256
3063
|
|
|
2257
|
-
|
|
3064
|
+
validation_2
|
|
2258
3065
|
```
|
|
2259
3066
|
|
|
2260
3067
|
We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
|
|
@@ -2272,7 +3079,7 @@ class Validate:
|
|
|
2272
3079
|
the data extracts for each validation step.
|
|
2273
3080
|
|
|
2274
3081
|
```{python}
|
|
2275
|
-
|
|
3082
|
+
validation_2.get_data_extracts()
|
|
2276
3083
|
```
|
|
2277
3084
|
|
|
2278
3085
|
We can also view step reports for each validation step using the
|
|
@@ -2280,7 +3087,7 @@ class Validate:
|
|
|
2280
3087
|
type of validation step and shows the relevant information for a step's validation.
|
|
2281
3088
|
|
|
2282
3089
|
```{python}
|
|
2283
|
-
|
|
3090
|
+
validation_2.get_step_report(i=2)
|
|
2284
3091
|
```
|
|
2285
3092
|
|
|
2286
3093
|
The `Validate` class also has a method for getting the sundered data, which is the data that
|
|
@@ -2288,11 +3095,141 @@ class Validate:
|
|
|
2288
3095
|
[`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.
|
|
2289
3096
|
|
|
2290
3097
|
```{python}
|
|
2291
|
-
pb.preview(
|
|
3098
|
+
pb.preview(validation_2.get_sundered_data())
|
|
2292
3099
|
```
|
|
2293
3100
|
|
|
2294
3101
|
The sundered data is a DataFrame that contains the rows that passed or failed the validation.
|
|
2295
3102
|
The default behavior is to return the rows that failed the validation, as shown above.
|
|
3103
|
+
|
|
3104
|
+
### Working with CSV Files
|
|
3105
|
+
|
|
3106
|
+
The `Validate` class can directly accept CSV file paths, making it easy to validate data stored
|
|
3107
|
+
in CSV files without manual loading:
|
|
3108
|
+
|
|
3109
|
+
```{python}
|
|
3110
|
+
# Get a path to a CSV file from the package data
|
|
3111
|
+
csv_path = pb.get_data_path("global_sales", "csv")
|
|
3112
|
+
|
|
3113
|
+
validation_3 = (
|
|
3114
|
+
pb.Validate(
|
|
3115
|
+
data=csv_path,
|
|
3116
|
+
label="CSV validation example"
|
|
3117
|
+
)
|
|
3118
|
+
.col_exists(["customer_id", "product_id", "revenue"])
|
|
3119
|
+
.col_vals_not_null(["customer_id", "product_id"])
|
|
3120
|
+
.col_vals_gt(columns="revenue", value=0)
|
|
3121
|
+
.interrogate()
|
|
3122
|
+
)
|
|
3123
|
+
|
|
3124
|
+
validation_3
|
|
3125
|
+
```
|
|
3126
|
+
|
|
3127
|
+
You can also use a Path object to specify the CSV file. Here's an example of how to do that:
|
|
3128
|
+
|
|
3129
|
+
```{python}
|
|
3130
|
+
from pathlib import Path
|
|
3131
|
+
|
|
3132
|
+
csv_file = Path(pb.get_data_path("game_revenue", "csv"))
|
|
3133
|
+
|
|
3134
|
+
validation_4 = (
|
|
3135
|
+
pb.Validate(data=csv_file, label="Game Revenue Validation")
|
|
3136
|
+
.col_exists(["player_id", "session_id", "item_name"])
|
|
3137
|
+
.col_vals_regex(
|
|
3138
|
+
columns="session_id",
|
|
3139
|
+
pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}"
|
|
3140
|
+
)
|
|
3141
|
+
.col_vals_gt(columns="item_revenue", value=0, na_pass=True)
|
|
3142
|
+
.interrogate()
|
|
3143
|
+
)
|
|
3144
|
+
|
|
3145
|
+
validation_4
|
|
3146
|
+
```
|
|
3147
|
+
|
|
3148
|
+
The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided,
|
|
3149
|
+
Pointblank will automatically load the file using the best available DataFrame library (Polars
|
|
3150
|
+
preferred, Pandas as fallback). The loaded data can then be used with all validation methods
|
|
3151
|
+
just like any other supported table type.
|
|
3152
|
+
|
|
3153
|
+
### Working with Parquet Files
|
|
3154
|
+
|
|
3155
|
+
The `Validate` class can directly accept Parquet files and datasets in various formats. The
|
|
3156
|
+
following examples illustrate how to validate Parquet files:
|
|
3157
|
+
|
|
3158
|
+
```{python}
|
|
3159
|
+
# Single Parquet file from package data
|
|
3160
|
+
parquet_path = pb.get_data_path("nycflights", "parquet")
|
|
3161
|
+
|
|
3162
|
+
validation_5 = (
|
|
3163
|
+
pb.Validate(
|
|
3164
|
+
data=parquet_path,
|
|
3165
|
+
tbl_name="NYC Flights Data"
|
|
3166
|
+
)
|
|
3167
|
+
.col_vals_not_null(["carrier", "origin", "dest"])
|
|
3168
|
+
.col_vals_gt(columns="distance", value=0)
|
|
3169
|
+
.interrogate()
|
|
3170
|
+
)
|
|
3171
|
+
|
|
3172
|
+
validation_5
|
|
3173
|
+
```
|
|
3174
|
+
|
|
3175
|
+
You can also use glob patterns and directories. Here are some examples for how to:
|
|
3176
|
+
|
|
3177
|
+
1. load multiple Parquet files
|
|
3178
|
+
2. load a Parquet-containing directory
|
|
3179
|
+
3. load a partitioned Parquet dataset
|
|
3180
|
+
|
|
3181
|
+
```python
|
|
3182
|
+
# Multiple Parquet files with glob patterns
|
|
3183
|
+
validation_6 = pb.Validate(data="data/sales_*.parquet")
|
|
3184
|
+
|
|
3185
|
+
# Directory containing Parquet files
|
|
3186
|
+
validation_7 = pb.Validate(data="parquet_data/")
|
|
3187
|
+
|
|
3188
|
+
# Partitioned Parquet dataset
|
|
3189
|
+
validation_8 = (
|
|
3190
|
+
pb.Validate(data="sales_data/") # Contains year=2023/quarter=Q1/region=US/sales.parquet
|
|
3191
|
+
.col_exists(["transaction_id", "amount", "year", "quarter", "region"])
|
|
3192
|
+
.interrogate()
|
|
3193
|
+
)
|
|
3194
|
+
```
|
|
3195
|
+
|
|
3196
|
+
When you point to a directory that contains a partitioned Parquet dataset (with subdirectories
|
|
3197
|
+
like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically:
|
|
3198
|
+
|
|
3199
|
+
- discover all Parquet files recursively
|
|
3200
|
+
- extract partition column values from directory paths
|
|
3201
|
+
- add partition columns to the final DataFrame
|
|
3202
|
+
- combine all partitions into a single table for validation
|
|
3203
|
+
|
|
3204
|
+
Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
|
|
3205
|
+
either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
|
|
3206
|
+
|
|
3207
|
+
### Working with Database Connection Strings
|
|
3208
|
+
|
|
3209
|
+
The `Validate` class supports database connection strings for direct validation of database
|
|
3210
|
+
tables. Connection strings must specify a table using the `::table_name` suffix:
|
|
3211
|
+
|
|
3212
|
+
```{python}
|
|
3213
|
+
# Get path to a DuckDB database file from package data
|
|
3214
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
3215
|
+
|
|
3216
|
+
validation_9 = (
|
|
3217
|
+
pb.Validate(
|
|
3218
|
+
data=f"duckdb:///{duckdb_path}::game_revenue",
|
|
3219
|
+
label="DuckDB Game Revenue Validation"
|
|
3220
|
+
)
|
|
3221
|
+
.col_exists(["player_id", "session_id", "item_revenue"])
|
|
3222
|
+
.col_vals_gt(columns="item_revenue", value=0)
|
|
3223
|
+
.interrogate()
|
|
3224
|
+
)
|
|
3225
|
+
|
|
3226
|
+
validation_9
|
|
3227
|
+
```
|
|
3228
|
+
|
|
3229
|
+
For comprehensive documentation on supported connection string formats, error handling, and
|
|
3230
|
+
installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
|
|
3231
|
+
function. This function handles all the connection logic and provides helpful error messages
|
|
3232
|
+
when table specifications are missing or backend dependencies are not installed.
|
|
2296
3233
|
"""
|
|
2297
3234
|
|
|
2298
3235
|
data: FrameT | Any
|
|
@@ -2306,6 +3243,15 @@ class Validate:
|
|
|
2306
3243
|
locale: str | None = None
|
|
2307
3244
|
|
|
2308
3245
|
def __post_init__(self):
|
|
3246
|
+
# Handle connection string input for the data parameter
|
|
3247
|
+
self.data = _process_connection_string(self.data)
|
|
3248
|
+
|
|
3249
|
+
# Handle CSV file input for the data parameter
|
|
3250
|
+
self.data = _process_csv_input(self.data)
|
|
3251
|
+
|
|
3252
|
+
# Handle Parquet file input for the data parameter
|
|
3253
|
+
self.data = _process_parquet_input(self.data)
|
|
3254
|
+
|
|
2309
3255
|
# Check input of the `thresholds=` argument
|
|
2310
3256
|
_check_thresholds(thresholds=self.thresholds)
|
|
2311
3257
|
|
|
@@ -2481,12 +3427,16 @@ class Validate:
|
|
|
2481
3427
|
(i.e., no validation steps will be created for them).
|
|
2482
3428
|
|
|
2483
3429
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2484
|
-
for more complex segmentation scenarios. The following inputs are
|
|
3430
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
2485
3431
|
|
|
2486
|
-
|
|
2487
|
-
|
|
2488
|
-
|
|
2489
|
-
|
|
3432
|
+
```
|
|
3433
|
+
# Segments from all unique values in the `region` column
|
|
3434
|
+
# and specific dates in the `date` column
|
|
3435
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
3436
|
+
|
|
3437
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
3438
|
+
segments=["region", "date"]
|
|
3439
|
+
```
|
|
2490
3440
|
|
|
2491
3441
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2492
3442
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -2769,12 +3719,16 @@ class Validate:
|
|
|
2769
3719
|
(i.e., no validation steps will be created for them).
|
|
2770
3720
|
|
|
2771
3721
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2772
|
-
for more complex segmentation scenarios. The following inputs are
|
|
3722
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
2773
3723
|
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
|
|
2777
|
-
|
|
3724
|
+
```
|
|
3725
|
+
# Segments from all unique values in the `region` column
|
|
3726
|
+
# and specific dates in the `date` column
|
|
3727
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
3728
|
+
|
|
3729
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
3730
|
+
segments=["region", "date"]
|
|
3731
|
+
```
|
|
2778
3732
|
|
|
2779
3733
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2780
3734
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3056,12 +4010,16 @@ class Validate:
|
|
|
3056
4010
|
(i.e., no validation steps will be created for them).
|
|
3057
4011
|
|
|
3058
4012
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3059
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4013
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3060
4014
|
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
4015
|
+
```
|
|
4016
|
+
# Segments from all unique values in the `region` column
|
|
4017
|
+
# and specific dates in the `date` column
|
|
4018
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4019
|
+
|
|
4020
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4021
|
+
segments=["region", "date"]
|
|
4022
|
+
```
|
|
3065
4023
|
|
|
3066
4024
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3067
4025
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3342,12 +4300,16 @@ class Validate:
|
|
|
3342
4300
|
(i.e., no validation steps will be created for them).
|
|
3343
4301
|
|
|
3344
4302
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3345
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4303
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3346
4304
|
|
|
3347
|
-
|
|
3348
|
-
|
|
3349
|
-
|
|
3350
|
-
|
|
4305
|
+
```
|
|
4306
|
+
# Segments from all unique values in the `region` column
|
|
4307
|
+
# and specific dates in the `date` column
|
|
4308
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4309
|
+
|
|
4310
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4311
|
+
segments=["region", "date"]
|
|
4312
|
+
```
|
|
3351
4313
|
|
|
3352
4314
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3353
4315
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3626,12 +4588,16 @@ class Validate:
|
|
|
3626
4588
|
(i.e., no validation steps will be created for them).
|
|
3627
4589
|
|
|
3628
4590
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3629
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4591
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3630
4592
|
|
|
3631
|
-
|
|
3632
|
-
|
|
3633
|
-
|
|
3634
|
-
|
|
4593
|
+
```
|
|
4594
|
+
# Segments from all unique values in the `region` column
|
|
4595
|
+
# and specific dates in the `date` column
|
|
4596
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4597
|
+
|
|
4598
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4599
|
+
segments=["region", "date"]
|
|
4600
|
+
```
|
|
3635
4601
|
|
|
3636
4602
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3637
4603
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3914,12 +4880,16 @@ class Validate:
|
|
|
3914
4880
|
(i.e., no validation steps will be created for them).
|
|
3915
4881
|
|
|
3916
4882
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3917
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4883
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3918
4884
|
|
|
3919
|
-
|
|
3920
|
-
|
|
3921
|
-
|
|
3922
|
-
|
|
4885
|
+
```
|
|
4886
|
+
# Segments from all unique values in the `region` column
|
|
4887
|
+
# and specific dates in the `date` column
|
|
4888
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4889
|
+
|
|
4890
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4891
|
+
segments=["region", "date"]
|
|
4892
|
+
```
|
|
3923
4893
|
|
|
3924
4894
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3925
4895
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -4216,12 +5186,16 @@ class Validate:
|
|
|
4216
5186
|
(i.e., no validation steps will be created for them).
|
|
4217
5187
|
|
|
4218
5188
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4219
|
-
for more complex segmentation scenarios. The following inputs are
|
|
5189
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
4220
5190
|
|
|
4221
|
-
|
|
4222
|
-
|
|
4223
|
-
|
|
4224
|
-
|
|
5191
|
+
```
|
|
5192
|
+
# Segments from all unique values in the `region` column
|
|
5193
|
+
# and specific dates in the `date` column
|
|
5194
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
5195
|
+
|
|
5196
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
5197
|
+
segments=["region", "date"]
|
|
5198
|
+
```
|
|
4225
5199
|
|
|
4226
5200
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4227
5201
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -4532,12 +5506,16 @@ class Validate:
|
|
|
4532
5506
|
(i.e., no validation steps will be created for them).
|
|
4533
5507
|
|
|
4534
5508
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4535
|
-
for more complex segmentation scenarios. The following inputs are
|
|
5509
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
4536
5510
|
|
|
4537
|
-
|
|
4538
|
-
|
|
4539
|
-
|
|
4540
|
-
|
|
5511
|
+
```
|
|
5512
|
+
# Segments from all unique values in the `region` column
|
|
5513
|
+
# and specific dates in the `date` column
|
|
5514
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
5515
|
+
|
|
5516
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
5517
|
+
segments=["region", "date"]
|
|
5518
|
+
```
|
|
4541
5519
|
|
|
4542
5520
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4543
5521
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -4804,12 +5782,16 @@ class Validate:
|
|
|
4804
5782
|
(i.e., no validation steps will be created for them).
|
|
4805
5783
|
|
|
4806
5784
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4807
|
-
for more complex segmentation scenarios. The following inputs are
|
|
5785
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
4808
5786
|
|
|
4809
|
-
|
|
4810
|
-
|
|
4811
|
-
|
|
4812
|
-
|
|
5787
|
+
```
|
|
5788
|
+
# Segments from all unique values in the `region` column
|
|
5789
|
+
# and specific dates in the `date` column
|
|
5790
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
5791
|
+
|
|
5792
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
5793
|
+
segments=["region", "date"]
|
|
5794
|
+
```
|
|
4813
5795
|
|
|
4814
5796
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4815
5797
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -5057,12 +6039,16 @@ class Validate:
|
|
|
5057
6039
|
(i.e., no validation steps will be created for them).
|
|
5058
6040
|
|
|
5059
6041
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5060
|
-
for more complex segmentation scenarios. The following inputs are
|
|
6042
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
5061
6043
|
|
|
5062
|
-
|
|
5063
|
-
|
|
5064
|
-
|
|
5065
|
-
|
|
6044
|
+
```
|
|
6045
|
+
# Segments from all unique values in the `region` column
|
|
6046
|
+
# and specific dates in the `date` column
|
|
6047
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
6048
|
+
|
|
6049
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
6050
|
+
segments=["region", "date"]
|
|
6051
|
+
```
|
|
5066
6052
|
|
|
5067
6053
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5068
6054
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -5218,9 +6204,9 @@ class Validate:
|
|
|
5218
6204
|
active: bool = True,
|
|
5219
6205
|
) -> Validate:
|
|
5220
6206
|
"""
|
|
5221
|
-
Validate whether values in a column are
|
|
6207
|
+
Validate whether values in a column are Null.
|
|
5222
6208
|
|
|
5223
|
-
The `col_vals_null()` validation method checks whether column values in a table are
|
|
6209
|
+
The `col_vals_null()` validation method checks whether column values in a table are Null.
|
|
5224
6210
|
This validation will operate over the number of test units that is equal to the number
|
|
5225
6211
|
of rows in the table.
|
|
5226
6212
|
|
|
@@ -5301,12 +6287,16 @@ class Validate:
|
|
|
5301
6287
|
(i.e., no validation steps will be created for them).
|
|
5302
6288
|
|
|
5303
6289
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5304
|
-
for more complex segmentation scenarios. The following inputs are
|
|
6290
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
5305
6291
|
|
|
5306
|
-
|
|
5307
|
-
|
|
5308
|
-
|
|
5309
|
-
|
|
6292
|
+
```
|
|
6293
|
+
# Segments from all unique values in the `region` column
|
|
6294
|
+
# and specific dates in the `date` column
|
|
6295
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
6296
|
+
|
|
6297
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
6298
|
+
segments=["region", "date"]
|
|
6299
|
+
```
|
|
5310
6300
|
|
|
5311
6301
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5312
6302
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -5457,10 +6447,10 @@ class Validate:
|
|
|
5457
6447
|
active: bool = True,
|
|
5458
6448
|
) -> Validate:
|
|
5459
6449
|
"""
|
|
5460
|
-
Validate whether values in a column are not
|
|
6450
|
+
Validate whether values in a column are not Null.
|
|
5461
6451
|
|
|
5462
6452
|
The `col_vals_not_null()` validation method checks whether column values in a table are not
|
|
5463
|
-
|
|
6453
|
+
Null. This validation will operate over the number of test units that is equal to the number
|
|
5464
6454
|
of rows in the table.
|
|
5465
6455
|
|
|
5466
6456
|
Parameters
|
|
@@ -5540,12 +6530,16 @@ class Validate:
|
|
|
5540
6530
|
(i.e., no validation steps will be created for them).
|
|
5541
6531
|
|
|
5542
6532
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5543
|
-
for more complex segmentation scenarios. The following inputs are
|
|
6533
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
5544
6534
|
|
|
5545
|
-
|
|
5546
|
-
|
|
5547
|
-
|
|
5548
|
-
|
|
6535
|
+
```
|
|
6536
|
+
# Segments from all unique values in the `region` column
|
|
6537
|
+
# and specific dates in the `date` column
|
|
6538
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
6539
|
+
|
|
6540
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
6541
|
+
segments=["region", "date"]
|
|
6542
|
+
```
|
|
5549
6543
|
|
|
5550
6544
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5551
6545
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -5787,12 +6781,16 @@ class Validate:
|
|
|
5787
6781
|
(i.e., no validation steps will be created for them).
|
|
5788
6782
|
|
|
5789
6783
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5790
|
-
for more complex segmentation scenarios. The following inputs are
|
|
6784
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
5791
6785
|
|
|
5792
|
-
|
|
5793
|
-
|
|
5794
|
-
|
|
5795
|
-
|
|
6786
|
+
```
|
|
6787
|
+
# Segments from all unique values in the `region` column
|
|
6788
|
+
# and specific dates in the `date` column
|
|
6789
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
6790
|
+
|
|
6791
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
6792
|
+
segments=["region", "date"]
|
|
6793
|
+
```
|
|
5796
6794
|
|
|
5797
6795
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5798
6796
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -6030,12 +7028,16 @@ class Validate:
|
|
|
6030
7028
|
(i.e., no validation steps will be created for them).
|
|
6031
7029
|
|
|
6032
7030
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6033
|
-
for more complex segmentation scenarios. The following inputs are
|
|
7031
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
6034
7032
|
|
|
6035
|
-
|
|
6036
|
-
|
|
6037
|
-
|
|
6038
|
-
|
|
7033
|
+
```
|
|
7034
|
+
# Segments from all unique values in the `region` column
|
|
7035
|
+
# and specific dates in the `date` column
|
|
7036
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
7037
|
+
|
|
7038
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
7039
|
+
segments=["region", "date"]
|
|
7040
|
+
```
|
|
6039
7041
|
|
|
6040
7042
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6041
7043
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -6421,12 +7423,16 @@ class Validate:
|
|
|
6421
7423
|
(i.e., no validation steps will be created for them).
|
|
6422
7424
|
|
|
6423
7425
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6424
|
-
for more complex segmentation scenarios. The following inputs are
|
|
7426
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
6425
7427
|
|
|
6426
|
-
|
|
6427
|
-
|
|
6428
|
-
|
|
6429
|
-
|
|
7428
|
+
```
|
|
7429
|
+
# Segments from all unique values in the `region` column
|
|
7430
|
+
# and specific dates in the `date` column
|
|
7431
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
7432
|
+
|
|
7433
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
7434
|
+
segments=["region", "date"]
|
|
7435
|
+
```
|
|
6430
7436
|
|
|
6431
7437
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6432
7438
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -6658,12 +7664,16 @@ class Validate:
|
|
|
6658
7664
|
(i.e., no validation steps will be created for them).
|
|
6659
7665
|
|
|
6660
7666
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6661
|
-
for more complex segmentation scenarios. The following inputs are
|
|
7667
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
6662
7668
|
|
|
6663
|
-
|
|
6664
|
-
|
|
6665
|
-
|
|
6666
|
-
|
|
7669
|
+
```
|
|
7670
|
+
# Segments from all unique values in the `region` column
|
|
7671
|
+
# and specific dates in the `date` column
|
|
7672
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
7673
|
+
|
|
7674
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
7675
|
+
segments=["region", "date"]
|
|
7676
|
+
```
|
|
6667
7677
|
|
|
6668
7678
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6669
7679
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -8216,37 +9226,47 @@ class Validate:
|
|
|
8216
9226
|
|
|
8217
9227
|
# Determine whether any preprocessing functions are to be applied to the table
|
|
8218
9228
|
if validation.pre is not None:
|
|
8219
|
-
|
|
8220
|
-
|
|
9229
|
+
try:
|
|
9230
|
+
# Read the text of the preprocessing function
|
|
9231
|
+
pre_text = _pre_processing_funcs_to_str(validation.pre)
|
|
8221
9232
|
|
|
8222
|
-
|
|
8223
|
-
|
|
9233
|
+
# Determine if the preprocessing function is a lambda function; return a boolean
|
|
9234
|
+
is_lambda = re.match(r"^lambda", pre_text) is not None
|
|
8224
9235
|
|
|
8225
|
-
|
|
8226
|
-
|
|
8227
|
-
|
|
8228
|
-
|
|
8229
|
-
|
|
8230
|
-
|
|
9236
|
+
# If the preprocessing function is a lambda function, then check if there is
|
|
9237
|
+
# a keyword argument called `dfn` in the lambda signature; if so, that's a cue
|
|
9238
|
+
# to use a Narwhalified version of the table
|
|
9239
|
+
if is_lambda:
|
|
9240
|
+
# Get the signature of the lambda function
|
|
9241
|
+
sig = inspect.signature(validation.pre)
|
|
8231
9242
|
|
|
8232
|
-
|
|
8233
|
-
|
|
8234
|
-
|
|
8235
|
-
|
|
9243
|
+
# Check if the lambda function has a keyword argument called `dfn`
|
|
9244
|
+
if "dfn" in sig.parameters:
|
|
9245
|
+
# Convert the table to a Narwhals DataFrame
|
|
9246
|
+
data_tbl_step = nw.from_native(data_tbl_step)
|
|
8236
9247
|
|
|
8237
|
-
|
|
8238
|
-
|
|
9248
|
+
# Apply the preprocessing function to the table
|
|
9249
|
+
data_tbl_step = validation.pre(dfn=data_tbl_step)
|
|
8239
9250
|
|
|
8240
|
-
|
|
8241
|
-
|
|
9251
|
+
# Convert the table back to its original format
|
|
9252
|
+
data_tbl_step = nw.to_native(data_tbl_step)
|
|
8242
9253
|
|
|
8243
|
-
|
|
8244
|
-
|
|
9254
|
+
else:
|
|
9255
|
+
# Apply the preprocessing function to the table
|
|
9256
|
+
data_tbl_step = validation.pre(data_tbl_step)
|
|
9257
|
+
|
|
9258
|
+
# If the preprocessing function is a regular (non-lambda) callable, apply it to the table
|
|
9259
|
+
elif isinstance(validation.pre, Callable):
|
|
8245
9260
|
data_tbl_step = validation.pre(data_tbl_step)
|
|
8246
9261
|
|
|
8247
|
-
|
|
8248
|
-
|
|
8249
|
-
|
|
9262
|
+
except Exception:
|
|
9263
|
+
# If preprocessing fails, mark the validation as having an eval_error
|
|
9264
|
+
validation.eval_error = True
|
|
9265
|
+
end_time = datetime.datetime.now(datetime.timezone.utc)
|
|
9266
|
+
validation.proc_duration_s = (end_time - start_time).total_seconds()
|
|
9267
|
+
validation.time_processed = end_time.isoformat(timespec="milliseconds")
|
|
9268
|
+
validation.active = False
|
|
9269
|
+
continue
|
|
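Given the dispatch logic above, a lambda passed to `pre=` whose parameter is named `dfn` receives a Narwhals DataFrame, and the result is converted back to the native format afterwards. A hedged usage sketch with illustrative data:

```python
import narwhals as nw
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"x": [1, 2, 3, 50]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(
        columns="x",
        value=0,
        # The `dfn` parameter name signals that the table should be handed
        # over as a Narwhals DataFrame (and converted back afterwards)
        pre=lambda dfn: dfn.filter(nw.col("x") < 10),
    )
    .interrogate()
)
```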
8250
9270
|
|
|
8251
9271
|
# ------------------------------------------------
|
|
8252
9272
|
# Segmentation stage
|
|
@@ -8259,12 +9279,28 @@ class Validate:
|
|
|
8259
9279
|
data_tbl=data_tbl_step, segments_expr=validation.segments
|
|
8260
9280
|
)
|
|
8261
9281
|
|
|
9282
|
+
# ------------------------------------------------
|
|
9283
|
+
# Determine table type and `collect()` if needed
|
|
9284
|
+
# ------------------------------------------------
|
|
9285
|
+
|
|
9286
|
+
if tbl_type not in IBIS_BACKENDS:
|
|
9287
|
+
tbl_type = "local"
|
|
9288
|
+
|
|
9289
|
+
# If the table is a lazy frame, we need to collect it
|
|
9290
|
+
if _is_lazy_frame(data_tbl_step):
|
|
9291
|
+
data_tbl_step = data_tbl_step.collect()
|
|
9292
|
+
|
|
9293
|
+
# ------------------------------------------------
|
|
9294
|
+
# Set the number of test units
|
|
9295
|
+
# ------------------------------------------------
|
|
9296
|
+
|
|
8262
9297
|
validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
|
|
8263
9298
|
tbl_type=tbl_type
|
|
8264
9299
|
)
|
|
8265
9300
|
|
|
8266
|
-
|
|
8267
|
-
|
|
9301
|
+
# ------------------------------------------------
|
|
9302
|
+
# Validation stage
|
|
9303
|
+
# ------------------------------------------------
|
|
8268
9304
|
|
|
8269
9305
|
if assertion_category == "COMPARE_ONE":
|
|
8270
9306
|
results_tbl = ColValsCompareOne(
|
|
@@ -8455,36 +9491,32 @@ class Validate:
|
|
|
8455
9491
|
|
|
8456
9492
|
else:
|
|
8457
9493
|
# If the result is not a list, then we assume it's a table in the conventional
|
|
8458
|
-
# form (where the column is `pb_is_good_` exists, with boolean values
|
|
8459
|
-
|
|
9494
|
+
# form (where the column `pb_is_good_` exists, with boolean values)
|
|
8460
9495
|
results_tbl = results_tbl_list
|
|
8461
9496
|
|
|
8462
9497
|
# If the results table is not `None`, then we assume there is a table with a column
|
|
8463
9498
|
# called `pb_is_good_` that contains boolean values; we can then use this table to
|
|
8464
9499
|
# determine the number of test units that passed and failed
|
|
8465
9500
|
if results_tbl is not None:
|
|
8466
|
-
#
|
|
8467
|
-
|
|
8468
|
-
|
|
8469
|
-
|
|
8470
|
-
|
|
8471
|
-
|
|
8472
|
-
|
|
8473
|
-
results_list = (
|
|
8474
|
-
results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
|
|
8475
|
-
)
|
|
8476
|
-
else:
|
|
8477
|
-
results_list = (
|
|
8478
|
-
results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
|
|
8479
|
-
)
|
|
9501
|
+
# Count the number of passing and failing test units
|
|
9502
|
+
validation.n_passed = _count_true_values_in_column(
|
|
9503
|
+
tbl=results_tbl, column="pb_is_good_"
|
|
9504
|
+
)
|
|
9505
|
+
validation.n_failed = _count_true_values_in_column(
|
|
9506
|
+
tbl=results_tbl, column="pb_is_good_", inverse=True
|
|
9507
|
+
)
|
|
8480
9508
|
|
|
8481
|
-
|
|
8482
|
-
|
|
9509
|
+
# Solely for the col_vals_in_set assertion type, any Null values in the
|
|
9510
|
+
# `pb_is_good_` column are counted as failing test units
|
|
9511
|
+
if assertion_type == "col_vals_in_set":
|
|
9512
|
+
null_count = _count_null_values_in_column(tbl=results_tbl, column="pb_is_good_")
|
|
9513
|
+
validation.n_failed += null_count
|
|
9514
|
+
|
|
9515
|
+
# For column-value validations, the number of test units is the number of rows
|
|
9516
|
+
validation.n = get_row_count(data=results_tbl)
|
|
8483
9517
|
|
|
8484
|
-
|
|
8485
|
-
validation.
|
|
8486
|
-
validation.n_passed = results_list.count(True)
|
|
8487
|
-
validation.n_failed = results_list.count(False)
|
|
9518
|
+
# Set the `all_passed` attribute based on whether there are any failing test units
|
|
9519
|
+
validation.all_passed = validation.n_failed == 0
|
|
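The pass/fail counting now delegates to `_count_true_values_in_column()` and `_count_null_values_in_column()` from `pointblank._utils`, whose bodies are not part of this diff. Below is a hypothetical Narwhals-based sketch of the counting idea only, not the library's actual implementation.

```python
import narwhals as nw


def count_true_values_sketch(tbl, column: str, inverse: bool = False) -> int:
    """Count True (or, with inverse=True, False) entries in a boolean column."""
    df = nw.from_native(tbl, eager_only=True)
    target = False if inverse else True
    # Null entries match neither True nor False, so they are excluded here;
    # the `col_vals_in_set` branch above adds nulls to the failure count.
    return int((df[column] == target).fill_null(False).sum())
```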
8488
9520
|
|
|
8489
9521
|
# Calculate fractions of passing and failing test units
|
|
8490
9522
|
# - `f_passed` is the fraction of test units that passed
|
|
@@ -8831,7 +9863,7 @@ class Validate:
|
|
|
8831
9863
|
raise AssertionError(msg)
|
|
8832
9864
|
|
|
8833
9865
|
def assert_below_threshold(
|
|
8834
|
-
self, level: str = "warning", i: int = None, message: str = None
|
|
9866
|
+
self, level: str = "warning", i: int | None = None, message: str | None = None
|
|
8835
9867
|
) -> None:
|
|
8836
9868
|
"""
|
|
8837
9869
|
Raise an `AssertionError` if validation steps exceed a specified threshold level.
|
|
@@ -8940,12 +9972,12 @@ class Validate:
|
|
|
8940
9972
|
|
|
8941
9973
|
See Also
|
|
8942
9974
|
--------
|
|
8943
|
-
- [`warning()`](`pointblank.Validate.warning`):
|
|
9975
|
+
- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
|
|
8944
9976
|
step
|
|
8945
|
-
- [`error()`](`pointblank.Validate.error`):
|
|
8946
|
-
- [`critical()`](`pointblank.Validate.critical`):
|
|
9977
|
+
- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
|
|
9978
|
+
- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
|
|
8947
9979
|
validation step
|
|
8948
|
-
- [`assert_passing()`](`pointblank.Validate.assert_passing`):
|
|
9980
|
+
- [`assert_passing()`](`pointblank.Validate.assert_passing`): assert all validations pass
|
|
8949
9981
|
completely
|
|
8950
9982
|
"""
|
|
8951
9983
|
# Check if validation has been interrogated
|
|
@@ -8991,6 +10023,145 @@ class Validate:
|
|
|
8991
10023
|
)
|
|
8992
10024
|
raise AssertionError(msg)
|
|
8993
10025
|
|
|
10026
|
+
def above_threshold(self, level: str = "warning", i: int | None = None) -> bool:
|
|
10027
|
+
"""
|
|
10028
|
+
Check if any validation steps exceed a specified threshold level.
|
|
10029
|
+
|
|
10030
|
+
The `above_threshold()` method checks whether validation steps exceed a given threshold
|
|
10031
|
+
level. This provides a non-exception-based alternative to
|
|
10032
|
+
[`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`) for conditional
|
|
10033
|
+
workflow control based on validation results.
|
|
10034
|
+
|
|
10035
|
+
This method is useful in scenarios where you want to check if any validation steps failed
|
|
10036
|
+
beyond a certain threshold without raising an exception, allowing for more flexible
|
|
10037
|
+
programmatic responses to validation issues.
|
|
10038
|
+
|
|
10039
|
+
Parameters
|
|
10040
|
+
----------
|
|
10041
|
+
level
|
|
10042
|
+
The threshold level to check against. Valid options are: `"warning"` (the least severe
|
|
10043
|
+
threshold level), `"error"` (the middle severity threshold level), and `"critical"` (the
|
|
10044
|
+
most severe threshold level). The default is `"warning"`.
|
|
10045
|
+
i
|
|
10046
|
+
Specific validation step number(s) to check. If a single integer, checks only that step.
|
|
10047
|
+
If a list of integers, checks all specified steps. If `None` (the default), checks all
|
|
10048
|
+
validation steps. Step numbers are 1-based (first step is `1`, not `0`).
|
|
10049
|
+
|
|
10050
|
+
Returns
|
|
10051
|
+
-------
|
|
10052
|
+
bool
|
|
10053
|
+
`True` if any of the specified validation steps exceed the given threshold level,
|
|
10054
|
+
`False` otherwise.
|
|
10055
|
+
|
|
10056
|
+
Raises
|
|
10057
|
+
------
|
|
10058
|
+
ValueError
|
|
10059
|
+
If an invalid threshold level is provided.
|
|
10060
|
+
|
|
10061
|
+
Examples
|
|
10062
|
+
--------
|
|
10063
|
+
```{python}
|
|
10064
|
+
#| echo: false
|
|
10065
|
+
#| output: false
|
|
10066
|
+
import pointblank as pb
|
|
10067
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
10068
|
+
```
|
|
10069
|
+
Below are some examples of how to use the `above_threshold()` method. First, we'll create a
|
|
10070
|
+
simple Polars DataFrame with a single column (`values`).
|
|
10071
|
+
|
|
10072
|
+
```{python}
|
|
10073
|
+
import polars as pl
|
|
10074
|
+
|
|
10075
|
+
tbl = pl.DataFrame({
|
|
10076
|
+
"values": [1, 2, 3, 4, 5, 0, -1]
|
|
10077
|
+
})
|
|
10078
|
+
```
|
|
10079
|
+
|
|
10080
|
+
Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
|
|
10081
|
+
`critical=0.3`). After interrogating, we display the validation report table:
|
|
10082
|
+
|
|
10083
|
+
```{python}
|
|
10084
|
+
import pointblank as pb
|
|
10085
|
+
|
|
10086
|
+
validation = (
|
|
10087
|
+
pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
|
|
10088
|
+
.col_vals_gt(columns="values", value=0)
|
|
10089
|
+
.col_vals_lt(columns="values", value=10)
|
|
10090
|
+
.col_vals_between(columns="values", left=0, right=5)
|
|
10091
|
+
.interrogate()
|
|
10092
|
+
)
|
|
10093
|
+
|
|
10094
|
+
validation
|
|
10095
|
+
```
|
|
10096
|
+
|
|
10097
|
+
Let's check if any steps exceed the 'warning' threshold with the `above_threshold()` method.
|
|
10098
|
+
A message will be printed if that's the case:
|
|
10099
|
+
|
|
10100
|
+
```{python}
|
|
10101
|
+
if validation.above_threshold(level="warning"):
|
|
10102
|
+
print("Some steps have exceeded the warning threshold")
|
|
10103
|
+
```
|
|
10104
|
+
|
|
10105
|
+
Check if only steps 2 and 3 exceed the 'error' threshold through use of the `i=` argument:
|
|
10106
|
+
|
|
10107
|
+
```{python}
|
|
10108
|
+
if validation.above_threshold(level="error", i=[2, 3]):
|
|
10109
|
+
print("Steps 2 and/or 3 have exceeded the error threshold")
|
|
10110
|
+
```
|
|
10111
|
+
|
|
10112
|
+
You can use this in a workflow to conditionally trigger processes. Here's a snippet of how
|
|
10113
|
+
you might use this in a function:
|
|
10114
|
+
|
|
10115
|
+
```python
|
|
10116
|
+
def process_data(validation_obj):
|
|
10117
|
+
# Only continue processing if validation passes critical thresholds
|
|
10118
|
+
if not validation_obj.above_threshold(level="critical"):
|
|
10119
|
+
# Continue with processing
|
|
10120
|
+
print("Data meets critical quality thresholds, proceeding...")
|
|
10121
|
+
return True
|
|
10122
|
+
else:
|
|
10123
|
+
# Log failure and stop processing
|
|
10124
|
+
print("Data fails critical quality checks, aborting...")
|
|
10125
|
+
return False
|
|
10126
|
+
```
|
|
10127
|
+
|
|
10128
|
+
Note that this is just a suggestion for how to implement conditional workflow processes. You
|
|
10129
|
+
should adapt this pattern to your specific requirements, which might include different
|
|
10130
|
+
threshold levels, custom logging mechanisms, or integration with your organization's data
|
|
10131
|
+
pipelines and notification systems.
|
|
10132
|
+
|
|
10133
|
+
See Also
|
|
10134
|
+
--------
|
|
10135
|
+
- [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`): a similar
|
|
10136
|
+
method that raises an exception if thresholds are exceeded
|
|
10137
|
+
- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
|
|
10138
|
+
step
|
|
10139
|
+
- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
|
|
10140
|
+
- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
|
|
10141
|
+
validation step
|
|
10142
|
+
"""
|
|
10143
|
+
# Ensure validation has been run
|
|
10144
|
+
if not hasattr(self, "time_start") or self.time_start is None:
|
|
10145
|
+
return False
|
|
10146
|
+
|
|
10147
|
+
# Validate the level parameter
|
|
10148
|
+
level = level.lower()
|
|
10149
|
+
if level not in ["warning", "error", "critical"]:
|
|
10150
|
+
raise ValueError(
|
|
10151
|
+
f"Invalid threshold level: {level}. Must be one of 'warning', 'error', or 'critical'."
|
|
10152
|
+
)
|
|
10153
|
+
|
|
10154
|
+
# Get the threshold status using the appropriate method
|
|
10155
|
+
if level == "warning":
|
|
10156
|
+
status = self.warning(i=i)
|
|
10157
|
+
elif level == "error":
|
|
10158
|
+
status = self.error(i=i)
|
|
10159
|
+
elif level == "critical":
|
|
10160
|
+
status = self.critical(i=i)
|
|
10161
|
+
|
|
10162
|
+
# Return True if any steps exceeded the threshold
|
|
10163
|
+
return any(status.values())
|
|
10164
|
+
|
|
8994
10165
|
def n(self, i: int | list[int] | None = None, scalar: bool = False) -> dict[int, int] | int:
|
|
8995
10166
|
"""
|
|
8996
10167
|
Provides a dictionary of the number of test units for each validation step.
|
|
@@ -9654,7 +10825,7 @@ class Validate:
|
|
|
9654
10825
|
Get the 'critical' level status for each validation step.
|
|
9655
10826
|
|
|
9656
10827
|
The 'critical' status for a validation step is `True` if the fraction of failing test units
|
|
9657
|
-
meets or exceeds the threshold for the
|
|
10828
|
+
meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`.
|
|
9658
10829
|
|
|
9659
10830
|
The ascribed name of 'critical' is semantic and is thus simply a status indicator that could
|
|
9660
10831
|
be used to trigger some action to be taken. Here's how it fits in with other status
|
|
@@ -9666,14 +10837,14 @@ class Validate:
|
|
|
9666
10837
|
severity
|
|
9667
10838
|
- 'critical': the status obtained by calling `critical()`, most severe
|
|
9668
10839
|
|
|
9669
|
-
This method provides a dictionary of the
|
|
9670
|
-
|
|
9671
|
-
|
|
10840
|
+
This method provides a dictionary of the 'critical' status for each validation step. If the
|
|
10841
|
+
`scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar
|
|
10842
|
+
instead of a dictionary.
|
|
9672
10843
|
|
|
9673
10844
|
Parameters
|
|
9674
10845
|
----------
|
|
9675
10846
|
i
|
|
9676
|
-
The validation step number(s) from which the
|
|
10847
|
+
The validation step number(s) from which the 'critical' status is obtained. Can be
|
|
9677
10848
|
provided as a list of integers or a single integer. If `None`, all steps are included.
|
|
9678
10849
|
scalar
|
|
9679
10850
|
If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary.
|
|
@@ -9681,7 +10852,7 @@ class Validate:
|
|
|
9681
10852
|
Returns
|
|
9682
10853
|
-------
|
|
9683
10854
|
dict[int, bool] | bool
|
|
9684
|
-
A dictionary of the
|
|
10855
|
+
A dictionary of the 'critical' status for each validation step or a scalar value.
|
|
9685
10856
|
|
|
9686
10857
|
Examples
|
|
9687
10858
|
--------
|
|
@@ -9760,11 +10931,13 @@ class Validate:
|
|
|
9760
10931
|
Get the rows that failed for each validation step.
|
|
9761
10932
|
|
|
9762
10933
|
After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the
|
|
9763
|
-
`get_data_extracts()` method can be used to extract the rows that failed in each
|
|
9764
|
-
validation step (e.g.,
|
|
9765
|
-
|
|
9766
|
-
|
|
9767
|
-
|
|
10934
|
+
`get_data_extracts()` method can be used to extract the rows that failed in each
|
|
10935
|
+
column-value or row-based validation step (e.g.,
|
|
10936
|
+
[`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
|
|
10937
|
+
[`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a
|
|
10938
|
+
dictionary of tables containing the rows that failed in every validation step. If
|
|
10939
|
+
`frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing
|
|
10940
|
+
the dictionary structure).
|
|
9768
10941
|
|
|
9769
10942
|
Parameters
|
|
9770
10943
|
----------
|
|
@@ -9777,13 +10950,13 @@ class Validate:
|
|
|
9777
10950
|
Returns
|
|
9778
10951
|
-------
|
|
9779
10952
|
dict[int, FrameT | None] | FrameT | None
|
|
9780
|
-
A dictionary of tables containing the rows that failed in every
|
|
9781
|
-
step
|
|
10953
|
+
A dictionary of tables containing the rows that failed in every compatible validation
|
|
10954
|
+
step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
|
|
9782
10955
|
|
|
9783
|
-
Validation Methods
|
|
9784
|
-
|
|
9785
|
-
The following validation methods
|
|
9786
|
-
failing test units.
|
|
10956
|
+
Compatible Validation Methods for Yielding Extracted Rows
|
|
10957
|
+
---------------------------------------------------------
|
|
10958
|
+
The following validation methods operate on column values and will have rows extracted when
|
|
10959
|
+
there are failing test units.
|
|
9787
10960
|
|
|
9788
10961
|
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
9789
10962
|
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
@@ -9798,11 +10971,20 @@ class Validate:
|
|
|
9798
10971
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
9799
10972
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
9800
10973
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
10974
|
+
- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
|
|
10975
|
+
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
10976
|
+
|
|
10977
|
+
An extracted row for these validation methods means that a test unit failed for that row in
|
|
10978
|
+
the validation step.
|
|
10979
|
+
|
|
10980
|
+
These row-based validation methods will also have rows extracted should there be failing
|
|
10981
|
+
rows:
|
|
10982
|
+
|
|
9801
10983
|
- [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
|
|
10984
|
+
- [`rows_complete()`](`pointblank.Validate.rows_complete`)
|
|
9802
10985
|
|
|
9803
|
-
|
|
9804
|
-
|
|
9805
|
-
understanding the nature of the failing test units.
|
|
10986
|
+
The extracted rows are a subset of the original table and are useful for further analysis
|
|
10987
|
+
or for understanding the nature of the failing test units.
|
|
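A brief usage sketch of pulling the failing rows out of an interrogated plan, using the `i=` and `frame=` arguments described above:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [5, 7, 1, 9]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(columns="a", value=4)  # step 1: one failing row (a == 1)
    .interrogate()
)

extracts = validation.get_data_extracts()                       # dict keyed by step number
failed_step_1 = validation.get_data_extracts(i=1, frame=True)   # DataFrame directly
```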
9806
10988
|
|
|
9807
10989
|
Examples
|
|
9808
10990
|
--------
|
|
@@ -10058,10 +11240,10 @@ class Validate:
|
|
|
10058
11240
|
Get the data that passed or failed the validation steps.
|
|
10059
11241
|
|
|
10060
11242
|
Validation of the data is one thing but, sometimes, you want to use the best part of the
|
|
10061
|
-
input dataset for something else. The `get_sundered_data()` method works with a Validate
|
|
11243
|
+
input dataset for something else. The `get_sundered_data()` method works with a `Validate`
|
|
10062
11244
|
object that has been interrogated (i.e., the
|
|
10063
11245
|
[`interrogate()`](`pointblank.Validate.interrogate`) method was used). We can get either the
|
|
10064
|
-
'pass' data piece (rows with no failing test units across all
|
|
11246
|
+
'pass' data piece (rows with no failing test units across all column-value based validation
|
|
10065
11247
|
functions), or, the 'fail' data piece (rows with at least one failing test unit across the
|
|
10066
11248
|
same series of validations).
|
|
10067
11249
|
|
|
@@ -10070,7 +11252,7 @@ class Validate:
|
|
|
10070
11252
|
There are some caveats to sundering. The validation steps considered for this splitting will
|
|
10071
11253
|
only involve steps where:
|
|
10072
11254
|
|
|
10073
|
-
- of certain check types, where test units are cells checked
|
|
11255
|
+
- of certain check types, where test units are cells checked down a column (e.g., the
|
|
10074
11256
|
`col_vals_*()` methods)
|
|
10075
11257
|
- `active=` is not set to `False`
|
|
10076
11258
|
- `pre=` has not been given an expression for modifying the input table
|
|
@@ -10301,6 +11483,19 @@ class Validate:
|
|
|
10301
11483
|
# Get information on the input data table
|
|
10302
11484
|
tbl_info = _get_tbl_type(data=self.data)
|
|
10303
11485
|
|
|
11486
|
+
# If the table is a Polars one, determine if it's a LazyFrame
|
|
11487
|
+
if tbl_info == "polars":
|
|
11488
|
+
if _is_lazy_frame(self.data):
|
|
11489
|
+
tbl_info = "polars-lazy"
|
|
11490
|
+
|
|
11491
|
+
# Determine if the input table is a Narwhals DF
|
|
11492
|
+
if _is_narwhals_table(self.data):
|
|
11493
|
+
# Determine if the Narwhals table is a LazyFrame
|
|
11494
|
+
if _is_lazy_frame(self.data):
|
|
11495
|
+
tbl_info = "narwhals-lazy"
|
|
11496
|
+
else:
|
|
11497
|
+
tbl_info = "narwhals"
|
|
11498
|
+
|
|
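`_is_lazy_frame()` and `_is_narwhals_table()` come from `pointblank._utils` and their bodies are not part of this diff. A hypothetical sketch of what such predicates could look like (illustrative only) is:

```python
import narwhals as nw


def is_narwhals_table_sketch(data) -> bool:
    # A table already wrapped by Narwhals, eager or lazy
    return isinstance(data, (nw.DataFrame, nw.LazyFrame))


def is_lazy_frame_sketch(data) -> bool:
    # A Narwhals-wrapped LazyFrame counts as lazy...
    if isinstance(data, nw.LazyFrame):
        return True
    # ...as does a native Polars LazyFrame, when Polars is installed
    try:
        import polars as pl
    except ImportError:
        return False
    return isinstance(data, pl.LazyFrame)
```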
10304
11499
|
# Get the thresholds object
|
|
10305
11500
|
thresholds = self.thresholds
|
|
10306
11501
|
|
|
@@ -10353,7 +11548,9 @@ class Validate:
|
|
|
10353
11548
|
# Create the label, table type, and thresholds HTML fragments
|
|
10354
11549
|
label_html = _create_label_html(label=self.label, start_time="")
|
|
10355
11550
|
table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
|
|
10356
|
-
thresholds_html = _create_thresholds_html(
|
|
11551
|
+
thresholds_html = _create_thresholds_html(
|
|
11552
|
+
thresholds=thresholds, locale=locale, df_lib=df_lib
|
|
11553
|
+
)
|
|
10357
11554
|
|
|
10358
11555
|
# Compose the subtitle HTML fragment
|
|
10359
11556
|
combined_subtitle = (
|
|
@@ -10666,6 +11863,7 @@ class Validate:
|
|
|
10666
11863
|
interrogation_performed=interrogation_performed,
|
|
10667
11864
|
active=active,
|
|
10668
11865
|
locale=locale,
|
|
11866
|
+
df_lib=df_lib,
|
|
10669
11867
|
)
|
|
10670
11868
|
|
|
10671
11869
|
# ------------------------------------------------
|
|
@@ -10682,6 +11880,7 @@ class Validate:
|
|
|
10682
11880
|
interrogation_performed=interrogation_performed,
|
|
10683
11881
|
active=active,
|
|
10684
11882
|
locale=locale,
|
|
11883
|
+
df_lib=df_lib,
|
|
10685
11884
|
)
|
|
10686
11885
|
|
|
10687
11886
|
validation_info_dict["fail"] = _transform_passed_failed(
|
|
@@ -10690,6 +11889,7 @@ class Validate:
|
|
|
10690
11889
|
interrogation_performed=interrogation_performed,
|
|
10691
11890
|
active=active,
|
|
10692
11891
|
locale=locale,
|
|
11892
|
+
df_lib=df_lib,
|
|
10693
11893
|
)
|
|
10694
11894
|
|
|
10695
11895
|
# ------------------------------------------------
|
|
@@ -10869,7 +12069,9 @@ class Validate:
|
|
|
10869
12069
|
# Create the label, table type, and thresholds HTML fragments
|
|
10870
12070
|
label_html = _create_label_html(label=self.label, start_time=self.time_start)
|
|
10871
12071
|
table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
|
|
10872
|
-
thresholds_html = _create_thresholds_html(
|
|
12072
|
+
thresholds_html = _create_thresholds_html(
|
|
12073
|
+
thresholds=thresholds, locale=locale, df_lib=df_lib
|
|
12074
|
+
)
|
|
10873
12075
|
|
|
10874
12076
|
# Compose the subtitle HTML fragment
|
|
10875
12077
|
combined_subtitle = (
|
|
@@ -11127,24 +12329,25 @@ class Validate:
|
|
|
11127
12329
|
Types of Step Reports
|
|
11128
12330
|
---------------------
|
|
11129
12331
|
The `get_step_report()` method produces a report based on the *type* of validation step.
|
|
11130
|
-
The following row-based validation methods will produce a
|
|
11131
|
-
|
|
12332
|
+
The following column-value or row-based validation methods will produce a
|
|
12333
|
+
report that shows the rows of the data that failed:
|
|
11132
12334
|
|
|
11133
12335
|
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
12336
|
+
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
11134
12337
|
- [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
|
|
12338
|
+
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
11135
12339
|
- [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
|
|
11136
12340
|
- [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
|
|
11137
|
-
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
11138
|
-
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
11139
12341
|
- [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
|
|
11140
12342
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
11141
12343
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
11142
12344
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
11143
|
-
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
11144
12345
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
11145
12346
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
11146
|
-
- [`
|
|
12347
|
+
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
12348
|
+
- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
|
|
11147
12349
|
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
12350
|
+
- [`rows_complete()`](`pointblank.Validate.rows_complete`)
|
|
11148
12351
|
|
|
11149
12352
|
The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
|
|
11150
12353
|
report that shows duplicate rows (or duplicate values in one or a set of columns as defined
|
|
@@ -12671,20 +13874,78 @@ def _transform_eval(
|
|
|
12671
13874
|
return symbol_list
|
|
12672
13875
|
|
|
12673
13876
|
|
|
13877
|
+
def _format_numbers_with_gt(
|
|
13878
|
+
values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
|
|
13879
|
+
) -> list[str]:
|
|
13880
|
+
"""Format numbers using Great Tables GT object to avoid pandas dependency."""
|
|
13881
|
+
import polars as pl
|
|
13882
|
+
|
|
13883
|
+
# Create a single-column DataFrame with all values
|
|
13884
|
+
df = pl.DataFrame({"values": values})
|
|
13885
|
+
|
|
13886
|
+
# Create GT object and format the column
|
|
13887
|
+
gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
|
|
13888
|
+
|
|
13889
|
+
# Extract the formatted values using _get_column_of_values
|
|
13890
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
|
|
13891
|
+
|
|
13892
|
+
return formatted_values
|
|
13893
|
+
|
|
13894
|
+
|
|
13895
|
+
def _format_single_number_with_gt(
|
|
13896
|
+
value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
|
|
13897
|
+
) -> str:
|
|
13898
|
+
"""Format a single number using Great Tables GT object to avoid pandas dependency."""
|
|
13899
|
+
if df_lib is None:
|
|
13900
|
+
# Use library detection to select appropriate DataFrame library
|
|
13901
|
+
if _is_lib_present("polars"):
|
|
13902
|
+
import polars as pl
|
|
13903
|
+
|
|
13904
|
+
df_lib = pl
|
|
13905
|
+
elif _is_lib_present("pandas"):
|
|
13906
|
+
import pandas as pd
|
|
13907
|
+
|
|
13908
|
+
df_lib = pd
|
|
13909
|
+
else:
|
|
13910
|
+
raise ImportError("Neither Polars nor Pandas is available for formatting")
|
|
13911
|
+
|
|
13912
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
13913
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
13914
|
+
|
|
13915
|
+
# Create GT object and format the column
|
|
13916
|
+
gt_obj = GT(df).fmt_number(columns="value", n_sigfig=n_sigfig, compact=compact, locale=locale)
|
|
13917
|
+
|
|
13918
|
+
# Extract the formatted value using _get_column_of_values
|
|
13919
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
13920
|
+
|
|
13921
|
+
return formatted_values[0] # Return the single formatted value
|
|
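A small, hedged illustration of how these private helpers are meant to be called, assuming it runs inside the `pointblank/validate.py` module scope where they are defined; the exact rendered strings depend on Great Tables' compact and locale formatting, so no specific output is assumed.

```python
# Values at or above 10,000 get compact, 3-significant-figure labels
labels = _format_numbers_with_gt([9_500, 1_250_000], n_sigfig=3, compact=True)

# Single-value variant; leaving `df_lib=None` lets the helper pick Polars or Pandas
one_label = _format_single_number_with_gt(1_250_000, locale="de")
```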
13922
|
+
|
|
13923
|
+
|
|
12674
13924
|
def _transform_test_units(
|
|
12675
|
-
test_units: list[int],
|
|
13925
|
+
test_units: list[int],
|
|
13926
|
+
interrogation_performed: bool,
|
|
13927
|
+
active: list[bool],
|
|
13928
|
+
locale: str,
|
|
13929
|
+
df_lib=None,
|
|
12676
13930
|
) -> list[str]:
|
|
12677
13931
|
# If no interrogation was performed, return a list of empty strings
|
|
12678
13932
|
if not interrogation_performed:
|
|
12679
13933
|
return ["" for _ in range(len(test_units))]
|
|
12680
13934
|
|
|
13935
|
+
# Define the helper function that'll format numbers safely with Great Tables
|
|
13936
|
+
def _format_number_safe(value: int) -> str:
|
|
13937
|
+
if df_lib is not None:
|
|
13938
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
13939
|
+
return _format_single_number_with_gt(
|
|
13940
|
+
value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
|
|
13941
|
+
)
|
|
13942
|
+
else:
|
|
13943
|
+
# Fallback to the original behavior
|
|
13944
|
+
return str(vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0])
|
|
13945
|
+
|
|
12681
13946
|
return [
|
|
12682
13947
|
(
|
|
12683
|
-
(
|
|
12684
|
-
str(test_units[i])
|
|
12685
|
-
if test_units[i] < 10000
|
|
12686
|
-
else str(vals.fmt_number(test_units[i], n_sigfig=3, compact=True, locale=locale)[0])
|
|
12687
|
-
)
|
|
13948
|
+
(str(test_units[i]) if test_units[i] < 10000 else _format_number_safe(test_units[i]))
|
|
12688
13949
|
if active[i]
|
|
12689
13950
|
else "—"
|
|
12690
13951
|
)
|
|
@@ -12692,8 +13953,43 @@ def _transform_test_units(
|
|
|
12692
13953
|
]
|
|
12693
13954
|
|
|
12694
13955
|
|
|
12695
|
-
def _fmt_lg(value: int, locale: str) -> str:
|
|
12696
|
-
|
|
13956
|
+
def _fmt_lg(value: int, locale: str, df_lib=None) -> str:
|
|
13957
|
+
if df_lib is not None:
|
|
13958
|
+
# Use GT-based formatting if a DataFrame library is provided
|
|
13959
|
+
return _format_single_number_with_gt(
|
|
13960
|
+
value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
|
|
13961
|
+
)
|
|
13962
|
+
else:
|
|
13963
|
+
# Fallback to the original behavior
|
|
13964
|
+
return vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0]
|
|
13965
|
+
|
|
13966
|
+
|
|
13967
|
+
def _format_single_float_with_gt(
|
|
13968
|
+
value: float, decimals: int = 2, locale: str = "en", df_lib=None
|
|
13969
|
+
) -> str:
|
|
13970
|
+
if df_lib is None:
|
|
13971
|
+
# Use library detection to select appropriate DataFrame library
|
|
13972
|
+
if _is_lib_present("polars"):
|
|
13973
|
+
import polars as pl
|
|
13974
|
+
|
|
13975
|
+
df_lib = pl
|
|
13976
|
+
elif _is_lib_present("pandas"):
|
|
13977
|
+
import pandas as pd
|
|
13978
|
+
|
|
13979
|
+
df_lib = pd
|
|
13980
|
+
else:
|
|
13981
|
+
raise ImportError("Neither Polars nor Pandas is available for formatting")
|
|
13982
|
+
|
|
13983
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
13984
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
13985
|
+
|
|
13986
|
+
# Create GT object and format the column
|
|
13987
|
+
gt_obj = GT(df).fmt_number(columns="value", decimals=decimals, locale=locale)
|
|
13988
|
+
|
|
13989
|
+
# Extract the formatted value using _get_column_of_values
|
|
13990
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
13991
|
+
|
|
13992
|
+
return formatted_values[0] # Return the single formatted value
|
|
12697
13993
|
|
|
12698
13994
|
|
|
12699
13995
|
def _transform_passed_failed(
|
|
@@ -12702,14 +13998,24 @@ def _transform_passed_failed(
|
|
|
12702
13998
|
interrogation_performed: bool,
|
|
12703
13999
|
active: list[bool],
|
|
12704
14000
|
locale: str,
|
|
14001
|
+
df_lib=None,
|
|
12705
14002
|
) -> list[str]:
|
|
12706
14003
|
if not interrogation_performed:
|
|
12707
14004
|
return ["" for _ in range(len(n_passed_failed))]
|
|
12708
14005
|
|
|
14006
|
+
# Helper function to format numbers safely
|
|
14007
|
+
def _format_float_safe(value: float) -> str:
|
|
14008
|
+
if df_lib is not None:
|
|
14009
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
14010
|
+
return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
|
|
14011
|
+
else:
|
|
14012
|
+
# Fallback to the original behavior
|
|
14013
|
+
return vals.fmt_number(value, decimals=2, locale=locale)[0]
|
|
14014
|
+
|
|
12709
14015
|
passed_failed = [
|
|
12710
14016
|
(
|
|
12711
|
-
f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale)}"
|
|
12712
|
-
f"<br />{
|
|
14017
|
+
f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale, df_lib=df_lib)}"
|
|
14018
|
+
f"<br />{_format_float_safe(f_passed_failed[i])}"
|
|
12713
14019
|
if active[i]
|
|
12714
14020
|
else "—"
|
|
12715
14021
|
)
|
|
@@ -12920,41 +14226,122 @@ def _create_label_html(label: str | None, start_time: str) -> str:
|
|
|
12920
14226
|
)
|
|
12921
14227
|
|
|
12922
14228
|
|
|
12923
|
-
def
|
|
14229
|
+
def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) -> str:
|
|
14230
|
+
"""Format a single integer using Great Tables GT object to avoid pandas dependency."""
|
|
14231
|
+
if df_lib is None:
|
|
14232
|
+
# Use library detection to select appropriate DataFrame library
|
|
14233
|
+
if _is_lib_present("polars"):
|
|
14234
|
+
import polars as pl
|
|
14235
|
+
|
|
14236
|
+
df_lib = pl
|
|
14237
|
+
elif _is_lib_present("pandas"):
|
|
14238
|
+
import pandas as pd
|
|
14239
|
+
|
|
14240
|
+
df_lib = pd
|
|
14241
|
+
else:
|
|
14242
|
+
raise ImportError("Neither Polars nor Pandas is available for formatting")
|
|
14243
|
+
|
|
14244
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
14245
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
14246
|
+
|
|
14247
|
+
# Create GT object and format the column
|
|
14248
|
+
gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
|
|
14249
|
+
|
|
14250
|
+
# Extract the formatted value using _get_column_of_values
|
|
14251
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
14252
|
+
|
|
14253
|
+
return formatted_values[0] # Return the single formatted value
|
|
14254
|
+
|
|
14255
|
+
|
|
14256
|
+
def _format_single_float_with_gt_custom(
|
|
14257
|
+
value: float,
|
|
14258
|
+
decimals: int = 2,
|
|
14259
|
+
drop_trailing_zeros: bool = False,
|
|
14260
|
+
locale: str = "en",
|
|
14261
|
+
df_lib=None,
|
|
14262
|
+
) -> str:
|
|
14263
|
+
"""Format a single float with custom options using Great Tables GT object to avoid pandas dependency."""
|
|
14264
|
+
if df_lib is None:
|
|
14265
|
+
# Use library detection to select appropriate DataFrame library
|
|
14266
|
+
if _is_lib_present("polars"):
|
|
14267
|
+
import polars as pl
|
|
14268
|
+
|
|
14269
|
+
df_lib = pl
|
|
14270
|
+
elif _is_lib_present("pandas"):
|
|
14271
|
+
import pandas as pd
|
|
14272
|
+
|
|
14273
|
+
df_lib = pd
|
|
14274
|
+
else:
|
|
14275
|
+
raise ImportError("Neither Polars nor Pandas is available for formatting")
|
|
14276
|
+
|
|
14277
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
14278
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
14279
|
+
|
|
14280
|
+
# Create GT object and format the column
|
|
14281
|
+
gt_obj = GT(df).fmt_number(
|
|
14282
|
+
columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
|
|
14283
|
+
)
|
|
14284
|
+
|
|
14285
|
+
# Extract the formatted value using _get_column_of_values
|
|
14286
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
14287
|
+
|
|
14288
|
+
return formatted_values[0] # Return the single formatted value
|
|
14289
|
+
|
|
14290
|
+
|
|
14291
|
+
def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
|
|
12924
14292
|
if thresholds == Thresholds():
|
|
12925
14293
|
return ""
|
|
12926
14294
|
|
|
14295
|
+
# Helper functions to format numbers safely
|
|
14296
|
+
def _format_number_safe(value: float, decimals: int, drop_trailing_zeros: bool = False) -> str:
|
|
14297
|
+
if df_lib is not None and value is not None:
|
|
14298
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
14299
|
+
return _format_single_float_with_gt_custom(
|
|
14300
|
+
value,
|
|
14301
|
+
decimals=decimals,
|
|
14302
|
+
drop_trailing_zeros=drop_trailing_zeros,
|
|
14303
|
+
locale=locale,
|
|
14304
|
+
df_lib=df_lib,
|
|
14305
|
+
)
|
|
14306
|
+
else:
|
|
14307
|
+
# Fallback to the original behavior
|
|
14308
|
+
return fmt_number(
|
|
14309
|
+
value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
|
|
14310
|
+
)[0]
|
|
14311
|
+
|
|
14312
|
+
def _format_integer_safe(value: int) -> str:
|
|
14313
|
+
if df_lib is not None and value is not None:
|
|
14314
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
14315
|
+
return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
|
|
14316
|
+
else:
|
|
14317
|
+
# Fallback to the original behavior
|
|
14318
|
+
return fmt_integer(value, locale=locale)[0]
|
|
14319
|
+
|
|
12927
14320
|
warning = (
|
|
12928
|
-
|
|
12929
|
-
thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
|
|
12930
|
-
)[0]
|
|
14321
|
+
_format_number_safe(thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True)
|
|
12931
14322
|
if thresholds.warning_fraction is not None
|
|
12932
14323
|
else (
|
|
12933
|
-
|
|
14324
|
+
_format_integer_safe(thresholds.warning_count)
|
|
12934
14325
|
if thresholds.warning_count is not None
|
|
12935
14326
|
else "—"
|
|
12936
14327
|
)
|
|
12937
14328
|
)
|
|
12938
14329
|
|
|
12939
14330
|
error = (
|
|
12940
|
-
|
|
12941
|
-
0
|
|
12942
|
-
]
|
|
14331
|
+
_format_number_safe(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True)
|
|
12943
14332
|
if thresholds.error_fraction is not None
|
|
12944
14333
|
else (
|
|
12945
|
-
|
|
14334
|
+
_format_integer_safe(thresholds.error_count)
|
|
12946
14335
|
if thresholds.error_count is not None
|
|
12947
14336
|
else "—"
|
|
12948
14337
|
)
|
|
12949
14338
|
)
|
|
12950
14339
|
|
|
12951
14340
|
critical = (
|
|
12952
|
-
|
|
12953
|
-
thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
|
|
12954
|
-
)[0]
|
|
14341
|
+
_format_number_safe(thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True)
|
|
12955
14342
|
if thresholds.critical_fraction is not None
|
|
12956
14343
|
else (
|
|
12957
|
-
|
|
14344
|
+
_format_integer_safe(thresholds.critical_count)
|
|
12958
14345
|
if thresholds.critical_count is not None
|
|
12959
14346
|
else "—"
|
|
12960
14347
|
)
|