pointblank 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/compare.py CHANGED
@@ -10,6 +10,15 @@ if TYPE_CHECKING:
 
 class Compare:
     def __init__(self, a: IntoFrame, b: IntoFrame) -> None:
+        # Import processing functions from validate module
+        from pointblank.validate import _process_data
+
+        # Process input data for table a
+        a = _process_data(a)
+
+        # Process input data for table b
+        b = _process_data(b)
+
         self.a: IntoFrame = a
         self.b: IntoFrame = b
 
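The practical effect of this change is that both comparison inputs now accept the same source types as `Validate`. A minimal sketch, assuming `Compare` is imported from `pointblank.compare`; the two CSV paths are hypothetical placeholders:

```python
# Sketch only: with _process_data() applied in __init__, file paths (and the other
# supported source types) can be passed directly instead of pre-loaded DataFrames.
from pointblank.compare import Compare

comparison = Compare(a="data/orders_v1.csv", b="data/orders_v2.csv")
```
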
pointblank/datascan.py CHANGED
@@ -56,7 +56,9 @@ class DataScan:
     Parameters
     ----------
     data
-        The data to scan and summarize.
+        The data to scan and summarize. This could be a DataFrame object, an Ibis table object,
+        a CSV file path, a Parquet file path, a GitHub URL pointing to a CSV or Parquet file,
+        or a database connection string.
     tbl_name
         Optionally, the name of the table could be provided as `tbl_name`.
 
@@ -122,6 +124,14 @@ class DataScan:
 
     # TODO: This needs to be generically typed at the class level, ie. DataScan[T]
     def __init__(self, data: IntoFrameT, tbl_name: str | None = None) -> None:
+        # Import processing functions from validate module
+        from pointblank.validate import (
+            _process_data,
+        )
+
+        # Process input data to handle different data source types
+        data = _process_data(data)
+
         as_native = nw.from_native(data)
 
         if as_native.implementation.name == "IBIS" and as_native._level == "lazy":
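A minimal usage sketch of the updated constructor, assuming `DataScan` is re-exported at the package root; the DuckDB connection string mirrors the CLI example later in this diff and points at a hypothetical database file:

```python
import pointblank as pb

# The constructor now routes its input through _process_data(), so a database
# connection string (or a CSV/Parquet path, or a GitHub URL) is accepted directly.
scan = pb.DataScan("duckdb:///data/sales.ddb::customers", tbl_name="customers")
```
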
@@ -514,8 +524,9 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     Parameters
     ----------
     data
-        The table to summarize, which could be a DataFrame object or an Ibis table object. Read the
-        *Supported Input Table Types* section for details on the supported table types.
+        The table to summarize, which could be a DataFrame object, an Ibis table object, a CSV
+        file path, a Parquet file path, or a database connection string. Read the *Supported Input
+        Table Types* section for details on the supported table types.
     tbl_name
         Optionally, the name of the table could be provided as `tbl_name=`.
 
@@ -535,6 +546,11 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     - PostgreSQL table (`"postgresql"`)*
     - SQLite table (`"sqlite"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - GitHub URLs (direct links to CSV or Parquet files on GitHub)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `col_summary_tbl()` with these types of
@@ -566,5 +582,11 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     ```
     """
 
+    # Import processing functions from validate module
+    from pointblank.validate import _process_data
+
+    # Process input data to handle different data source types
+    data = _process_data(data)
+
     scanner = DataScan(data=data, tbl_name=tbl_name)
     return scanner.get_tabular_report()
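Since `col_summary_tbl()` now calls `_process_data()` before constructing the `DataScan`, file paths work end to end. A small sketch using the package's sample-data helper shown in other docstrings in this diff, and assuming `col_summary_tbl` is re-exported at the package root:

```python
import pointblank as pb

# Resolve a bundled CSV file, then summarize it directly from its path
csv_path = pb.get_data_path("global_sales", "csv")
pb.col_summary_tbl(csv_path, tbl_name="global_sales")
```
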
pointblank/validate.py CHANGED
@@ -735,9 +735,157 @@ def get_data_path(
         return tmp_file.name
 
 
-# =============================================================================
-# Utility functions for processing input data (shared by preview() and Validate class)
-# =============================================================================
+def _process_data(data: FrameT | Any) -> FrameT | Any:
+    """
+    Centralized data processing pipeline that handles all supported input types.
+
+    This function consolidates the data processing pipeline used across multiple
+    classes and functions in Pointblank. It processes data through a consistent
+    sequence of transformations to handle different data source types.
+
+    The processing order is important:
+
+    1. GitHub URLs (must come before connection string processing)
+    2. Database connection strings
+    3. CSV file paths
+    4. Parquet file paths
+
+    Parameters
+    ----------
+    data : FrameT | Any
+        The input data which could be:
+        - a DataFrame object (Polars, Pandas, Ibis, etc.)
+        - a GitHub URL pointing to a CSV or Parquet file
+        - a database connection string (e.g., "duckdb:///path/to/file.ddb::table_name")
+        - a CSV file path (string or Path object with .csv extension)
+        - a Parquet file path, glob pattern, directory, or partitioned dataset
+        - any other data type (returned unchanged)
+
+    Returns
+    -------
+    FrameT | Any
+        Processed data as a DataFrame if input was a supported data source type,
+        otherwise the original data unchanged.
+    """
+    # Handle GitHub URL input (e.g., "https://github.com/user/repo/blob/main/data.csv")
+    data = _process_github_url(data)
+
+    # Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
+    data = _process_connection_string(data)
+
+    # Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
+    data = _process_csv_input(data)
+
+    # Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
+    data = _process_parquet_input(data)
+
+    return data
+
+
+def _process_github_url(data: FrameT | Any) -> FrameT | Any:
+    """
+    Process data parameter to handle GitHub URLs pointing to CSV or Parquet files.
+
+    Handles both standard GitHub URLs and raw GitHub content URLs, downloading the content
+    and processing it as a local file.
+
+    Supports:
+    - Standard github.com URLs pointing to CSV or Parquet files (automatically transformed to raw URLs)
+    - Raw raw.githubusercontent.com URLs pointing to CSV or Parquet files (processed directly)
+    - Both CSV and Parquet file formats
+    - Automatic temporary file management and cleanup
+
+    Parameters
+    ----------
+    data : FrameT | Any
+        The data parameter which may be a GitHub URL string or any other data type.
+
+    Returns
+    -------
+    FrameT | Any
+        If the input is a supported GitHub URL, returns a DataFrame loaded from the downloaded file.
+        Otherwise, returns the original data unchanged.
+
+    Examples
+    --------
+    Standard GitHub URL (automatically transformed):
+    >>> url = "https://github.com/user/repo/blob/main/data.csv"
+    >>> df = _process_github_url(url)
+
+    Raw GitHub URL (used directly):
+    >>> raw_url = "https://raw.githubusercontent.com/user/repo/main/data.csv"
+    >>> df = _process_github_url(raw_url)
+    """
+    import re
+    import tempfile
+    from urllib.parse import urlparse
+    from urllib.request import urlopen
+
+    # Check if data is a string that looks like a GitHub URL
+    if not isinstance(data, str):
+        return data
+
+    # Parse the URL to check if it's a GitHub URL
+    try:
+        parsed = urlparse(data)
+    except Exception:
+        return data
+
+    # Check if it's a GitHub URL (standard or raw)
+    is_standard_github = parsed.netloc in ["github.com", "www.github.com"]
+    is_raw_github = parsed.netloc == "raw.githubusercontent.com"
+
+    if not (is_standard_github or is_raw_github):
+        return data
+
+    # Check if it points to a CSV or Parquet file
+    path_lower = parsed.path.lower()
+    if not (path_lower.endswith(".csv") or path_lower.endswith(".parquet")):
+        return data
+
+    # Determine the raw URL to download from
+    if is_raw_github:
+        # Already a raw GitHub URL, use it directly
+        raw_url = data
+    else:
+        # Transform GitHub URL to raw content URL
+        # Pattern: https://github.com/user/repo/blob/branch/path/file.ext
+        # Becomes: https://raw.githubusercontent.com/user/repo/branch/path/file.ext
+        github_pattern = r"github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)"
+        match = re.search(github_pattern, data)
+
+        if not match:
+            # If URL doesn't match expected GitHub blob pattern, return original data
+            return data
+
+        user, repo, branch, file_path = match.groups()
+        raw_url = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{file_path}"
+
+    # Download the file content to a temporary file
+    try:
+        with urlopen(raw_url) as response:
+            content = response.read()
+
+        # Determine file extension
+        file_ext = ".csv" if path_lower.endswith(".csv") else ".parquet"
+
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_ext, delete=False) as tmp_file:
+            tmp_file.write(content)
+            tmp_file_path = tmp_file.name
+
+        # Process the temporary file using existing CSV or Parquet processing functions
+        if file_ext == ".csv":
+            return _process_csv_input(tmp_file_path)
+        else:  # .parquet
+            return _process_parquet_input(tmp_file_path)
+
+    except Exception:
+        # If download or processing fails, return original data
+        return data
+
+    except Exception as e:
+        raise RuntimeError(f"Failed to download or process GitHub file from {raw_url}: {e}") from e
 
 
 def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
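The heart of `_process_github_url()` is the blob-to-raw URL rewrite. This standalone sketch isolates that step using the same regex as the hunk above (the example URL is a placeholder), which can help when reasoning about which URLs will be accepted:

```python
import re


def to_raw_github_url(url: str) -> str:
    # Rewrite https://github.com/user/repo/blob/branch/path/file.ext
    # to      https://raw.githubusercontent.com/user/repo/branch/path/file.ext
    pattern = r"github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)"
    match = re.search(pattern, url)
    if not match:
        return url  # not a recognizable blob URL; leave unchanged
    user, repo, branch, path = match.groups()
    return f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"


print(to_raw_github_url("https://github.com/user/repo/blob/main/data.csv"))
# https://raw.githubusercontent.com/user/repo/main/data.csv
```
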
@@ -1215,14 +1363,7 @@ def preview(
     """
 
     # Process input data to handle different data source types
-    # Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
-    data = _process_connection_string(data)
-
-    # Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
-    data = _process_csv_input(data)
-
-    # Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
-    data = _process_parquet_input(data)
+    data = _process_data(data)
 
     if incl_header is None:
         incl_header = global_config.preview_incl_header
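With `preview()` delegating to `_process_data()`, a GitHub-hosted file can be previewed in one call; a minimal sketch (the URL is a placeholder and the call requires network access):

```python
import pointblank as pb

# The URL is rewritten to its raw.githubusercontent.com form, downloaded to a
# temporary file, and loaded with Polars or Pandas before previewing.
pb.preview("https://github.com/user/repo/blob/main/data.csv")
```
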
@@ -1635,9 +1776,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     Parameters
     ----------
     data
-        The table for which to display the missing values. This could be a DataFrame object or an
-        Ibis table object. Read the *Supported Input Table Types* section for details on the
-        supported table types.
+        The table for which to display the missing values. This could be a DataFrame object, an
+        Ibis table object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -1660,6 +1801,10 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `missing_vals_tbl()` with these types of
@@ -1702,6 +1847,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     sector. Many columns have no missing values at all, and those sectors are colored light blue.
     """
 
+    # Process input data to handle different data source types
+    data = _process_data(data)
+
     # Make a copy of the data to avoid modifying the original
     data = copy.deepcopy(data)
 
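A corresponding usage sketch for `missing_vals_tbl()`; the Parquet path is a hypothetical local file:

```python
import pointblank as pb

# File paths now flow through _process_data() before the missing-values report is built
pb.missing_vals_tbl("data/orders.parquet")
```
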
@@ -2164,14 +2312,15 @@ def get_column_count(data: FrameT | Any) -> int:
 
     The `get_column_count()` function returns the number of columns in a table. The function works
     with any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis
-    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.).
+    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.). It also supports
+    direct input of CSV files, Parquet files, and database connection strings.
 
     Parameters
     ----------
     data
-        The table for which to get the column count, which could be a DataFrame object or an Ibis
-        table object. Read the *Supported Input Table Types* section for details on the supported
-        table types.
+        The table for which to get the column count, which could be a DataFrame object, an Ibis
+        table object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -2194,12 +2343,39 @@ def get_column_count(data: FrameT | Any) -> int:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `get_column_count()` with these types of
     tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a
     Polars or Pandas DataFrame, the availability of Ibis is not needed.
 
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
+
+    GitHub URLs pointing to CSV or Parquet files are automatically detected and converted to raw
+    content URLs for downloading. The URL format should be:
+    `https://github.com/user/repo/blob/branch/path/file.csv` or
+    `https://github.com/user/repo/blob/branch/path/file.parquet`
+
+    Connection strings follow database URL formats and must also specify a table using the
+    `::table_name` suffix. Examples include:
+
+    ```
+    "duckdb:///path/to/database.ddb::table_name"
+    "sqlite:///path/to/database.db::table_name"
+    "postgresql://user:password@localhost:5432/database::table_name"
+    "mysql://user:password@localhost:3306/database::table_name"
+    "bigquery://project/dataset::table_name"
+    "snowflake://user:password@account/database/schema::table_name"
+    ```
+
+    When using connection strings, the Ibis library with the appropriate backend driver is required.
+
     Examples
     --------
     To get the number of columns in a table, we can use the `get_column_count()` function. Here's an
@@ -2224,9 +2400,63 @@ def get_column_count(data: FrameT | Any) -> int:
     pb.get_column_count(small_table_duckdb)
     ```
 
+    #### Working with CSV Files
+
+    The `get_column_count()` function can directly accept CSV file paths:
+
+    ```{python}
+    # Get a path to a CSV file from the package data
+    csv_path = pb.get_data_path("global_sales", "csv")
+
+    pb.get_column_count(csv_path)
+    ```
+
+    #### Working with Parquet Files
+
+    The function supports various Parquet input formats:
+
+    ```{python}
+    # Single Parquet file from package data
+    parquet_path = pb.get_data_path("nycflights", "parquet")
+
+    pb.get_column_count(parquet_path)
+    ```
+
+    You can also use glob patterns and directories:
+
+    ```python
+    # Multiple Parquet files with glob patterns
+    pb.get_column_count("data/sales_*.parquet")
+
+    # Directory containing Parquet files
+    pb.get_column_count("parquet_data/")
+
+    # Partitioned Parquet dataset
+    pb.get_column_count("sales_data/")  # Auto-discovers partition columns
+    ```
+
+    #### Working with Database Connection Strings
+
+    The function supports database connection strings for direct access to database tables:
+
+    ```{python}
+    # Get path to a DuckDB database file from package data
+    duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+    pb.get_column_count(f"duckdb:///{duckdb_path}::game_revenue")
+    ```
+
     The function always returns the number of columns in the table as an integer value, which is
     `8` for the `small_table` dataset.
     """
+    from pathlib import Path
+
+    # Process different input types
+    if isinstance(data, str) or isinstance(data, Path):
+        data = _process_data(data)
+    elif isinstance(data, list):
+        # Handle list of file paths (likely Parquet files)
+        data = _process_parquet_input(data)
 
     if "ibis.expr.types.relations.Table" in str(type(data)):
         return len(data.columns)
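Note that the new function body also accepts a plain Python list, which is handed to `_process_parquet_input()`; a small sketch with hypothetical file names:

```python
import pointblank as pb

# A list input is treated as a collection of Parquet paths
n_cols = pb.get_column_count(["sales/2024-01.parquet", "sales/2024-02.parquet"])
```
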
@@ -2250,14 +2480,15 @@ def get_row_count(data: FrameT | Any) -> int:
 
     The `get_row_count()` function returns the number of rows in a table. The function works with
     any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis
-    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.).
+    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.). It also supports
+    direct input of CSV files, Parquet files, and database connection strings.
 
     Parameters
     ----------
     data
-        The table for which to get the row count, which could be a DataFrame object or an Ibis table
-        object. Read the *Supported Input Table Types* section for details on the supported table
-        types.
+        The table for which to get the row count, which could be a DataFrame object, an Ibis table
+        object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -2280,12 +2511,40 @@ def get_row_count(data: FrameT | Any) -> int:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - GitHub URLs (direct links to CSV or Parquet files on GitHub)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `get_row_count()` with these types of
     tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a
     Polars or Pandas DataFrame, the availability of Ibis is not needed.
 
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
+
+    GitHub URLs pointing to CSV or Parquet files are automatically detected and converted to raw
+    content URLs for downloading. The URL format should be:
+    `https://github.com/user/repo/blob/branch/path/file.csv` or
+    `https://github.com/user/repo/blob/branch/path/file.parquet`
+
+    Connection strings follow database URL formats and must also specify a table using the
+    `::table_name` suffix. Examples include:
+
+    ```
+    "duckdb:///path/to/database.ddb::table_name"
+    "sqlite:///path/to/database.db::table_name"
+    "postgresql://user:password@localhost:5432/database::table_name"
+    "mysql://user:password@localhost:3306/database::table_name"
+    "bigquery://project/dataset::table_name"
+    "snowflake://user:password@account/database/schema::table_name"
+    ```
+
+    When using connection strings, the Ibis library with the appropriate backend driver is required.
+
     Examples
     --------
     Getting the number of rows in a table is easily done by using the `get_row_count()` function.
@@ -2310,9 +2569,63 @@ def get_row_count(data: FrameT | Any) -> int:
     pb.get_row_count(game_revenue_duckdb)
     ```
 
+    #### Working with CSV Files
+
+    The `get_row_count()` function can directly accept CSV file paths:
+
+    ```{python}
+    # Get a path to a CSV file from the package data
+    csv_path = pb.get_data_path("global_sales", "csv")
+
+    pb.get_row_count(csv_path)
+    ```
+
+    #### Working with Parquet Files
+
+    The function supports various Parquet input formats:
+
+    ```{python}
+    # Single Parquet file from package data
+    parquet_path = pb.get_data_path("nycflights", "parquet")
+
+    pb.get_row_count(parquet_path)
+    ```
+
+    You can also use glob patterns and directories:
+
+    ```python
+    # Multiple Parquet files with glob patterns
+    pb.get_row_count("data/sales_*.parquet")
+
+    # Directory containing Parquet files
+    pb.get_row_count("parquet_data/")
+
+    # Partitioned Parquet dataset
+    pb.get_row_count("sales_data/")  # Auto-discovers partition columns
+    ```
+
+    #### Working with Database Connection Strings
+
+    The function supports database connection strings for direct access to database tables:
+
+    ```{python}
+    # Get path to a DuckDB database file from package data
+    duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+    pb.get_row_count(f"duckdb:///{duckdb_path}::game_revenue")
+    ```
+
     The function always returns the number of rows in the table as an integer value, which is `2000`
     for the `game_revenue` dataset.
     """
+    from pathlib import Path
+
+    # Process different input types
+    if isinstance(data, str) or isinstance(data, Path):
+        data = _process_data(data)
+    elif isinstance(data, list):
+        # Handle list of file paths (likely Parquet files)
+        data = _process_parquet_input(data)
 
     if "ibis.expr.types.relations.Table" in str(type(data)):
         # Determine whether Pandas or Polars is available to get the row count
@@ -2717,13 +3030,15 @@ class Validate:
     ----------
     data
         The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
-        file path, a Parquet file path, or a database connection string. When providing a CSV or
-        Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
-        loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
-        glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
-        Connection strings enable direct database access via Ibis with optional table specification
-        using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
-        on the supported table types.
+        file path, a Parquet file path, a GitHub URL pointing to a CSV or Parquet file, or a
+        database connection string. When providing a CSV or Parquet file path (as a string or
+        `pathlib.Path` object), the file will be automatically loaded using an available DataFrame
+        library (Polars or Pandas). Parquet input also supports glob patterns, directories
+        containing .parquet files, and Spark-style partitioned datasets. GitHub URLs are
+        automatically transformed to raw content URLs and downloaded. Connection strings enable
+        direct database access via Ibis with optional table specification using the `::table_name`
+        suffix. Read the *Supported Input Table Types* section for details on the supported table
+        types.
     tbl_name
         An optional name to assign to the input table object. If no value is provided, a name will
         be generated based on whatever information is available. This table name will be displayed
@@ -3243,14 +3558,8 @@ class Validate:
     locale: str | None = None
 
     def __post_init__(self):
-        # Handle connection string input for the data parameter
-        self.data = _process_connection_string(self.data)
-
-        # Handle CSV file input for the data parameter
-        self.data = _process_csv_input(self.data)
-
-        # Handle Parquet file input for the data parameter
-        self.data = _process_parquet_input(self.data)
+        # Process data through the centralized data processing pipeline
+        self.data = _process_data(self.data)
 
         # Check input of the `thresholds=` argument
         _check_thresholds(thresholds=self.thresholds)
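Taken together, these changes let a validation plan start straight from a GitHub-hosted file. A minimal sketch with a placeholder URL and an illustrative column name; `col-vals-not-null` appears among the CLI checks later in this diff, and the method-chaining pattern follows the library's documented workflow:

```python
import pointblank as pb

# The GitHub URL is downloaded and loaded by _process_data() in __post_init__
validation = (
    pb.Validate(data="https://github.com/user/repo/blob/main/sales.csv")
    .col_vals_not_null(columns="customer_id")
    .interrogate()
)
```
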
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pointblank
-Version: 0.11.0
+Version: 0.11.2
 Summary: Find out if your data is what you think it is.
 Author-email: Richard Iannone <riannone@me.com>
 License: MIT License
@@ -156,11 +156,11 @@ validation
 
 ## Why Choose Pointblank?
 
-- **Works with your existing stack** - Seamlessly integrates with Polars, Pandas, DuckDB, MySQL, PostgreSQL, SQLite, Parquet, PySpark, Snowflake, and more!
-- **Beautiful, interactive reports** - Crystal-clear validation results that highlight issues and help communicate data quality
-- **Composable validation pipeline** - Chain validation steps into a complete data quality workflow
-- **Threshold-based alerts** - Set 'warning', 'error', and 'critical' thresholds with custom actions
-- **Practical outputs** - Use validation results to filter tables, extract problematic data, or trigger downstream processes
+- **Works with your existing stack**: Seamlessly integrates with Polars, Pandas, DuckDB, MySQL, PostgreSQL, SQLite, Parquet, PySpark, Snowflake, and more!
+- **Beautiful, interactive reports**: Crystal-clear validation results that highlight issues and help communicate data quality
+- **Composable validation pipeline**: Chain validation steps into a complete data quality workflow
+- **Threshold-based alerts**: Set 'warning', 'error', and 'critical' thresholds with custom actions
+- **Practical outputs**: Use validation results to filter tables, extract problematic data, or trigger downstream processes
 
 ## Real-World Example
 
@@ -240,7 +240,7 @@ validation.get_step_report(i=3).show("browser") # Get failing records from step
 Pointblank includes a powerful CLI utility called `pb` that lets you run data validation workflows directly from the command line. Perfect for CI/CD pipelines, scheduled data quality checks, or quick validation tasks.
 
 <div align="center">
-  <img src="https://posit-dev.github.io/pointblank/assets/vhs/cli-complete-workflow.gif" width="800px">
+  <img src="https://posit-dev.github.io/pointblank/assets/vhs/cli-complete-workflow.gif" width="100%">
 </div>
 
 **Explore Your Data**
@@ -249,43 +249,47 @@ Pointblank includes a powerful CLI utility called `pb` that lets you run data va
 # Get a quick preview of your data
 pb preview small_table
 
-# Check for missing values
-pb missing small_table
+# Preview data from GitHub URLs
+pb preview "https://github.com/user/repo/blob/main/data.csv"
 
-# Generate column summaries
-pb scan small_table
+# Check for missing values in Parquet files
+pb missing data.parquet
+
+# Generate column summaries from database connections
+pb scan "duckdb:///data/sales.ddb::customers"
 ```
 
 **Run Essential Validations**
 
 ```bash
 # Check for duplicate rows
-pb validate-simple small_table --check rows-distinct
+pb validate small_table --check rows-distinct
+
+# Validate data directly from GitHub
+pb validate "https://github.com/user/repo/blob/main/sales.csv" --check col-vals-not-null --column customer_id
 
-# Verify no null values
-pb validate-simple small_table --check col-vals-not-null --column a
+# Verify no null values in Parquet datasets
+pb validate "data/*.parquet" --check col-vals-not-null --column a
 
 # Extract failing data for debugging
-pb validate-simple small_table --check col-vals-gt --column a --value 5 --show-extract
+pb validate small_table --check col-vals-gt --column a --value 5 --show-extract
 ```
 
 **Integrate with CI/CD**
 
 ```bash
 # Use exit codes for automation (0 = pass, 1 = fail)
-pb validate-simple small_table --check rows-distinct && echo "✅ Quality checks passed"
+pb validate small_table --check rows-distinct --exit-code
 ```
 
-Learn more in our [CLI documentation](https://posit-dev.github.io/pointblank/user-guide/cli.html).
-
 ## Features That Set Pointblank Apart
 
-- **Complete validation workflow** - From data access to validation to reporting in a single pipeline
-- **Built for collaboration** - Share results with colleagues through beautiful interactive reports
-- **Practical outputs** - Get exactly what you need: counts, extracts, summaries, or full reports
-- **Flexible deployment** - Use in notebooks, scripts, or data pipelines
-- **Customizable** - Tailor validation steps and reporting to your specific needs
-- **Internationalization** - Reports can be generated in over 20 languages, including English, Spanish, French, and German
+- **Complete validation workflow**: From data access to validation to reporting in a single pipeline
+- **Built for collaboration**: Share results with colleagues through beautiful interactive reports
+- **Practical outputs**: Get exactly what you need: counts, extracts, summaries, or full reports
+- **Flexible deployment**: Use in notebooks, scripts, or data pipelines
+- **Customizable**: Tailor validation steps and reporting to your specific needs
+- **Internationalization**: Reports can be generated in over 20 languages, including English, Spanish, French, and German
 
 ## Documentation and Examples