pointblank 0.11.6__py3-none-any.whl → 0.12.1__py3-none-any.whl
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +0 -1
- pointblank/_interrogation.py +244 -606
- pointblank/_utils.py +65 -3
- pointblank/assistant.py +9 -0
- pointblank/cli.py +39 -24
- pointblank/data/api-docs.txt +658 -29
- pointblank/schema.py +17 -0
- pointblank/segments.py +163 -0
- pointblank/validate.py +344 -92
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/METADATA +59 -6
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/RECORD +16 -15
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/WHEEL +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/top_level.txt +0 -0
pointblank/_utils.py
CHANGED

@@ -10,7 +10,7 @@ from great_tables import GT
 from great_tables.gt import _get_column_of_values
 from narwhals.typing import FrameT
 
-from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES
+from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES, IBIS_BACKENDS
 
 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -66,11 +66,13 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     except Exception as e:
         raise TypeError("The `data` object is not a DataFrame or Ibis Table.") from e
 
-    # Detect through regex if the table is a polars or
+    # Detect through regex if the table is a polars, pandas, or Spark DataFrame
     if re.search(r"polars", df_ns_str, re.IGNORECASE):
         return "polars"
     elif re.search(r"pandas", df_ns_str, re.IGNORECASE):
         return "pandas"
+    elif re.search(r"pyspark", df_ns_str, re.IGNORECASE):
+        return "pyspark"
 
     # If ibis is present, then get the table's backend name
     ibis_present = _is_lib_present(lib_name="ibis")
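The dispatch above is nothing more than a case-insensitive regular-expression match against the string form of the table's namespace. A minimal standalone sketch of the same idea, shown with Pandas (the variable names are illustrative, not pointblank internals):

import re

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})

# String form of the object's type, e.g. "<class 'pandas.core.frame.DataFrame'>"
df_ns_str = str(type(df))

# The same style of case-insensitive match used in _get_tbl_type()
if re.search(r"polars", df_ns_str, re.IGNORECASE):
    tbl_type = "polars"
elif re.search(r"pandas", df_ns_str, re.IGNORECASE):
    tbl_type = "pandas"
elif re.search(r"pyspark", df_ns_str, re.IGNORECASE):
    tbl_type = "pyspark"
else:
    tbl_type = "unknown"

print(tbl_type)  # "pandas"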
@@ -108,6 +110,41 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     return "unknown"  # pragma: no cover
 
 
+def _process_ibis_through_narwhals(data: FrameT | Any, tbl_type: str) -> tuple[FrameT | Any, str]:
+    """
+    Process Ibis tables through Narwhals to unify the processing pathway.
+
+    This function takes an Ibis table and wraps it with Narwhals, allowing
+    all downstream processing to use the unified Narwhals API instead of
+    Ibis-specific code paths.
+
+    Parameters
+    ----------
+    data : FrameT | Any
+        The data table, potentially an Ibis table
+    tbl_type : str
+        The detected table type
+
+    Returns
+    -------
+    tuple[FrameT | Any, str]
+        A tuple of (processed_data, updated_tbl_type) where:
+        - processed_data is the Narwhals-wrapped table if it was Ibis, otherwise original data
+        - updated_tbl_type is "narwhals" if it was Ibis, otherwise original tbl_type
+    """
+    # Check if this is an Ibis table type
+    if tbl_type in IBIS_BACKENDS:
+        try:
+            # Wrap with Narwhals
+            narwhals_wrapped = nw.from_native(data)
+            return narwhals_wrapped, "narwhals"
+        except Exception:
+            # If Narwhals can't handle it, fall back to original approach
+            return data, tbl_type
+
+    return data, tbl_type
+
+
 def _is_narwhals_table(data: any) -> bool:
     # Check if the data is a Narwhals DataFrame
     type_str = str(type(data)).lower()
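The new `_process_ibis_through_narwhals()` helper leans on `nw.from_native()` accepting backend-native tables directly. A minimal sketch of that wrapping step, shown here with Polars so it runs without a database backend (the pattern is the same when the input is an Ibis table):

import narwhals as nw
import polars as pl

native = pl.DataFrame({"a": [1, 2, 3]})

# Wrap the native table; downstream code can use the unified Narwhals API
# without knowing which backend produced the table.
wrapped = nw.from_native(native)
print(wrapped.columns)  # ['a']

# The original native object is still recoverable when needed
print(type(nw.to_native(wrapped)))  # <class 'polars.dataframe.frame.DataFrame'>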
@@ -164,7 +201,7 @@ def _check_any_df_lib(method_used: str) -> None:
 def _is_value_a_df(value: Any) -> bool:
     try:
         ns = nw.get_native_namespace(value)
-        if "polars" in str(ns) or "pandas" in str(ns):
+        if "polars" in str(ns) or "pandas" in str(ns) or "pyspark" in str(ns):
             return True
         else:  # pragma: no cover
             return False
@@ -619,6 +656,10 @@ def _get_api_text() -> str:
         "expr_col",
     ]
 
+    segments_exported = [
+        "seg_group",
+    ]
+
     interrogation_exported = [
         "Validate.interrogate",
         "Validate.get_tabular_report",
@@ -648,6 +689,12 @@ def _get_api_text() -> str:
         "assistant",
         "load_dataset",
         "get_data_path",
+        "connect_to_table",
+    ]
+
+    yaml_exported = [
+        "yaml_interrogate",
+        "validate_yaml",
     ]
 
     utility_exported = [
@@ -679,6 +726,10 @@ many steps). Furthermore, the `col()` function can be used to declare a comparis
 for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
 for comparison."""
 
+    segments_desc = (
+        """Combine multiple values into a single segment using `seg_*()` helper functions."""
+    )
+
     interrogation_desc = """The validation plan is put into action when `interrogate()` is called.
 The workflow for performing a comprehensive validation is then: (1) `Validate()`, (2) adding
 validation steps, (3) `interrogate()`. After interrogation of the data, we can view a validation
@@ -694,6 +745,11 @@ datasets included in the package can be accessed via the `load_dataset()` functi
 `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
 the `assistant()` function to get help with Pointblank."""
 
+    yaml_desc = """The *YAML* group contains functions that allow for the use of YAML to orchestrate
+validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow from
+YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
+passes its own validity checks."""
+
     utility_desc = """The Utility Functions group contains functions that are useful for accessing
 metadata about the target data. Use `get_column_count()` or `get_row_count()` to get the number of
 columns or rows in a table. The `get_action_metadata()` function is useful when building custom
@@ -718,12 +774,18 @@ table information, and timing details."""
     api_text += f"""\n## The Column Selection family\n\n{column_selection_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=column_selection_exported)
 
+    api_text += f"""\n## The Segments family\n\n{segments_desc}\n\n"""
+    api_text += get_api_details(module=pointblank, exported_list=segments_exported)
+
     api_text += f"""\n## The Interrogation and Reporting family\n\n{interrogation_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=interrogation_exported)
 
     api_text += f"""\n## The Inspection and Assistance family\n\n{inspect_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=inspect_exported)
 
+    api_text += f"""\n## The YAML family\n\n{yaml_desc}\n\n"""
+    api_text += get_api_details(module=pointblank, exported_list=yaml_exported)
+
     api_text += f"""\n## The Utility Functions family\n\n{utility_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=utility_exported)
 
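The YAML group documented above (`yaml_interrogate()` and `validate_yaml()`) drives a validation workflow from YAML strings or files. A hedged sketch of what a call could look like; the `tbl:`/`steps:` keys below are assumed for illustration and should be checked against the API docs rather than taken from this diff:

import pointblank as pb

# Illustrative YAML config; the exact schema shown here is an assumption
config = """
tbl: small_table
steps:
  - rows_distinct
  - col_vals_gt:
      columns: d
      value: 100
"""

pb.validate_yaml(config)  # check the configuration's own validity
result = pb.yaml_interrogate(config)  # run the workflow described by the YAML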
pointblank/assistant.py
CHANGED

@@ -138,10 +138,15 @@ def assistant(
 
     - Polars DataFrame (`"polars"`)
     - Pandas DataFrame (`"pandas"`)
+    - PySpark table (`"pyspark"`)
     - DuckDB table (`"duckdb"`)*
     - MySQL table (`"mysql"`)*
     - PostgreSQL table (`"postgresql"`)*
     - SQLite table (`"sqlite"`)*
+    - Microsoft SQL Server table (`"mssql"`)*
+    - Snowflake table (`"snowflake"`)*
+    - Databricks table (`"databricks"`)*
+    - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
     - CSV files (string path or `pathlib.Path` object with `.csv` extension)
     - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
@@ -152,6 +157,10 @@ def assistant(
     `ibis.expr.types.relations.Table`). Furthermore, using `assistant()` with these types of tables
     requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
     Pandas DataFrame, the availability of Ibis is not needed.
+
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
     """
 
     # Check that the chatlas package is installed
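The "Polars first, then Pandas" preference added to the docstring is a plain import-and-fall-back pattern. A minimal sketch of that behavior; `_read_csv_with_fallback()` is a hypothetical helper for illustration, not pointblank's actual implementation:

from pathlib import Path

def _read_csv_with_fallback(path: str | Path):
    # Prefer Polars when it is installed...
    try:
        import polars as pl

        return pl.read_csv(path)
    except ImportError:
        # ...otherwise fall back to Pandas
        import pandas as pd

        return pd.read_csv(path)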
pointblank/cli.py
CHANGED

@@ -1360,10 +1360,10 @@ def preview(
     For tables with many columns, use these options to control which columns are displayed:
 
     \b
-    - --columns: Specify exact columns (
-    - --col-range: Select column range (
-    - --col-first: Show first N columns (
-    - --col-last: Show last N columns (
+    - --columns: Specify exact columns (--columns "name,age,email")
+    - --col-range: Select column range (--col-range "1:10", --col-range "5:", --col-range ":15")
+    - --col-first: Show first N columns (--col-first 5)
+    - --col-last: Show last N columns (--col-last 3)
 
     Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
     """
@@ -1920,31 +1920,43 @@ def validate(
 
     AVAILABLE CHECK_TYPES:
 
-
-
-    The default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows.
+    Require no additional options:
 
     \b
     - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
     - rows-complete: Check if all rows are complete (no missing values in any column)
-
-
-
-
-    - col-
-    - col-vals-
-
+
+    Require --column:
+
+    \b
+    - col-exists: Check if a specific column exists in the dataset
+    - col-vals-not-null: Check if all values in a column are not null/missing
+
+    Require --column and --value:
+
+    \b
+    - col-vals-gt: Check if column values are greater than a fixed value
+    - col-vals-ge: Check if column values are greater than or equal to a fixed value
+    - col-vals-lt: Check if column values are less than a fixed value
+    - col-vals-le: Check if column values are less than or equal to a fixed value
+
+    Require --column and --set:
+
+    \b
+    - col-vals-in-set: Check if column values are in an allowed set
+
+    Use --list-checks to see all available validation methods with examples. The default CHECK_TYPE
+    is 'rows-distinct' which checks for duplicate rows.
 
     Examples:
 
     \b
-    pb validate data.csv
-    pb validate data.csv --list-checks
+    pb validate data.csv                 # Uses default validation (rows-distinct)
+    pb validate data.csv --list-checks   # Show all available checks
     pb validate data.csv --check rows-distinct
     pb validate data.csv --check rows-distinct --show-extract
     pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
     pb validate data.csv --check rows-distinct --exit-code
-    pb validate data.csv --check rows-complete
     pb validate data.csv --check col-exists --column price
     pb validate data.csv --check col-vals-not-null --column email
     pb validate data.csv --check col-vals-gt --column score --value 50
@@ -1952,7 +1964,6 @@ def validate(
 
     Multiple validations in one command:
     pb validate data.csv --check rows-distinct --check rows-complete
-    pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
     """
     try:
         import sys
@@ -4627,36 +4638,40 @@ def pl(
     pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
     pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"
 
+    \b
     # Multi-line with editor (supports multiple statements)
     pb pl --edit
 
+    \b
     # Multi-statement code example in editor:
     # csv = pl.read_csv('data.csv')
     # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)
 
+    \b
     # Multi-line with a specific editor
     pb pl --edit --editor nano
     pb pl --edit --editor code
     pb pl --edit --editor micro
 
+    \b
     # From file
     pb pl --file query.py
 
-
-
+    \b
+    Piping to other pb commands
+    pb pl "pl.read_csv('data.csv').head(20)" --pipe | pb validate --check rows-distinct
     pb pl --edit --pipe | pb preview --head 10
     pb pl --edit --pipe | pb scan --output-html report.html
     pb pl --edit --pipe | pb missing --output-html missing_report.html
 
-    Use --output-format to change how results are displayed:
-
     \b
+    Use --output-format to change how results are displayed:
     pb pl "pl.read_csv('data.csv')" --output-format scan
     pb pl "pl.read_csv('data.csv')" --output-format missing
     pb pl "pl.read_csv('data.csv')" --output-format info
 
-    Note: For multi-statement code, assign your final result to a variable like
-    '
+    Note: For multi-statement code, assign your final result to a variable like 'result', 'df',
+    'data', or ensure it's the last expression.
     """
     try:
         # Check if Polars is available