PyPI - pointblank - Versions diffs - 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl - Mend

pointblank 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

pointblank/__init__.py +2 -0
pointblank/_agg.py +120 -0
pointblank/_constants.py +192 -5
pointblank/_datascan_utils.py +28 -10
pointblank/_interrogation.py +202 -149
pointblank/_typing.py +12 -0
pointblank/_utils.py +81 -44
pointblank/_utils_ai.py +4 -5
pointblank/_utils_check_args.py +3 -3
pointblank/_utils_llms_txt.py +40 -2
pointblank/actions.py +1 -1
pointblank/assistant.py +2 -3
pointblank/cli.py +1 -1
pointblank/column.py +162 -46
pointblank/data/api-docs.txt +2695 -49
pointblank/datascan.py +17 -17
pointblank/draft.py +2 -3
pointblank/scan_profile.py +2 -1
pointblank/schema.py +61 -20
pointblank/thresholds.py +15 -13
pointblank/validate.py +780 -231
pointblank/validate.pyi +1104 -0
pointblank/yaml.py +10 -6
{pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/METADATA +2 -2
{pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/RECORD +29 -27
{pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/licenses/LICENSE +1 -1
{pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/WHEEL +0 -0
{pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/entry_points.txt +0 -0
{pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/top_level.txt +0 -0

pointblank/datascan.py CHANGED Viewed

@@ -3,12 +3,11 @@ from __future__ import annotations
 import contextlib
 import json
 from importlib.metadata import version
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 import narwhals as nw
 from great_tables import GT, google_font, html, loc, style
 from narwhals.dataframe import LazyFrame
-from narwhals.typing import FrameT
 from pointblank._utils_html import _create_table_dims_html, _create_table_type_html, _fmt_frac
 from pointblank.scan_profile import ColumnProfile, _as_physical, _DataProfile, _TypeMap
@@ -18,7 +17,7 @@ if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence
     from narwhals.dataframe import DataFrame
-    from narwhals.typing import Frame, IntoFrameT
+    from narwhals.typing import Frame
     from pointblank.scan_profile_stats import StatGroup
@@ -123,7 +122,7 @@ class DataScan:
     """
     # TODO: This needs to be generically typed at the class level, ie. DataScan[T]
-    def __init__(self, data: IntoFrameT, tbl_name: str | None = None) -> None:
+    def __init__(self, data: Any, tbl_name: str | None = None) -> None:
         # Import processing functions from validate module
         from pointblank.validate import (
             _process_data,
@@ -172,7 +171,7 @@ class DataScan:
             implementation=self.nw_data.implementation,
         )
         for column in columns:
-            col_data: DataFrame = self.nw_data.select(column)
+            col_data: Frame = self.nw_data.select(column)
             ## Handle dtyping:
             native_dtype = schema[column]
@@ -183,7 +182,7 @@ class DataScan:
             except NotImplementedError:
                 continue
-            col_profile = ColumnProfile(colname=column, coltype=native_dtype)
+            col_profile = ColumnProfile(colname=column, coltype=str(native_dtype))
             ## Collect Sample Data:
             ## This is the most consistent way (i think) to get the samples out of the data.
@@ -205,7 +204,7 @@ class DataScan:
         return profile
     @property
-    def summary_data(self) -> IntoFrameT:
+    def summary_data(self) -> Any:
         return self.profile.as_dataframe(strict=False).to_native()
     def get_tabular_report(self, *, show_sample_data: bool = False) -> GT:
@@ -318,11 +317,10 @@ class DataScan:
         # format fractions:
         # this is an anti-pattern but there's no serious alternative
+        _backend = cast(Any, self.profile.implementation)
         for _fmt_col in ("__frac_n_unique", "__frac_n_missing"):
             _formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
-            formatted: nw.Series = nw.new_series(
-                _fmt_col, values=_formatted, backend=self.profile.implementation
-            )
+            formatted: nw.Series = nw.new_series(_fmt_col, values=_formatted, backend=_backend)
             formatted_data = formatted_data.drop(_fmt_col)
             formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))
@@ -365,10 +363,10 @@ class DataScan:
                         trues.append(None)
                         falses.append(None)
                 true_ser: nw.Series = nw.new_series(
-                    name="__freq_true", values=trues, backend=self.profile.implementation
+                    name="__freq_true", values=trues, backend=_backend
                 )
                 false_ser: nw.Series = nw.new_series(
-                    name="__freq_false", values=falses, backend=self.profile.implementation
+                    name="__freq_false", values=falses, backend=_backend
                 )
                 formatted_data = formatted_data.with_columns(
                     __freq_true=true_ser, __freq_false=false_ser
@@ -382,9 +380,7 @@ class DataScan:
             )
             for _fmt_col in ("__pct_true", "__pct_false"):
                 _formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
-                formatted = nw.new_series(
-                    name=_fmt_col, values=_formatted, backend=self.profile.implementation
-                )
+                formatted = nw.new_series(name=_fmt_col, values=_formatted, backend=_backend)
                 formatted_data = formatted_data.drop(_fmt_col)
                 formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))
@@ -459,7 +455,11 @@ class DataScan:
             )
             .tab_style(style=style.text(size="12px"), locations=loc.body(columns="colname"))
             .cols_width(
-                icon="35px", colname="200px", **{stat_col: "60px" for stat_col in present_stat_cols}
+                cases={
+                    "icon": "35px",
+                    "colname": "200px",
+                    **{stat_col: "60px" for stat_col in present_stat_cols},
+                }
             )
         )
@@ -498,7 +498,7 @@ class DataScan:
             json.dump(json_string, f, indent=4)
-def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
+def col_summary_tbl(data: Any, tbl_name: str | None = None) -> GT:
     """
     Generate a column-level summary table of a dataset.

pointblank/draft.py CHANGED Viewed

@@ -4,7 +4,6 @@ from dataclasses import dataclass, field
 from typing import Any
 from importlib_resources import files
-from narwhals.typing import FrameT
 from pointblank._constants import MODEL_PROVIDERS
 from pointblank.datascan import DataScan
@@ -223,7 +222,7 @@ class DraftValidation:
     be replaced with the actual data variable.
     """
-    data: FrameT | Any
+    data: Any
     model: str
     api_key: str | None = None
     verify_ssl: bool = True
@@ -328,7 +327,7 @@ class DraftValidation:
         if provider == "anthropic":  # pragma: no cover
             # Check that the anthropic package is installed
             try:
-                import anthropic  # noqa
+                import anthropic  # noqa  # type: ignore[import-not-found]
             except ImportError:  # pragma: no cover
                 raise ImportError(  # pragma: no cover
                     "The `anthropic` package is required to use the `DraftValidation` class with "

pointblank/scan_profile.py CHANGED Viewed

@@ -5,7 +5,7 @@ from collections import defaultdict
 from collections.abc import Sequence
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, ClassVar
 import narwhals as nw
 from narwhals.dataframe import DataFrame
@@ -96,6 +96,7 @@ class ColumnProfile(_ColumnProfileABC):
     colname: str
     coltype: str
     statistics: MutableSequence[Stat] = field(default_factory=lambda: [])
+    _type: ClassVar[_TypeMap]  # Defined by subclasses
     @property
     def sample_data(self) -> Sequence[Any]:

pointblank/schema.py CHANGED Viewed

@@ -2,12 +2,16 @@ from __future__ import annotations
 import copy
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
 import narwhals as nw
 from pointblank._constants import IBIS_BACKENDS
 from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table
+if TYPE_CHECKING:
+    from typing import Any
 __all__ = ["Schema", "_check_schema_match"]
@@ -269,17 +273,15 @@ class Schema:
     `Schema` object is used in a validation workflow.
     """
-    columns: str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None = (
-        None
-    )
-    tbl: any | None = None
+    columns: list[tuple[str, ...]] | None = None
+    tbl: Any | None = None
     def __init__(
         self,
         columns: (
             str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None
         ) = None,
-        tbl: any | None = None,
+        tbl: Any | None = None,
         **kwargs,
     ):
         if tbl is None and columns is None and not kwargs:
@@ -387,6 +389,8 @@ class Schema:
         bool
             True if the columns are the same, False otherwise.
         """
+        if self.columns is None or other.columns is None:
+            return self.columns is None and other.columns is None
         if not case_sensitive_colnames:
             this_column_list = [col.lower() for col in self.get_column_list()]
@@ -463,6 +467,8 @@ class Schema:
         bool
             True if the columns are the same, False otherwise.
         """
+        if self.columns is None or other.columns is None:
+            return self.columns is None and other.columns is None
         if not case_sensitive_colnames:
             this_column_list = [col.lower() for col in self.get_column_list()]
@@ -547,6 +553,8 @@ class Schema:
         bool
             True if the columns are the same, False otherwise.
         """
+        if self.columns is None or other.columns is None:
+            return self.columns is None and other.columns is None
         if not case_sensitive_colnames:
             this_column_list = [col.lower() for col in self.get_column_list()]
@@ -633,6 +641,8 @@ class Schema:
         bool
             True if the columns are the same, False otherwise.
         """
+        if self.columns is None or other.columns is None:
+            return self.columns is None and other.columns is None
         if not case_sensitive_colnames:
             this_column_list = [col.lower() for col in self.get_column_list()]
@@ -702,6 +712,8 @@ class Schema:
         list[str]
             A list of column names.
         """
+        if self.columns is None:
+            return []
         return [col[0] for col in self.columns]
     def get_dtype_list(self) -> list[str]:
@@ -713,9 +725,11 @@ class Schema:
         list[str]
             A list of data types.
         """
+        if self.columns is None:
+            return []
         return [col[1] for col in self.columns]
-    def get_schema_coerced(self, to: str | None = None) -> dict[str, str]:
+    def get_schema_coerced(self, to: str | None = None) -> Schema:
         # If a table isn't provided, we cannot use this method
         if self.tbl is None:
             raise ValueError(
@@ -755,8 +769,15 @@ class Schema:
                 new_schema = copy.deepcopy(Schema(tbl=(self.tbl.to_pandas())))
                 return new_schema
+        raise ValueError(
+            f"Cannot coerce schema from '{self.tbl_type}' to '{to}'. "
+            "Supported conversions: pandas->polars, polars->pandas."
+        )
     def __str__(self):
         formatted_columns = []
+        if self.columns is None:
+            return "Pointblank Schema (empty)"
         for col in self.columns:
             if len(col) == 1:  # Only column name provided (no data type)
                 formatted_columns.append(f"  {col[0]}: <ANY>")
@@ -770,8 +791,15 @@ class Schema:
 def _process_columns(
-    *, columns: str | list[str] | list[tuple[str, str]] | dict[str, str] | None = None, **kwargs
-) -> list[tuple[str, str]]:
+    *,
+    columns: str
+    | list[str]
+    | list[tuple[str, str]]
+    | list[tuple[str]]
+    | dict[str, str]
+    | None = None,
+    **kwargs,
+) -> list[tuple[str, ...]]:
     """
     Process column information provided as individual arguments or as a list of
     tuples/dictionary.
@@ -785,15 +813,18 @@ def _process_columns(
     Returns
     -------
-    list[tuple[str, str]]
-        A list of tuples containing column information.
+    list[tuple[str, ...]]
+        A list of tuples containing column information (name only or name and dtype).
     """
     if columns is not None:
         if isinstance(columns, list):
             if all(isinstance(col, str) for col in columns):
-                return [(col,) for col in columns]
+                # Type narrowing: after the all() check, columns contains only strings
+                str_columns: list[str] = columns  # type: ignore[assignment]
+                return [(col,) for col in str_columns]
             else:
-                return columns
+                # Type narrowing: columns contains tuples
+                return columns  # type: ignore[return-value]
         if isinstance(columns, str):
             return [(columns,)]
@@ -810,11 +841,11 @@ def _schema_info_generate_colname_dict(
     index_matched: bool,
     matched_to: str | None,
     dtype_present: bool,
-    dtype_input: str | list[str],
+    dtype_input: str | list[str] | None,
     dtype_matched: bool,
     dtype_multiple: bool,
-    dtype_matched_pos: int,
-) -> dict[str, any]:
+    dtype_matched_pos: int | None,
+) -> dict[str, Any]:
     return {
         "colname_matched": colname_matched,
         "index_matched": index_matched,
@@ -829,8 +860,8 @@ def _schema_info_generate_colname_dict(
 def _schema_info_generate_columns_dict(
     colnames: list[str] | None,
-    colname_dict: list[dict[str, any]] | None,
-) -> dict[str, dict[str, any]]:
+    colname_dict: list[dict[str, Any]] | None,
+) -> dict[str, dict[str, Any]]:
     """
     Generate the columns dictionary for the schema information dictionary.
@@ -847,6 +878,7 @@ def _schema_info_generate_columns_dict(
     dict[str, dict[str, any]]
         The columns dictionary.
     """
+    assert colnames is not None and colname_dict is not None
     return {colnames[i]: colname_dict[i] for i in range(len(colnames))}
@@ -856,7 +888,7 @@ def _schema_info_generate_params_dict(
     case_sensitive_colnames: bool,
     case_sensitive_dtypes: bool,
     full_match_dtypes: bool,
-) -> dict[str, any]:
+) -> dict[str, Any]:
     """
     Generate the parameters dictionary for the schema information dictionary.
@@ -889,7 +921,7 @@ def _schema_info_generate_params_dict(
 def _get_schema_validation_info(
-    data_tbl: any,
+    data_tbl: Any,
     schema: Schema,
     passed: bool,
     complete: bool,
@@ -897,7 +929,7 @@ def _get_schema_validation_info(
     case_sensitive_colnames: bool,
     case_sensitive_dtypes: bool,
     full_match_dtypes: bool,
-) -> dict[str, any]:
+) -> dict[str, Any]:
     """
     Get the schema validation information dictionary.
@@ -949,6 +981,10 @@ def _get_schema_validation_info(
     schema_exp = schema
     schema_tgt = Schema(tbl=data_tbl)
+    # Both schemas must have columns for validation
+    assert schema_exp.columns is not None, "Expected schema must have columns"
+    assert schema_tgt.columns is not None, "Target schema must have columns"
     # Initialize the schema information dictionary
     schema_info = {
         "passed": passed,
@@ -1122,6 +1158,11 @@ def _get_schema_validation_info(
         #
         if colname_matched and dtype_present:
+            # Type narrowing: matched_to is not None when colname_matched is True
+            # and dtype_input is not None when dtype_present is True
+            assert matched_to is not None
+            assert dtype_input is not None
             # Get the dtype of the column in the target table
             dtype_tgt = schema_tgt.columns[tgt_colnames.index(matched_to)][1]

pointblank/thresholds.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
-from typing import Callable
+from typing import Callable, cast
 __all__ = ["Thresholds", "Actions", "FinalActions"]
@@ -180,14 +180,15 @@ class Thresholds:
         # The threshold value might be an absolute count, but we need to convert
         # it to a fractional value
         if isinstance(threshold_value, int):
-            threshold_value = _convert_abs_count_to_fraction(
-                value=threshold_value, test_units=test_units
-            )
+            converted = _convert_abs_count_to_fraction(value=threshold_value, test_units=test_units)
+            if converted is None:
+                return None
+            threshold_value = converted
         return fraction_failing >= threshold_value
-def _convert_abs_count_to_fraction(value: int | None, test_units: int) -> float:
+def _convert_abs_count_to_fraction(value: int | None, test_units: int) -> float | None:
     # Using a integer value signifying the total number of 'test units' (in the
     # context of a validation), we convert an integer count (absolute) threshold
     # value to a fractional threshold value
@@ -251,12 +252,12 @@ def _normalize_thresholds_creation(
         # any of these keys
         # Check keys for invalid entries and raise a ValueError if any are found
-        invalid_keys = set(thresholds.keys()) - {"warning", "error", "critical"}
+        invalid_keys: set = set(thresholds.keys()) - {"warning", "error", "critical"}
         if invalid_keys:
             raise ValueError(f"Invalid keys in the thresholds dictionary: {invalid_keys}")
-        thresholds = Thresholds(**thresholds)
+        thresholds = Thresholds(**cast(dict[str, int | float | None], thresholds))
     elif isinstance(thresholds, Thresholds):
         pass
@@ -483,12 +484,12 @@ class Actions:
     def _ensure_list(
         self, value: str | Callable | list[str | Callable] | None
-    ) -> list[str | Callable]:
+    ) -> list[str | Callable] | None:
         if value is None:
             return None
-        if not isinstance(value, list):
-            return [value]
-        return value
+        if isinstance(value, list):
+            return cast(list[str | Callable], value)
+        return [value]
     def __repr__(self) -> str:
         return f"Actions(warning={self.warning}, error={self.error}, critical={self.critical})"
@@ -627,13 +628,14 @@ class FinalActions:
     def __repr__(self) -> str:
         if isinstance(self.actions, list):
             action_reprs = ", ".join(
-                f"'{a}'" if isinstance(a, str) else a.__name__ for a in self.actions
+                f"'{a}'" if isinstance(a, str) else getattr(a, "__name__", repr(a))
+                for a in self.actions
             )
             return f"FinalActions([{action_reprs}])"
         elif isinstance(self.actions, str):
             return f"FinalActions('{self.actions}')"
         elif callable(self.actions):
-            return f"FinalActions({self.actions.__name__})"
+            return f"FinalActions({getattr(self.actions, '__name__', repr(self.actions))})"
         else:
             return f"FinalActions({self.actions})"  # pragma: no cover

pointblank 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

pointblank 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl