pointblank 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +207 -6
- pointblank/_constants_translations.py +1302 -0
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +216 -139
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +41 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2957 -50
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +2280 -410
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +15 -8
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/METADATA +7 -2
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/RECORD +30 -28
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/WHEEL +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/top_level.txt +0 -0
pointblank/_typing.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations

+import datetime
 import sys
+from collections.abc import Container
 from typing import List, Tuple, Union

 # Check Python version for TypeAlias support
@@ -15,6 +17,12 @@ if sys.version_info >= (3, 10):
     SegmentTuple: TypeAlias = Tuple[str, SegmentValue]
     SegmentItem: TypeAlias = Union[str, SegmentTuple]
     SegmentSpec: TypeAlias = Union[str, SegmentTuple, List[SegmentItem]]
+
+    _CompliantValue: TypeAlias = Union[str, int, float, datetime.datetime, datetime.date]
+    """A compliant value that pointblank can use in a validation step"""
+    _CompliantValues: TypeAlias = Container[_CompliantValue]
+    """A collection of compliant values that pointblank can use in a validation step"""
+
 else:
     # Python 3.8 and 3.9 compatible type aliases
     AbsoluteBounds = Tuple[int, int]
@@ -24,6 +32,10 @@ else:
     SegmentTuple = Tuple[str, SegmentValue]
     SegmentItem = Union[str, SegmentTuple]
     SegmentSpec = Union[str, SegmentTuple, List[SegmentItem]]
+    _CompliantValue = Union[str, int, float, datetime.datetime, datetime.date]
+    """A compliant value that pointblank can use in a validation step"""
+    _CompliantValues = Container[_CompliantValue]
+    """A collection of compliant values that pointblank can use in a validation step"""

 # Add docstrings for better IDE support
 # In Python 3.14+, __doc__ attribute on typing.Union objects became read-only
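The two private aliases added here describe the kinds of values a validation step can compare against (strings, numbers, dates, datetimes) and any container of them. A minimal sketch of how such aliases read in annotations — the `check_membership` helper below is hypothetical and not part of pointblank:

```python
import datetime
from collections.abc import Container
from typing import Union

# Same shape as the private aliases added in _typing.py
_CompliantValue = Union[str, int, float, datetime.datetime, datetime.date]
_CompliantValues = Container[_CompliantValue]


def check_membership(value: _CompliantValue, allowed: _CompliantValues) -> bool:
    # `Container` only guarantees `__contains__`, so sets, lists, and ranges all qualify
    return value in allowed


print(check_membership(3, {1, 2, 3}))                   # True
print(check_membership(datetime.date(2024, 1, 1), []))  # False
```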
pointblank/_utils.py
CHANGED
@@ -7,14 +7,17 @@ from typing import TYPE_CHECKING, Any

 import narwhals as nw
 from great_tables import GT
+from narwhals.dependencies import is_narwhals_dataframe, is_narwhals_lazyframe
 from great_tables.gt import _get_column_of_values
-from narwhals.typing import FrameT

 from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES, IBIS_BACKENDS
+from pointblank.column import Column, ColumnLiteral, ColumnSelector, ColumnSelectorNarwhals, col

 if TYPE_CHECKING:
     from collections.abc import Mapping

+    from narwhals.typing import IntoFrame, IntoFrameT
+
     from pointblank._typing import AbsoluteBounds, Tolerance


@@ -35,6 +38,7 @@ def transpose_dicts(list_of_dicts: list[dict[str, Any]]) -> dict[str, list[Any]]
     return dict(result)


+# TODO: doctest
 def _derive_single_bound(ref: int, tol: int | float) -> int:
     """Derive a single bound using the reference."""
     if not isinstance(tol, float | int):
@@ -44,16 +48,17 @@ def _derive_single_bound(ref: int, tol: int | float) -> int:
     return int(tol * ref) if tol < 1 else int(tol)


+# TODO: doctest
 def _derive_bounds(ref: int, tol: Tolerance) -> AbsoluteBounds:
     """Validate and extract the absolute bounds of the tolerance."""
     if isinstance(tol, tuple):
-        return
+        return (_derive_single_bound(ref, tol[0]), _derive_single_bound(ref, tol[1]))

     bound = _derive_single_bound(ref, tol)
     return bound, bound


-def _get_tbl_type(data: FrameT | Any) -> str:
+def _get_tbl_type(data: Any) -> str:
     type_str = str(type(data))

     ibis_tbl = "ibis.expr.types.relations.Table" in type_str
@@ -110,7 +115,7 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     return "unknown"  # pragma: no cover


-def _process_ibis_through_narwhals(data:
+def _process_ibis_through_narwhals(data: Any, tbl_type: str) -> tuple[Any, str]:
     """
     Process Ibis tables through Narwhals to unify the processing pathway.

@@ -120,14 +125,14 @@ def _process_ibis_through_narwhals(data: FrameT | Any, tbl_type: str) -> tuple[F

     Parameters
     ----------
-    data
+    data
         The data table, potentially an Ibis table
-    tbl_type
+    tbl_type
         The detected table type

     Returns
     -------
-    tuple[
+    tuple[Any, str]
         A tuple of (processed_data, updated_tbl_type) where:
         - processed_data is the Narwhals-wrapped table if it was Ibis, otherwise original data
         - updated_tbl_type is "narwhals" if it was Ibis, otherwise original tbl_type
@@ -145,7 +150,7 @@ def _process_ibis_through_narwhals(data: FrameT | Any, tbl_type: str) -> tuple[F
     return data, tbl_type


-def _is_narwhals_table(data: any) -> bool:
+def _is_narwhals_table(data: Any) -> bool:
     # Check if the data is a Narwhals DataFrame
     type_str = str(type(data)).lower()

@@ -156,7 +161,7 @@ def _is_narwhals_table(data: any) -> bool:
     return False


-def _is_lazy_frame(data:
+def _is_lazy_frame(data: Any) -> bool:
     # Check if the data is a Polars or Narwhals DataFrame
     type_str = str(type(data)).lower()

@@ -180,15 +185,17 @@ def _is_lib_present(lib_name: str) -> bool:

 def _check_any_df_lib(method_used: str) -> None:
     # Determine whether Pandas or Polars is available
+    pd = None
     try:
         import pandas as pd
     except ImportError:
-
+        pass

+    pl = None
     try:
         import polars as pl
     except ImportError:
-
+        pass

     # If neither Pandas nor Polars is available, raise an ImportError
     if pd is None and pl is None:
@@ -211,16 +218,18 @@ def _is_value_a_df(value: Any) -> bool:

 def _select_df_lib(preference: str = "polars") -> Any:
     # Determine whether Pandas is available
+    pd = None
     try:
         import pandas as pd
     except ImportError:
-
+        pass

-    # Determine whether
+    # Determine whether Polars is available
+    pl = None
     try:
         import polars as pl
     except ImportError:
-
+        pass

     # TODO: replace this with the `_check_any_df_lib()` function, introduce `method_used=` param
     # If neither Pandas nor Polars is available, raise an ImportError
@@ -240,7 +249,8 @@ def _select_df_lib(preference: str = "polars") -> Any:
     return pl if pl is not None else pd


-def _copy_dataframe(df):
+# TODO: Good argument exceptions should be handled by caller
+def _copy_dataframe(df: IntoFrameT) -> IntoFrameT:
     """
     Create a copy of a DataFrame, handling different DataFrame types.

@@ -280,19 +290,22 @@ def _copy_dataframe(df):
     return df  # pragma: no cover


-
+# TODO: Should straight up remove this
+def _convert_to_narwhals(df: IntoFrame) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
     # Convert the DataFrame to a format that narwhals can work with
-
+    result = nw.from_native(df)
+    assert is_narwhals_dataframe(result) or is_narwhals_lazyframe(result)
+    return result


-def _check_column_exists(dfn: nw.DataFrame, column: str) -> None:
+def _check_column_exists(dfn: nw.DataFrame[Any] | nw.LazyFrame[Any], column: str) -> None:
     """
     Check if a column exists in a DataFrame.

     Parameters
     ----------
     dfn
-        A Narwhals DataFrame.
+        A Narwhals DataFrame or LazyFrame.
     column
         The column to check for existence.

@@ -307,7 +320,7 @@ def _check_column_exists(dfn: nw.DataFrame, column: str) -> None:


 def _count_true_values_in_column(
-    tbl:
+    tbl: IntoFrame,
     column: str,
     inverse: bool = False,
 ) -> int:
@@ -337,14 +350,14 @@ def _count_true_values_in_column(
     tbl_filtered = tbl_nw.filter(nw.col(column) if not inverse else ~nw.col(column))

     # Always collect table if it is a LazyFrame; this is required to get the row count
-    if
+    if is_narwhals_lazyframe(tbl_filtered):
         tbl_filtered = tbl_filtered.collect()

     return len(tbl_filtered)


 def _count_null_values_in_column(
-    tbl:
+    tbl: IntoFrame,
     column: str,
 ) -> int:
     """
@@ -371,7 +384,7 @@ def _count_null_values_in_column(
     tbl_filtered = tbl_nw.filter(nw.col(column).is_null())

     # Always collect table if it is a LazyFrame; this is required to get the row count
-    if
+    if is_narwhals_lazyframe(tbl_filtered):
         tbl_filtered = tbl_filtered.collect()

     return len(tbl_filtered)
@@ -435,8 +448,11 @@ def _is_duration_dtype(dtype: str) -> bool:


 def _get_column_dtype(
-    dfn: nw.DataFrame
-
+    dfn: nw.DataFrame[Any] | nw.LazyFrame[Any],
+    column: str,
+    raw: bool = False,
+    lowercased: bool = True,
+) -> str | nw.dtypes.DType | None:
     """
     Get the data type of a column in a DataFrame.

@@ -447,14 +463,14 @@ def _get_column_dtype(
     column
         The column from which to get the data type.
     raw
-        If `True`, return the raw
+        If `True`, return the raw DType object (or None if column not found).
     lowercased
         If `True`, return the data type string in lowercase.

     Returns
     -------
-    str
-        The data type of the column
+    str | nw.dtypes.DType | None
+        The data type of the column as a string, or the raw DType object if `raw=True`.
     """

     if raw:  # pragma: no cover
@@ -468,7 +484,9 @@ def _get_column_dtype(
     return column_dtype_str


-def _check_column_type(
+def _check_column_type(
+    dfn: nw.DataFrame[Any] | nw.LazyFrame[Any], column: str, allowed_types: list[str]
+) -> None:
     """
     Check if a column is of a certain data type.

@@ -520,8 +538,8 @@ def _check_column_type(dfn: nw.DataFrame, column: str, allowed_types: list[str])


 def _column_test_prep(
-    df:
-) -> nw.DataFrame:
+    df: IntoFrame, column: str, allowed_types: list[str] | None, check_exists: bool = True
+) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
     # Convert the DataFrame to a format that narwhals can work with.
     dfn = _convert_to_narwhals(df=df)

@@ -537,8 +555,8 @@ def _column_test_prep(


 def _column_subset_test_prep(
-    df:
-) -> nw.DataFrame:
+    df: IntoFrame, columns_subset: list[str] | None, check_exists: bool = True
+) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
     # Convert the DataFrame to a format that narwhals can work with.
     dfn = _convert_to_narwhals(df=df)

@@ -550,21 +568,40 @@ def _column_subset_test_prep(
     return dfn


-
-
-    fn_name = inspect.currentframe().f_back.f_code.co_name
+_PBUnresolvedColumn = str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals
+_PBResolvedColumn = Column | ColumnLiteral | ColumnSelectorNarwhals | list[Column] | list[str]

-    return fn_name

+def _resolve_columns(columns: _PBUnresolvedColumn) -> _PBResolvedColumn:
+    # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+    # resolve the columns
+    if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+        columns = col(columns)

-
+    # If `columns` is Column value or a string, place it in a list for iteration
+    if isinstance(columns, (Column, str)):
+        columns = [columns]
+
+    return columns
+
+
+def _get_fn_name() -> str | None:
     # Get the current function name
-
+    frame = inspect.currentframe()
+    if frame is None or frame.f_back is None:
+        return None
+    return frame.f_back.f_code.co_name

-    # Use the `ASSERTION_TYPE_METHOD_MAP` dictionary to get the assertion type
-    assertion = ASSERTION_TYPE_METHOD_MAP.get(func_name)

-
+def _get_assertion_from_fname() -> str | None:
+    # Get the current function name
+    frame = inspect.currentframe()
+    if frame is None or frame.f_back is None:
+        return None
+    func_name = frame.f_back.f_code.co_name
+
+    # Use the `ASSERTION_TYPE_METHOD_MAP` dictionary to get the assertion type
+    return ASSERTION_TYPE_METHOD_MAP.get(func_name)


 def _check_invalid_fields(fields: list[str], valid_fields: list[str]):
@@ -660,10 +697,10 @@ def _format_to_float_value(

 def _pivot_to_dict(col_dict: Mapping[str, Any]):  # TODO : Type hint and unit test
     result_dict = {}
-    for
+    for _col, sub_dict in col_dict.items():
         for key, value in sub_dict.items():
             # add columns fields not present
             if key not in result_dict:
                 result_dict[key] = [None] * len(col_dict)
-            result_dict[key][list(col_dict.keys()).index(
+            result_dict[key][list(col_dict.keys()).index(_col)] = value
     return result_dict
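The notable behavioral fix in this file is in `_derive_bounds()`: the tuple branch previously returned early with nothing, and now derives the lower and upper bounds independently. A standalone sketch of the behavior visible in the hunk above (the input-validation branches that the diff does not show are elided here):

```python
def derive_single_bound(ref: int, tol: int | float) -> int:
    # Fractional tolerances scale the reference; absolute tolerances pass through
    return int(tol * ref) if tol < 1 else int(tol)


def derive_bounds(ref: int, tol: int | float | tuple[int | float, int | float]) -> tuple[int, int]:
    # A (lower, upper) tuple now yields two independently derived bounds;
    # a scalar tolerance still yields a symmetric pair
    if isinstance(tol, tuple):
        return (derive_single_bound(ref, tol[0]), derive_single_bound(ref, tol[1]))
    bound = derive_single_bound(ref, tol)
    return bound, bound


print(derive_bounds(100, (0.1, 5)))  # (10, 5)
print(derive_bounds(100, 0.05))      # (5, 5)
```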
pointblank/_utils_ai.py
CHANGED
@@ -7,7 +7,6 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple

 import narwhals as nw
-from narwhals.typing import FrameT

 from pointblank._constants import MODEL_PROVIDERS

@@ -111,7 +110,7 @@ EXAMPLE OUTPUT FORMAT:
     if provider == "anthropic":  # pragma: no cover
         # Check that the anthropic package is installed
         try:
-            import anthropic  # noqa
+            import anthropic  # noqa # type: ignore[import-not-found]
         except ImportError:
             raise ImportError(
                 "The `anthropic` package is required to use AI validation with "
@@ -205,7 +204,7 @@ class _DataBatcher:

     def __init__(
         self,
-        data:
+        data: Any,
         columns: Optional[List[str]] = None,
         config: Optional[_BatchConfig] = None,
     ):
@@ -265,13 +264,13 @@ class _DataBatcher:
         signature_str = json.dumps(signature_data, sort_keys=True, default=str)
         return hashlib.md5(signature_str.encode()).hexdigest()

-    def _build_unique_rows_table(self) -> Tuple[
+    def _build_unique_rows_table(self) -> Tuple[Any, Dict[str, List[int]]]:
         """
         Build unique rows table and mapping back to original indices.

         Returns
         -------
-        Tuple[
+        Tuple[Any, Dict[str, List[int]]]
             Unique rows table and signature-to-indices mapping.
         """
         nw_data = self._nw_data
pointblank/_utils_check_args.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import Callable
+from typing import Any, Callable

 import narwhals as nw

@@ -28,7 +28,7 @@ def _check_boolean_input(param: bool, param_name: str):
         raise ValueError(f"`{param_name}=` must be a boolean value.")


-def _check_column(column: str | list[str]):
+def _check_column(column: str | list[str] | Column | ColumnSelector | nw.selectors.Selector):
     """
     Check the input value of the `column=` parameter.

@@ -59,7 +59,7 @@ def _check_column(column: str | list[str]):


 # TODO: allow for checking of dates/datetimes
-def _check_value_float_int(value: float | int |
+def _check_value_float_int(value: float | int | Any):
     """
     Check that input value of the `value=` parameter is a float or integer.

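The widened `_check_column()` signature reflects that a validation step's `columns=` argument is no longer limited to plain column names. A hedged usage sketch, assuming the selector helpers pointblank exports publicly (e.g. `starts_with()`) and the built-in `small_table` dataset:

```python
import pointblank as pb

# A selector now passes the argument check and is resolved against the table
validation = (
    pb.Validate(data=pb.load_dataset("small_table"))
    .col_vals_not_null(columns=pb.starts_with("d"))
    .interrogate()
)
```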
pointblank/_utils_llms_txt.py
CHANGED
@@ -46,6 +46,25 @@ def get_api_details(module, exported_list):
         # Get the docstring
         doc = obj.__doc__

+        # Fallback for dynamically generated aggregation methods that might not have
+        # their docstrings properly attached yet
+        if not doc and obj_name.startswith("col_") and "_" in obj_name:
+            # Check if this looks like a dynamically generated aggregation method
+            # (e.g., col_sum_gt, col_avg_eq, col_sd_le)
+            parts_name = obj_name.split("_")
+            if (
+                len(parts_name) == 3
+                and parts_name[1] in ["sum", "avg", "sd"]
+                and parts_name[2] in ["gt", "ge", "lt", "le", "eq"]
+            ):
+                try:
+                    from pointblank.validate import _generate_agg_docstring
+
+                    doc = _generate_agg_docstring(obj_name)
+                except Exception:
+                    # If we can't generate the docstring, just use what we have
+                    pass
+
         # Combine the class name, signature, and docstring
         api_text += f"{obj_name}{sig}\n{doc}\n\n"

@@ -101,9 +120,25 @@ def _get_api_text() -> str:
         "Validate.col_vals_regex",
         "Validate.col_vals_within_spec",
         "Validate.col_vals_expr",
+        "Validate.col_sum_gt",
+        "Validate.col_sum_lt",
+        "Validate.col_sum_ge",
+        "Validate.col_sum_le",
+        "Validate.col_sum_eq",
+        "Validate.col_avg_gt",
+        "Validate.col_avg_lt",
+        "Validate.col_avg_ge",
+        "Validate.col_avg_le",
+        "Validate.col_avg_eq",
+        "Validate.col_sd_gt",
+        "Validate.col_sd_lt",
+        "Validate.col_sd_ge",
+        "Validate.col_sd_le",
+        "Validate.col_sd_eq",
         "Validate.rows_distinct",
         "Validate.rows_complete",
         "Validate.col_exists",
+        "Validate.col_pct_null",
         "Validate.col_schema_match",
         "Validate.row_count_match",
         "Validate.col_count_match",
@@ -331,10 +366,14 @@ def _get_examples_text() -> str:
         example_text = "\n".join(example_text.split("\n")[8:])

         # Extract the title of the example (the line beginning with `###`)
-
+        title_match = re.search(r"### (.*)", example_text)
+        assert title_match is not None
+        title = title_match.group(1)

         # The next line with text is the short description of the example
-
+        desc_match = re.search(r"(.*)\.", example_text)
+        assert desc_match is not None
+        desc = desc_match.group(1)

         # Get all of the Python code blocks in the example
         # these can be identified as starting with ```python and ending with ```
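The docstring fallback above recognizes dynamically generated aggregation methods purely by name shape. A small re-statement of that rule — the `is_agg_validation_name()` helper is illustrative only, not part of the module:

```python
def is_agg_validation_name(name: str) -> bool:
    # Matches `col_<agg>_<op>` with agg in {sum, avg, sd} and op in {gt, ge, lt, le, eq}
    parts = name.split("_")
    return (
        len(parts) == 3
        and parts[0] == "col"
        and parts[1] in {"sum", "avg", "sd"}
        and parts[2] in {"gt", "ge", "lt", "le", "eq"}
    )


print(is_agg_validation_name("col_sum_gt"))   # True
print(is_agg_validation_name("col_vals_gt"))  # False ("vals" is not an aggregation)
```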
pointblank/actions.py
CHANGED
pointblank/assistant.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 from typing import Any

 from importlib_resources import files
-from narwhals.typing import FrameT

 from pointblank._constants import MODEL_PROVIDERS
 from pointblank.datascan import DataScan
@@ -15,7 +14,7 @@ __all__ = [

 def assistant(
     model: str,
-    data:
+    data: Any = None,
     tbl_name: str | None = None,
     api_key: str | None = None,
     display: str | None = None,
@@ -295,7 +294,7 @@ def assistant(
     if provider == "anthropic":  # pragma: no cover
         # Check that the anthropic package is installed
         try:
-            import anthropic  # noqa
+            import anthropic  # noqa # type: ignore[import-not-found]
         except ImportError:  # pragma: no cover
             raise ImportError(  # pragma: no cover
                 "The `anthropic` package is required to use the `DraftValidation` class with "
pointblank/cli.py
CHANGED
@@ -2411,7 +2411,7 @@ def requirements():


 def _rich_print_missing_table_enhanced(
-    gt_table: Any, original_data: Any = None, missing_info: dict = None
+    gt_table: Any, original_data: Any = None, missing_info: dict | None = None
 ) -> None:
     """Convert a missing values GT table to Rich table with enhanced formatting and metadata.
