pointblank 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/__init__.py CHANGED
@@ -30,8 +30,10 @@ from pointblank.thresholds import Actions, FinalActions, Thresholds
 from pointblank.validate import (
     Validate,
     config,
+    connect_to_table,
     get_action_metadata,
     get_column_count,
+    get_data_path,
     get_row_count,
     get_validation_summary,
     load_dataset,
@@ -60,7 +62,9 @@ __all__ = [
     "first_n",
     "last_n",
     "load_dataset",
+    "get_data_path",
     "config",
+    "connect_to_table",
     "preview",
     "missing_vals_tbl",
     "get_action_metadata",
pointblank/_constants.py CHANGED
@@ -105,10 +105,12 @@ ROW_BASED_VALIDATION_TYPES = [
     "col_vals_regex",
     "col_vals_null",
     "col_vals_not_null",
+    "col_vals_expr",
     "conjointly",
 ]
 
 IBIS_BACKENDS = [
+    "bigquery",
     "databricks",
     "duckdb",
     "memtable",
@@ -158,6 +160,9 @@ MODEL_PROVIDERS = [
 TABLE_TYPE_STYLES = {
     "pandas": {"background": "#150458", "text": "#FFFFFF", "label": "Pandas"},
     "polars": {"background": "#0075FF", "text": "#FFFFFF", "label": "Polars"},
+    "polars-lazy": {"background": "#0075FF", "text": "#FFFFFF", "label": "Polars (LazyFrame)"},
+    "narwhals": {"background": "#78BEAF", "text": "#222222", "label": "Narwhals"},
+    "narwhals-lazy": {"background": "#78BEAF", "text": "#222222", "label": "Narwhals (LazyFrame)"},
     "duckdb": {"background": "#000000", "text": "#FFFFFF", "label": "DuckDB"},
     "mysql": {"background": "#EBAD40", "text": "#222222", "label": "MySQL"},
     "postgres": {"background": "#3E638B", "text": "#FFFFFF", "label": "PostgreSQL"},
@@ -165,6 +170,7 @@ TABLE_TYPE_STYLES = {
     "parquet": {"background": "#3F9FF9", "text": "#FFFFFF", "label": "Parquet"},
     "memtable": {"background": "#2C3E50", "text": "#FFFFFF", "label": "Ibis memtable"},
     "mssql": {"background": "#E2E2E2", "text": "#222222", "label": "MSSQL"},
+    "bigquery": {"background": "#4285F4", "text": "#FFFFFF", "label": "BigQuery"},
     "pyspark": {"background": "#E66F21", "text": "#FFFFFF", "label": "Spark DataFrame"},
     "databricks": {"background": "#FF3621", "text": "#FFFFFF", "label": "Databricks"},
 }
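
The new `TABLE_TYPE_STYLES` entries give Polars LazyFrames, Narwhals tables, and BigQuery their own header badges. A small illustrative lookup over this mapping (not pointblank's actual rendering code, which lives in `_utils_html.py`; the fallback style is invented for the example):

```python
from pointblank._constants import TABLE_TYPE_STYLES

def badge_style(tbl_type: str) -> dict[str, str]:
    # Fall back to a neutral badge for table types without a dedicated style.
    return TABLE_TYPE_STYLES.get(
        tbl_type, {"background": "#BFBFBF", "text": "#222222", "label": tbl_type}
    )

badge_style("narwhals-lazy")["label"]  # "Narwhals (LazyFrame)"
badge_style("bigquery")["label"]       # "BigQuery"
```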
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+from math import floor, log10
+from typing import TYPE_CHECKING
+
+from great_tables.vals import fmt_integer, fmt_number, fmt_scientific
+
+if TYPE_CHECKING:
+    pass
+
+
+def _round_to_sig_figs(value: float, sig_figs: int) -> float:
+    if value == 0:
+        return 0
+    return round(value, sig_figs - int(floor(log10(abs(value)))) - 1)
+
+
+def _compact_integer_fmt(value: float | int) -> str:
+    if value == 0:
+        formatted = "0"
+    elif abs(value) >= 1 and abs(value) < 10_000:
+        formatted = fmt_integer(value, use_seps=False)[0]
+    else:
+        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
+
+    return formatted
+
+
+def _compact_decimal_fmt(value: float | int) -> str:
+    if value == 0:
+        formatted = "0.00"
+    elif abs(value) < 1 and abs(value) >= 0.01:
+        formatted = fmt_number(value, decimals=2)[0]
+    elif abs(value) < 0.01:
+        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
+    elif abs(value) >= 1 and abs(value) < 1000:
+        formatted = fmt_number(value, n_sigfig=3)[0]
+    elif abs(value) >= 1000 and abs(value) < 10_000:
+        formatted = fmt_number(value, decimals=0, use_seps=False)[0]
+    else:
+        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
+
+    return formatted
+
+
+def _compact_0_1_fmt(value: float | int | None) -> str | None:
+    if value is None:
+        return value
+
+    if value == 0:
+        return " 0.00"
+
+    if value == 1:
+        return " 1.00"
+
+    if abs(value) < 1 and abs(value) >= 0.01:
+        return " " + fmt_number(value, decimals=2)[0]
+
+    if abs(value) < 0.01:
+        return "<0.01"
+
+    if abs(value) > 0.99:
+        return ">0.99"
+
+    return fmt_number(value, n_sigfig=3)[0]
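
This hunk adds a new helper module of compact number formatters built on `great_tables.vals` (the file's name isn't shown in this diff). The significant-figure rounding is plain arithmetic and can be checked in isolation; a standalone sketch of the same logic:

```python
from math import floor, log10

def round_to_sig_figs(value: float, sig_figs: int) -> float:
    # Same logic as _round_to_sig_figs above, reproduced here for illustration.
    if value == 0:
        return 0
    return round(value, sig_figs - int(floor(log10(abs(value)))) - 1)

round_to_sig_figs(0.012345, 3)  # 0.0123
round_to_sig_figs(98765, 2)     # 99000
```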
pointblank/_utils.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import inspect
 import re
+from collections import defaultdict
 from typing import TYPE_CHECKING, Any
 
 import narwhals as nw
@@ -12,9 +13,28 @@ from narwhals.typing import FrameT
 from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES
 
 if TYPE_CHECKING:
+    from collections.abc import Mapping
+
     from pointblank._typing import AbsoluteBounds, Tolerance
 
 
+def transpose_dicts(list_of_dicts: list[dict[str, Any]]) -> dict[str, list[Any]]:
+    if not list_of_dicts:
+        return {}
+
+    # Get all unique keys across all dictionaries
+    all_keys = set()
+    for d in list_of_dicts:
+        all_keys.update(d.keys())
+
+    result = defaultdict(list)
+    for d in list_of_dicts:
+        for key in all_keys:
+            result[key].append(d.get(key))  # None is default for missing keys
+
+    return dict(result)
+
+
 def _derive_single_bound(ref: int, tol: int | float) -> int:
     """Derive a single bound using the reference."""
     if not isinstance(tol, float | int):
@@ -88,6 +108,29 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     return "unknown"  # pragma: no cover
 
 
+def _is_narwhals_table(data: any) -> bool:
+    # Check if the data is a Narwhals DataFrame
+    type_str = str(type(data)).lower()
+
+    if "narwhals" in type_str:
+        # If the object is not a Narwhals DataFrame, return False
+        return True
+
+    return False
+
+
+def _is_lazy_frame(data: any) -> bool:
+    # Check if the data is a Polars or Narwhals DataFrame
+    type_str = str(type(data)).lower()
+
+    if "polars" not in type_str and "narwhals" not in type_str:
+        # If the object is neither a Polars nor a Narwhals DataFrame, return False
+        return False
+
+    # Check if the data is a lazy frame
+    return "lazy" in type_str
+
+
 def _is_lib_present(lib_name: str) -> bool:
     import importlib
 
@@ -186,6 +229,77 @@ def _check_column_exists(dfn: nw.DataFrame, column: str) -> None:
         raise ValueError(f"Column '{column}' not found in DataFrame.")
 
 
+def _count_true_values_in_column(
+    tbl: FrameT,
+    column: str,
+    inverse: bool = False,
+) -> int:
+    """
+    Count the number of `True` values in a specified column of a table.
+
+    Parameters
+    ----------
+    tbl
+        A Narwhals-compatible DataFrame or table-like object.
+    column
+        The column in which to count the `True` values.
+    inverse
+        If `True`, count the number of `False` values instead.
+
+    Returns
+    -------
+    int
+        The count of `True` (or `False`) values in the specified column.
+    """
+
+    # Convert the DataFrame to a Narwhals DataFrame (no detrimental effect if
+    # already a Narwhals DataFrame)
+    tbl_nw = nw.from_native(tbl)
+
+    # Filter the table based on the column and whether we want to count True or False values
+    tbl_filtered = tbl_nw.filter(nw.col(column) if not inverse else ~nw.col(column))
+
+    # Always collect table if it is a LazyFrame; this is required to get the row count
+    if _is_lazy_frame(tbl_filtered):
+        tbl_filtered = tbl_filtered.collect()
+
+    return len(tbl_filtered)
+
+
+def _count_null_values_in_column(
+    tbl: FrameT,
+    column: str,
+) -> int:
+    """
+    Count the number of Null values in a specified column of a table.
+
+    Parameters
+    ----------
+    tbl
+        A Narwhals-compatible DataFrame or table-like object.
+    column
+        The column in which to count the Null values.
+
+    Returns
+    -------
+    int
+        The count of Null values in the specified column.
+    """
+
+    # Convert the DataFrame to a Narwhals DataFrame (no detrimental effect if
+    # already a Narwhals DataFrame)
+    tbl_nw = nw.from_native(tbl)
+
+    # Filter the table to get rows where the specified column is Null
+    tbl_filtered = tbl_nw.filter(nw.col(column).is_null())
+
+    # Always collect table if it is a LazyFrame; this is required to get the row count
+    if _is_lazy_frame(tbl_filtered):
+        tbl_filtered = tbl_filtered.collect()
+
+    return len(tbl_filtered)
+
+
 def _is_numeric_dtype(dtype: str) -> bool:
     """
     Check if a given data type string represents a numeric type.
@@ -514,6 +628,8 @@ def _get_api_text() -> str:
         "Validate.get_data_extracts",
         "Validate.all_passed",
         "Validate.assert_passing",
+        "Validate.assert_below_threshold",
+        "Validate.above_threshold",
         "Validate.n",
         "Validate.n_passed",
         "Validate.n_failed",
@@ -531,6 +647,7 @@ def _get_api_text() -> str:
         "missing_vals_tbl",
         "assistant",
         "load_dataset",
+        "get_data_path",
     ]
 
     utility_exported = [
@@ -782,3 +899,14 @@ def _format_to_float_value(
     formatted_vals = _get_column_of_values(gt, column_name="x", context="html")
 
     return formatted_vals[0]
+
+
+def _pivot_to_dict(col_dict: Mapping[str, Any]):  # TODO : Type hint and unit test
+    result_dict = {}
+    for col, sub_dict in col_dict.items():
+        for key, value in sub_dict.items():
+            # add columns fields not present
+            if key not in result_dict:
+                result_dict[key] = [None] * len(col_dict)
+            result_dict[key][list(col_dict.keys()).index(col)] = value
+    return result_dict
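
`transpose_dicts()` and `_pivot_to_dict()` both reshape nested dictionaries into column-major form, while the new `_count_*` helpers collect lazy frames before taking `len()`. A quick sketch of the reshaping helpers (key order in `transpose_dicts()` comes from a `set`, so it isn't guaranteed):

```python
from pointblank._utils import _pivot_to_dict, transpose_dicts

rows = [{"a": 1, "b": 2}, {"a": 3}]
transpose_dicts(rows)
# {"a": [1, 3], "b": [2, None]}  -- missing keys are filled with None

col_stats = {"x": {"mean": 1.0, "max": 3.0}, "y": {"mean": 2.0}}
_pivot_to_dict(col_stats)
# {"mean": [1.0, 2.0], "max": [3.0, None]}
```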
pointblank/_utils_html.py CHANGED
@@ -1,9 +1,49 @@
 from __future__ import annotations
 
+from typing import Any
+
+from great_tables import html
+
 from pointblank._constants import TABLE_TYPE_STYLES
 from pointblank._utils import _format_to_integer_value
 
 
+def _fmt_frac(vec) -> list[str | None]:
+    res: list[str | None] = []
+    for x in vec:
+        if x is None:
+            res.append(x)
+            continue
+
+        if x == 0:
+            res.append("0")
+            continue
+
+        if x < 0.01:
+            res.append("<.01")
+            continue
+
+        try:
+            intx: int = int(x)
+        except ValueError:  # generic object, ie. NaN
+            res.append(str(x))
+            continue
+
+        if intx == x:  # can remove trailing 0s w/o loss
+            res.append(str(intx))
+            continue
+
+        res.append(str(round(x, 2)))
+
+    return res
+
+
+def _make_sublabel(major: str, minor: str) -> Any:
+    return html(
+        f'{major!s}<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">{minor!s}</span>'
+    )
+
+
 def _create_table_type_html(
     tbl_type: str | None, tbl_name: str | None, font_size: str = "10px"
 ) -> str:
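
`_fmt_frac()` renders small fractions compactly (values below 0.01 become `"<.01"`, whole-number floats drop their trailing `.0`), and `_make_sublabel()` wraps a subscript in a `<span>` via `great_tables.html()`. For illustration, the fraction formatter should behave as follows:

```python
from pointblank._utils_html import _fmt_frac

_fmt_frac([None, 0, 0.004, 2.0, 0.256, float("nan")])
# [None, "0", "<.01", "2", "0.26", "nan"]
```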
pointblank/actions.py CHANGED
@@ -216,7 +216,7 @@ def send_slack_notification(
             thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
             actions=pb.Actions(critical=notify_slack),
         )
-        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
+        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
         .col_vals_gt(columns="item_revenue", value=0.05)
         .col_vals_gt(columns="session_duration", value=15)
         .interrogate()
@@ -248,7 +248,7 @@ def send_slack_notification(
             thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
             final_actions=pb.FinalActions(notify_slack),
         )
-        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
+        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
         .col_vals_gt(columns="item_revenue", value=0.05)
         .col_vals_gt(columns="session_duration", value=15)
         .interrogate()
@@ -316,7 +316,7 @@ def send_slack_notification(
             actions=pb.Actions(default=notify_slack),
             final_actions=pb.FinalActions(notify_slack),
         )
-        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
+        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
         .col_vals_gt(columns="item_revenue", value=0.05)
         .col_vals_gt(columns="session_duration", value=15)
         .interrogate()
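
The only change here swaps `\d` for the explicit `[0-9]` class in the docstring examples. For the IDs shown, the two patterns are interchangeable; a quick check with Python's `re`:

```python
import re

player_id = "ABCDEFGHIJKL123"
assert re.fullmatch(r"[A-Z]{12}\d{3}", player_id)
assert re.fullmatch(r"[A-Z]{12}[0-9]{3}", player_id)
```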
pointblank/assistant.py CHANGED
@@ -176,9 +176,7 @@ def assistant(
     if data is not None:
         scan = DataScan(data=data)
 
-        scan_dict = scan.to_dict()
-
-        tbl_type = scan_dict["tbl_type"]
+        tbl_type: str = scan.profile.implementation.name.lower()
         tbl_json = scan.to_json()
 
         if tbl_name is not None:
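
The table type is now read from the scan's profile instead of `DataScan.to_dict()`. The lower-cased name follows Narwhals' `Implementation` enum; a sketch of the equivalent lookup done directly on a native frame (assuming the profile stores that enum):

```python
import narwhals as nw
import polars as pl

df = pl.DataFrame({"x": [1, 2, 3]})
impl = nw.from_native(df).implementation  # e.g. Implementation.POLARS
impl.name.lower()                         # "polars"
```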
pointblank/column.py CHANGED
@@ -1007,7 +1007,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
     `[rev_01, rev_02, profit_01, profit_02, age]`
 
     and you want to validate columns that have two digits at the end of the name, you can use
-    `columns=matches(r"\d{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
+    `columns=matches(r"[0-9]{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
     `profit_02` columns.
 
     There will be a validation step created for every resolved column. Note that if there aren't any
@@ -1061,7 +1061,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
     [`col()`](`pointblank.col`) function, like this:
 
     ```python
-    col(matches(r"^\d{5}") & ends_with("_id"))
+    col(matches(r"^[0-9]{5}") & ends_with("_id"))
     ```
 
     There are four operators that can be used to compose column selectors:
@@ -1107,7 +1107,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
 
     validation = (
         pb.Validate(data=tbl)
-        .col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID\d{4}")
+        .col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID[0-9]{4}")
         .interrogate()
     )
 
@@ -1115,7 +1115,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
     ```
 
     From the results of the validation table we get two validation steps, one for `id_old` and one
-    for `new_identifier`. The values in both columns all match the pattern `"ID\d{4}"`.
+    for `new_identifier`. The values in both columns all match the pattern `"ID[0-9]{4}"`.
 
     We can also use the `matches()` function in combination with other column selectors (within
     [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select
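
As in `actions.py`, the docstrings now spell out `[0-9]` instead of `\d`. One practical difference worth noting (whether it motivated the change isn't stated in the diff): in Python's `re`, `\d` also matches non-ASCII decimal digits, while `[0-9]` does not:

```python
import re

arabic_three = "\u0663"  # ARABIC-INDIC DIGIT THREE
bool(re.fullmatch(r"\d", arabic_three))     # True
bool(re.fullmatch(r"[0-9]", arabic_three))  # False
```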
pointblank/compare.py ADDED
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pointblank import DataScan
+
+if TYPE_CHECKING:
+    from narwhals.typing import IntoFrame
+
+
+class Compare:
+    def __init__(self, a: IntoFrame, b: IntoFrame) -> None:
+        self.a: IntoFrame = a
+        self.b: IntoFrame = b
+
+    def compare(self) -> None:
+        ## Scan both frames
+        self._scana = DataScan(self.a)
+        self._scanb = DataScan(self.b)
+
+        ## Get summary outs
+        summarya = self._scana.summary_data
+        summaryb = self._scana.summary_data
+
+        summarya.columns
+
+        self._scana.profile
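
`Compare` is an early work-in-progress: `compare()` scans both inputs with `DataScan` but doesn't yet return a result (and, as written, reads `summary_data` from the first scan twice). A usage sketch against the class as added, reaching into its private scan attributes for illustration:

```python
import polars as pl
from pointblank.compare import Compare

a = pl.DataFrame({"x": [1, 2, 3]})
b = pl.DataFrame({"x": [1, 2, 4]})

cmp = Compare(a, b)
cmp.compare()                        # runs DataScan over both frames; returns None for now
summary_a = cmp._scana.summary_data  # per-column summary of `a`
```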