pointblank 0.13.4__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +54 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +182 -11
  5. pointblank/_utils.py +3 -3
  6. pointblank/_utils_ai.py +850 -0
  7. pointblank/cli.py +128 -115
  8. pointblank/column.py +1 -1
  9. pointblank/data/api-docs.txt +198 -13
  10. pointblank/data/validations/README.md +108 -0
  11. pointblank/data/validations/complex_preprocessing.json +54 -0
  12. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  13. pointblank/data/validations/generate_test_files.py +127 -0
  14. pointblank/data/validations/multiple_steps.json +83 -0
  15. pointblank/data/validations/multiple_steps.pkl +0 -0
  16. pointblank/data/validations/narwhals_function.json +28 -0
  17. pointblank/data/validations/narwhals_function.pkl +0 -0
  18. pointblank/data/validations/no_preprocessing.json +83 -0
  19. pointblank/data/validations/no_preprocessing.pkl +0 -0
  20. pointblank/data/validations/pandas_compatible.json +28 -0
  21. pointblank/data/validations/pandas_compatible.pkl +0 -0
  22. pointblank/data/validations/preprocessing_functions.py +46 -0
  23. pointblank/data/validations/simple_preprocessing.json +57 -0
  24. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  25. pointblank/datascan.py +4 -4
  26. pointblank/scan_profile.py +6 -6
  27. pointblank/schema.py +8 -82
  28. pointblank/thresholds.py +1 -1
  29. pointblank/validate.py +1233 -12
  30. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
  31. pointblank-0.14.0.dist-info/RECORD +55 -0
  32. pointblank-0.13.4.dist-info/RECORD +0 -39
  33. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
  34. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
  35. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
  36. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/data/validations/multiple_steps.json ADDED
@@ -0,0 +1,83 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a",
+     "values": 2,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def double_column_a(df):\n \"\"\"Double the values in column 'a'.\"\"\"\n return df.with_columns(pl.col(\"a\") * 2)",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": false,
+     "n": 10,
+     "n_passed": 9,
+     "n_failed": 1,
+     "f_passed": 0.9,
+     "f_failed": 0.1,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.712+00:00",
+     "proc_duration_s": 0.00152
+   },
+   {
+     "i": 2,
+     "i_o": 2,
+     "assertion_type": "col_vals_in_set",
+     "column": "c",
+     "values": [
+       "x",
+       "y"
+     ],
+     "inclusive": null,
+     "na_pass": null,
+     "pre": "def filter_by_d_gt_100(df):\n \"\"\"Filter rows where column 'd' is greater than 100.\"\"\"\n return df.filter(pl.col(\"d\") > 100)",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 7,
+     "n_passed": 7,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.713+00:00",
+     "proc_duration_s": 0.001
+   },
+   {
+     "i": 3,
+     "i_o": 3,
+     "assertion_type": "col_vals_gt",
+     "column": "sum_ab",
+     "values": 100,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def add_computed_column(df):\n \"\"\"Add a computed column based on existing columns.\"\"\"\n return df.with_columns((pl.col(\"a\") + pl.col(\"b\")).alias(\"sum_ab\"))",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": false,
+     "n": 10,
+     "n_passed": 1,
+     "n_failed": 9,
+     "f_passed": 0.1,
+     "f_failed": 0.9,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.714+00:00",
+     "proc_duration_s": 0.001464
+   }
+ ]
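The serialized reports added above are plain JSON, so they can be inspected without pointblank installed. A minimal sketch using only the standard library; the file path and field names follow the records shown in this hunk:

import json

# Load a serialized validation report and summarize each step.
with open("pointblank/data/validations/multiple_steps.json") as f:
    steps = json.load(f)

for step in steps:
    status = "all passed" if step["all_passed"] else f"{step['n_failed']}/{step['n']} failed"
    print(f"step {step['i']}: {step['assertion_type']} on column {step['column']!r} -> {status}")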
pointblank/data/validations/narwhals_function.json ADDED
@@ -0,0 +1,28 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a",
+     "values": 5,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def narwhals_median_transform(df):\n \"\"\"Use narwhals to compute median - cross-backend compatible.\"\"\"\n return nw.from_native(df).select(nw.median(\"a\"), nw.median(\"d\"))",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 1,
+     "n_passed": 1,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.710+00:00",
+     "proc_duration_s": 0.001455
+   }
+ ]
pointblank/data/validations/no_preprocessing.json ADDED
@@ -0,0 +1,83 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a",
+     "values": 0,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": null,
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.718+00:00",
+     "proc_duration_s": 0.001148
+   },
+   {
+     "i": 2,
+     "i_o": 2,
+     "assertion_type": "col_vals_lt",
+     "column": "d",
+     "values": 300,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": null,
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.719+00:00",
+     "proc_duration_s": 0.001181
+   },
+   {
+     "i": 3,
+     "i_o": 3,
+     "assertion_type": "col_vals_in_set",
+     "column": "c",
+     "values": [
+       "x",
+       "y"
+     ],
+     "inclusive": null,
+     "na_pass": null,
+     "pre": null,
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.720+00:00",
+     "proc_duration_s": 0.000892
+   }
+ ]
pointblank/data/validations/pandas_compatible.json ADDED
@@ -0,0 +1,28 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a_plus_b",
+     "values": 10,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def pandas_compatible_transform(df):\n \"\"\"Transform that works with pandas DataFrames.\"\"\"\n if hasattr(df, \"assign\"): # pandas\n return df.assign(a_plus_b=df[\"a\"] + df.get(\"b\", 0))\n else: # polars or other\n return df.with_columns((pl.col(\"a\") + pl.col(\"b\")).alias(\"a_plus_b\"))",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.717+00:00",
+     "proc_duration_s": 0.001428
+   }
+ ]
pointblank/data/validations/preprocessing_functions.py ADDED
@@ -0,0 +1,46 @@
+ """
+ Test preprocessing functions for validation serialization examples.
+
+ These functions are used to create validation objects that can be serialized
+ and stored as reference files for regression testing.
+ """
+
+ import narwhals as nw
+ import polars as pl
+
+
+ def double_column_a(df):
+     """Double the values in column 'a'."""
+     return df.with_columns(pl.col("a") * 2)
+
+
+ def add_computed_column(df):
+     """Add a computed column based on existing columns."""
+     return df.with_columns((pl.col("a") + pl.col("b")).alias("sum_ab"))
+
+
+ def filter_by_d_gt_100(df):
+     """Filter rows where column 'd' is greater than 100."""
+     return df.filter(pl.col("d") > 100)
+
+
+ def narwhals_median_transform(df):
+     """Use narwhals to compute median - cross-backend compatible."""
+     return nw.from_native(df).select(nw.median("a"), nw.median("d"))
+
+
+ def complex_preprocessing(df):
+     """Complex preprocessing combining multiple operations."""
+     return (
+         df.filter(pl.col("a") > 1)
+         .with_columns((pl.col("a") * 2).alias("a_doubled"), (pl.col("d") / 10).alias("d_scaled"))
+         .filter(pl.col("d_scaled") > 10)
+     )
+
+
+ def pandas_compatible_transform(df):
+     """Transform that works with pandas DataFrames."""
+     if hasattr(df, "assign"):  # pandas
+         return df.assign(a_plus_b=df["a"] + df.get("b", 0))
+     else:  # polars or other
+         return df.with_columns((pl.col("a") + pl.col("b")).alias("a_plus_b"))
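These callables are the source of the "pre" strings embedded in the JSON reports above: a validation step can carry a preprocessing function that transforms the table before the assertion runs. A hedged sketch of attaching one to a step; the toy table is invented, and the col_vals_gt(columns=, value=, pre=) signature is assumed from the serialized fields shown above:

import polars as pl
import pointblank as pb
from pointblank.data.validations.preprocessing_functions import double_column_a

tbl = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})

validation = (
    pb.Validate(data=tbl)
    # double_column_a runs before the assertion is evaluated
    .col_vals_gt(columns="a", value=2, pre=double_column_a)
    .interrogate()
)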
pointblank/data/validations/simple_preprocessing.json ADDED
@@ -0,0 +1,57 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a",
+     "values": 0,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def double_column_a(df):\n \"\"\"Double the values in column 'a'.\"\"\"\n return df.with_columns(pl.col(\"a\") * 2)",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.702+00:00",
+     "proc_duration_s": 0.00387
+   },
+   {
+     "i": 2,
+     "i_o": 2,
+     "assertion_type": "col_vals_in_set",
+     "column": "c",
+     "values": [
+       "x",
+       "y"
+     ],
+     "inclusive": null,
+     "na_pass": null,
+     "pre": null,
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.703+00:00",
+     "proc_duration_s": 0.000983
+   }
+ ]
pointblank/datascan.py CHANGED
@@ -143,17 +143,17 @@ class DataScan:
         for conv_method in valid_conversion_methods:
             try:
                 valid_native = getattr(ibis_native, conv_method)()
-            except (NotImplementedError, ImportError, ModuleNotFoundError):
-                continue
+            except (NotImplementedError, ImportError, ModuleNotFoundError):  # pragma: no cover
+                continue  # pragma: no cover
             break
-        else:
+        else:  # pragma: no cover
             msg = (
                 "To use `ibis` as input, you must have one of arrow, pandas, polars or numpy "
                 "available in the process. Until `ibis` is fully supported by Narwhals, this is "
                 "necessary. Additionally, the data must be collected in order to calculate some "
                 "structural statistics, which may be performance detrimental."
             )
-            raise ImportError(msg)
+            raise ImportError(msg)  # pragma: no cover
         as_native = nw.from_native(valid_native)

         self.nw_data: Frame = nw.from_native(as_native)
@@ -299,12 +299,12 @@ class _DataProfile:  # TODO: feels redundant and weird
         # instantiations that require consistent types.
         all_same_type: bool = all(type(v) is first_type for v in values[1:])
         if not all_same_type:
-            if strict:
-                msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."
-                raise TypeError(msg)
-            for d in cols:
-                if key in d:
-                    d[key] = str(d[key])
+            if strict:  # pragma: no cover
+                msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."  # pragma: no cover
+                raise TypeError(msg)  # pragma: no cover
+            for d in cols:  # pragma: no cover
+                if key in d:  # pragma: no cover
+                    d[key] = str(d[key])  # pragma: no cover

         return nw.from_dict(transpose_dicts(cols), backend=self.implementation)
pointblank/schema.py CHANGED
@@ -343,15 +343,15 @@ class Schema:
             schema_dict = {k: str(v) for k, v in schema_dict.items()}
             self.columns = list(schema_dict.items())

-        elif table_type == "pyspark":
+        elif table_type == "pyspark":  # pragma: no cover
             # Convert PySpark DataFrame to Narwhals to get schema
-            nw_df = nw.from_native(self.tbl)
-            if _is_lazy_frame(data=nw_df):
-                schema_dict = dict(nw_df.collect_schema())
-            else:
-                schema_dict = dict(nw_df.schema.items())
-            schema_dict = {k: str(v) for k, v in schema_dict.items()}
-            self.columns = list(schema_dict.items())
+            nw_df = nw.from_native(self.tbl)  # pragma: no cover
+            if _is_lazy_frame(data=nw_df):  # pragma: no cover
+                schema_dict = dict(nw_df.collect_schema())  # pragma: no cover
+            else:  # pragma: no cover
+                schema_dict = dict(nw_df.schema.items())  # pragma: no cover
+            schema_dict = {k: str(v) for k, v in schema_dict.items()}  # pragma: no cover
+            self.columns = list(schema_dict.items())  # pragma: no cover

         elif table_type in IBIS_BACKENDS:
             schema_dict = dict(self.tbl.schema().items())
@@ -888,80 +888,6 @@ def _schema_info_generate_params_dict(
     }


-def _check_schema_match(
-    data_tbl: any,
-    schema: Schema,
-    complete: bool = True,
-    in_order: bool = True,
-    case_sensitive_colnames: bool = True,
-    case_sensitive_dtypes: bool = True,
-    full_match_dtypes: bool = True,
-) -> bool:
-    """
-    Check if the schema matches the target table.
-
-    This function performs schema validation and returns a boolean result.
-
-    Parameters
-    ----------
-    data_tbl
-        The target table to validate.
-    schema
-        The expected schema.
-    complete
-        Whether the schema should be complete.
-    in_order
-        Whether the schema should be in order.
-    case_sensitive_colnames
-        Whether column names are case-sensitive.
-    case_sensitive_dtypes
-        Whether data types are case-sensitive.
-    full_match_dtypes
-        Whether data types must match exactly.
-
-    Returns
-    -------
-    bool
-        True if the schema matches, False otherwise.
-    """
-    validation_info = _get_schema_validation_info(
-        data_tbl=data_tbl,
-        schema=schema,
-        passed=False,  # This will be determined by the logic below
-        complete=complete,
-        in_order=in_order,
-        case_sensitive_colnames=case_sensitive_colnames,
-        case_sensitive_dtypes=case_sensitive_dtypes,
-        full_match_dtypes=full_match_dtypes,
-    )
-
-    # Determine if the schema validation passed based on the validation info
-    passed = True
-
-    # Check completeness requirement
-    if complete and not validation_info["columns_full_set"]:
-        passed = False
-
-    # Check order requirement
-    if in_order and not validation_info["columns_matched_in_order"]:
-        passed = False
-
-    # Check if all expected columns were found
-    if validation_info["columns_not_found"]:
-        passed = False
-
-    # Check column-specific validations
-    for col_info in validation_info["columns"].values():
-        if not col_info["colname_matched"]:
-            passed = False
-        if not col_info.get(
-            "dtype_matched", True
-        ):  # dtype_matched may not exist if no dtypes specified
-            passed = False
-
-    return passed
-
-
 def _get_schema_validation_info(
     data_tbl: any,
     schema: Schema,
pointblank/thresholds.py CHANGED
@@ -559,7 +559,7 @@ class FinalActions:
     def send_alert():
         summary = pb.get_validation_summary()
         if summary["highest_severity"] == "critical":
-            print(f"ALERT: Critical validation failures found in {summary['table_name']}")
+            print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")

     validation = (
         pb.Validate(
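The corrected line sits in a docstring example for FinalActions: the summary dict returned by pb.get_validation_summary() exposes the table name under the "tbl_name" key, not "table_name", which is what this one-line fix addresses. A hedged completion of the truncated example is sketched below; the dataset and validation step are illustrative, and wiring the callback through final_actions=pb.FinalActions(...) is assumed from the class this hunk belongs to:

import pointblank as pb

def send_alert():
    summary = pb.get_validation_summary()
    if summary["highest_severity"] == "critical":
        print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")

validation = (
    pb.Validate(
        data=pb.load_dataset(dataset="game_revenue"),
        final_actions=pb.FinalActions(send_alert),
    )
    .col_vals_gt(columns="item_revenue", value=0)
    .interrogate()
)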