pointblank 0.13.3__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +54 -0
- pointblank/_constants_translations.py +541 -2
- pointblank/_interrogation.py +198 -12
- pointblank/_utils.py +41 -1
- pointblank/_utils_ai.py +850 -0
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +198 -13
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +1412 -20
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
- pointblank-0.14.0.dist-info/RECORD +55 -0
- pointblank/_constants_docs.py +0 -40
- pointblank-0.13.3.dist-info/RECORD +0 -40
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
@@ -119,8 +119,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
             # The namespace is the actual module, so we check its name
             if hasattr(native_namespace, "__name__") and "ibis" in native_namespace.__name__:
                 return null_check
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover

     # For non-Ibis backends, try to use `is_nan()` if the column type supports it
     try:
@@ -128,8 +128,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
             schema = data_frame.collect_schema()
         elif hasattr(data_frame, "schema"):
             schema = data_frame.schema
-        else:
-            schema = None
+        else:  # pragma: no cover
+            schema = None  # pragma: no cover

         if schema and column_name:
             column_dtype = schema.get(column_name)
@@ -148,8 +148,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         except Exception:
             # If `is_nan()` fails for any reason, fall back to Null only
            pass
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover

    # Fallback: just check Null values
    return null_check
@@ -333,7 +333,7 @@ class ConjointlyValidation:
                     ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
                     ibis_expressions.append(ibis_expr)
                 except Exception:  # pragma: no cover
-                    # Silent failure
+                    # Silent failure where we already tried both strategies
                     pass

         # Combine expressions
@@ -370,7 +370,7 @@ class ConjointlyValidation:
                 else:
                     raise TypeError(
                         f"Expression returned {type(expr_result)}, expected PySpark Column"
-                    )
+                    )  # pragma: no cover

         except Exception as e:
             try:
@@ -382,7 +382,9 @@ class ConjointlyValidation:
                     pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
                     pyspark_columns.append(pyspark_expr)
                 else:
-                    raise TypeError(
+                    raise TypeError(
+                        f"Cannot convert {type(col_expr)} to PySpark Column"
+                    )  # pragma: no cover
             except Exception as nested_e:
                 print(f"Error evaluating PySpark expression: {e} -> {nested_e}")

@@ -435,7 +437,7 @@ class SpeciallyValidation:
                 data_tbl = self.data_tbl
                 result = expression(data_tbl)
             else:
-                # More than one parameter
+                # More than one parameter: this doesn't match either allowed signature
                 raise ValueError(
                     f"The function provided to 'specially()' should have either no parameters or a "
                     f"single 'data' parameter, but it has {len(params)} parameters: {params}"
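The comment change above spells out the rule enforced here: a function handed to `specially()` may take either zero parameters or exactly one `data` parameter. A minimal sketch of the two accepted shapes (frame and function names are illustrative, not from the package; exact return-value expectations are covered by the `specially()` docs):

    import polars as pl

    tbl = pl.DataFrame({"a": [2, 4, 6]})

    # One `data` parameter: the validation table is passed in
    def all_even(data):
        return bool((data["a"] % 2 == 0).all())

    # Zero parameters: the function closes over a table it already knows about
    def all_even_closure():
        return bool((tbl["a"] % 2 == 0).all())

    # Two or more parameters would trigger the ValueError in this hunk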
@@ -656,7 +658,7 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
         return data_tbl.assign(pb_is_good_=expr)

     # For remote backends, return original table (placeholder)
-    return data_tbl
+    return data_tbl  # pragma: no cover


 def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
@@ -1688,15 +1690,30 @@ def interrogate_notin(tbl: FrameT, column: str, set_values: any) -> FrameT:
     return result_tbl.to_native()


-def interrogate_regex(tbl: FrameT, column: str,
+def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: bool) -> FrameT:
     """Regex interrogation."""

+    # Handle both old and new formats for backward compatibility
+    if isinstance(values, str):
+        pattern = values
+        inverse = False
+    else:
+        pattern = values["pattern"]
+        inverse = values["inverse"]
+
     nw_tbl = nw.from_native(tbl)
     result_tbl = nw_tbl.with_columns(
         pb_is_good_1=nw.col(column).is_null() & na_pass,
         pb_is_good_2=nw.col(column).str.contains(pattern, literal=False).fill_null(False),
     )

+    # Apply inverse logic if needed
+    if inverse:
+        # Use explicit boolean logic instead of bitwise NOT for pandas compatibility
+        result_tbl = result_tbl.with_columns(
+            pb_is_good_2=nw.when(nw.col("pb_is_good_2")).then(False).otherwise(True)
+        )
+
     result_tbl = result_tbl.with_columns(
         pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
     ).drop("pb_is_good_1", "pb_is_good_2")
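Two things change in `interrogate_regex()`: the `values` payload may now be a dict carrying a `pattern` plus an `inverse` flag (a bare pattern string still works for backward compatibility), and the inverse branch negates matches with `when/then/otherwise` rather than `~`, since bitwise NOT can misbehave on pandas boolean columns. A standalone sketch of that negation pattern (frame and column names are illustrative):

    import narwhals as nw
    import pandas as pd

    df = nw.from_native(pd.DataFrame({"matched": [True, False, True]}))

    # Explicit boolean inversion, as in the hunk above, instead of `~nw.col("matched")`
    inverted = df.with_columns(
        matched=nw.when(nw.col("matched")).then(False).otherwise(True)
    )
    print(inverted.to_native())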
@@ -1847,3 +1864,172 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
     result_tbl = result_tbl.drop("_any_is_null_")

     return result_tbl.to_native()
+
+
+def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config: dict) -> FrameT:
+    """AI-powered interrogation of rows."""
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    try:
+        # Import AI validation modules
+        from pointblank._utils_ai import (
+            _AIValidationEngine,
+            _BatchConfig,
+            _DataBatcher,
+            _LLMConfig,
+            _PromptBuilder,
+            _ValidationResponseParser,
+        )
+
+        # Extract AI configuration
+        prompt = ai_config["prompt"]
+        llm_provider = ai_config["llm_provider"]
+        llm_model = ai_config["llm_model"]
+        batch_size = ai_config.get("batch_size", 1000)
+        max_concurrent = ai_config.get("max_concurrent", 3)
+
+        # Set up LLM configuration (api_key will be loaded from environment)
+        llm_config = _LLMConfig(
+            provider=llm_provider,
+            model=llm_model,
+            api_key=None,  # Will be loaded from environment variables
+        )
+
+        # Set up batch configuration
+        batch_config = _BatchConfig(size=batch_size, max_concurrent=max_concurrent)
+
+        # Create optimized data batcher
+        batcher = _DataBatcher(data=tbl, columns=columns_subset, config=batch_config)
+
+        # Create batches with signature mapping for optimization
+        batches, signature_mapping = batcher.create_batches()
+        logger.info(f"Created {len(batches)} batches for AI validation")
+
+        # Log optimization stats
+        if hasattr(batcher, "get_reduction_stats"):
+            stats = batcher.get_reduction_stats()
+            if stats.get("reduction_percentage", 0) > 0:
+                logger.info(
+                    f"Optimization: {stats['original_rows']} → {stats['unique_rows']} rows ({stats['reduction_percentage']:.1f}% reduction)"
+                )
+
+        # Create prompt builder
+        prompt_builder = _PromptBuilder(prompt)
+
+        # Create AI validation engine
+        engine = _AIValidationEngine(llm_config)
+
+        # Run AI validation synchronously (chatlas is synchronous)
+        batch_results = engine.validate_batches(
+            batches=batches, prompt_builder=prompt_builder, max_concurrent=max_concurrent
+        )
+
+        # Parse and combine results with signature mapping optimization
+        parser = _ValidationResponseParser(total_rows=len(tbl))
+        combined_results = parser.combine_batch_results(batch_results, signature_mapping)
+
+        # Debug: Log table info and combined results
+        logger.debug("🏁 Final result conversion:")
+        logger.debug(f" - Table length: {len(tbl)}")
+        logger.debug(
+            f" - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
+        )
+
+        # Convert results to narwhals format
+        nw_tbl = nw.from_native(tbl)
+
+        # Create a boolean column for validation results
+        validation_results = []
+        for i in range(len(tbl)):
+            # Default to False if row wasn't processed
+            result = combined_results.get(i, False)
+            validation_results.append(result)
+
+            # Debug: Log first few conversions
+            if i < 5 or len(tbl) - i <= 2:
+                logger.debug(f" Row {i}: {result} (from combined_results.get({i}, False))")
+
+        logger.debug(f" - Final validation_results length: {len(validation_results)}")
+        logger.debug(f" - Final passed count: {sum(validation_results)}")
+        logger.debug(
+            f" - Final failed count: {len(validation_results) - sum(validation_results)}"
+        )
+
+        # Add the pb_is_good_ column by creating a proper boolean Series
+        # First convert to native to work with the underlying data frame
+        native_tbl = nw_tbl.to_native()
+
+        # Create the result table with the boolean column
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Generic fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        logger.info(
+            f"AI validation completed. {sum(validation_results)} rows passed out of {len(validation_results)}"
+        )
+
+        return result_tbl
+
+    except ImportError as e:
+        logger.error(f"Missing dependencies for AI validation: {e}")
+        logger.error("Install required packages: pip install openai anthropic aiohttp")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
+
+    except Exception as e:
+        logger.error(f"AI validation failed: {e}")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
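`interrogate_prompt()` drives the new `_utils_ai` machinery added in this release: rows are deduplicated and batched, each batch is sent to the configured LLM, and responses are mapped back to a boolean `pb_is_good_` column, defaulting to False for unprocessed rows or on any failure. Per the keys accessed above, the `ai_config` dict it consumes has this shape (provider and model values are placeholders, not defaults from the package; the batch defaults are from the code):

    ai_config = {
        "prompt": "Flag rows whose 'email' value is not a plausible address.",
        "llm_provider": "anthropic",        # forwarded to _LLMConfig(provider=...)
        "llm_model": "claude-3-5-sonnet",   # forwarded to _LLMConfig(model=...)
        "batch_size": 1000,                 # optional; defaults to 1000
        "max_concurrent": 3,                # optional; defaults to 3
    }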
pointblank/_utils.py
CHANGED
@@ -102,7 +102,7 @@ def _get_tbl_type(data: FrameT | Any) -> str:
         if "read_parquet" in tbl_name:
             return "parquet"

-        else:
+        else:  # pragma: no cover
             return "duckdb"

     return backend
@@ -240,6 +240,46 @@ def _select_df_lib(preference: str = "polars") -> Any:
     return pl if pl is not None else pd


+def _copy_dataframe(df):
+    """
+    Create a copy of a DataFrame, handling different DataFrame types.
+
+    This function attempts to create a proper copy of the DataFrame using
+    the most appropriate method for each DataFrame type.
+    """
+    # Try standard copy methods first
+    if hasattr(df, "copy") and callable(getattr(df, "copy")):
+        try:
+            return df.copy()
+        except Exception:
+            pass
+
+    if hasattr(df, "clone") and callable(getattr(df, "clone")):
+        try:
+            return df.clone()
+        except Exception:
+            pass
+
+    # Try the select('*') approach for DataFrames that support it
+    # This works well for PySpark and other SQL-like DataFrames
+    if hasattr(df, "select") and callable(getattr(df, "select")):
+        try:
+            return df.select("*")
+        except Exception:
+            pass
+
+    # For DataFrames that can't be copied, return original
+    # This provides some protection while avoiding crashes
+    try:
+        import copy
+
+        return copy.deepcopy(df)
+    except Exception:  # pragma: no cover
+        # If all else fails, return the original DataFrame
+        # This is better than crashing the validation
+        return df  # pragma: no cover
+
+
 def _convert_to_narwhals(df: FrameT) -> nw.DataFrame:
     # Convert the DataFrame to a format that narwhals can work with
     return nw.from_native(df)
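`_copy_dataframe()` tries progressively more generic strategies: `.copy()` (pandas-style), `.clone()` (Polars), `select("*")` (PySpark and other SQL-like frames), then `copy.deepcopy()`, returning the original frame only if everything fails. A quick sketch of which branch fires for the two common local backends (assuming both libraries are installed):

    import pandas as pd
    import polars as pl

    from pointblank._utils import _copy_dataframe  # added in 0.14.0

    pd_copy = _copy_dataframe(pd.DataFrame({"a": [1, 2]}))  # served by df.copy()
    pl_copy = _copy_dataframe(pl.DataFrame({"a": [1, 2]}))  # no .copy() method, so df.clone()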