pointblank 0.13.4__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +54 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +182 -11
  5. pointblank/_utils.py +3 -3
  6. pointblank/_utils_ai.py +850 -0
  7. pointblank/cli.py +128 -115
  8. pointblank/column.py +1 -1
  9. pointblank/data/api-docs.txt +198 -13
  10. pointblank/data/validations/README.md +108 -0
  11. pointblank/data/validations/complex_preprocessing.json +54 -0
  12. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  13. pointblank/data/validations/generate_test_files.py +127 -0
  14. pointblank/data/validations/multiple_steps.json +83 -0
  15. pointblank/data/validations/multiple_steps.pkl +0 -0
  16. pointblank/data/validations/narwhals_function.json +28 -0
  17. pointblank/data/validations/narwhals_function.pkl +0 -0
  18. pointblank/data/validations/no_preprocessing.json +83 -0
  19. pointblank/data/validations/no_preprocessing.pkl +0 -0
  20. pointblank/data/validations/pandas_compatible.json +28 -0
  21. pointblank/data/validations/pandas_compatible.pkl +0 -0
  22. pointblank/data/validations/preprocessing_functions.py +46 -0
  23. pointblank/data/validations/simple_preprocessing.json +57 -0
  24. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  25. pointblank/datascan.py +4 -4
  26. pointblank/scan_profile.py +6 -6
  27. pointblank/schema.py +8 -82
  28. pointblank/thresholds.py +1 -1
  29. pointblank/validate.py +1233 -12
  30. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
  31. pointblank-0.14.0.dist-info/RECORD +55 -0
  32. pointblank-0.13.4.dist-info/RECORD +0 -39
  33. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
  34. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
  35. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
  36. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py CHANGED
@@ -119,8 +119,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         # The namespace is the actual module, so we check its name
         if hasattr(native_namespace, "__name__") and "ibis" in native_namespace.__name__:
             return null_check
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover
 
     # For non-Ibis backends, try to use `is_nan()` if the column type supports it
     try:
@@ -128,8 +128,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
             schema = data_frame.collect_schema()
         elif hasattr(data_frame, "schema"):
             schema = data_frame.schema
-        else:
-            schema = None
+        else:  # pragma: no cover
+            schema = None  # pragma: no cover
 
         if schema and column_name:
             column_dtype = schema.get(column_name)
@@ -148,8 +148,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         except Exception:
             # If `is_nan()` fails for any reason, fall back to Null only
             pass
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover
 
     # Fallback: just check Null values
     return null_check
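
These three hunks only add `# pragma: no cover` markers; the control flow is unchanged. For orientation, the Null/NaN pattern they annotate looks roughly like the following minimal sketch in narwhals terms (the helper name and exact structure are illustrative, not the library's code verbatim):

import narwhals as nw

def null_or_nan_expr(frame, column_name: str):
    # A plain Null check is always safe to build
    null_check = nw.col(column_name).is_null()
    try:
        # Prefer the lazy-safe schema accessor when available
        schema = frame.collect_schema() if hasattr(frame, "collect_schema") else frame.schema
        dtype = schema.get(column_name)
        if dtype in (nw.Float32, nw.Float64):
            # Float columns can also hold NaN, which is distinct from Null
            return null_check | nw.col(column_name).is_nan()
    except Exception:
        pass  # any failure falls through to the Null-only check
    return null_check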
@@ -333,7 +333,7 @@ class ConjointlyValidation:
                     ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
                     ibis_expressions.append(ibis_expr)
                 except Exception:  # pragma: no cover
-                    # Silent failure - we already tried both strategies
+                    # Silent failure where we already tried both strategies
                     pass
 
         # Combine expressions
@@ -370,7 +370,7 @@ class ConjointlyValidation:
                 else:
                     raise TypeError(
                         f"Expression returned {type(expr_result)}, expected PySpark Column"
-                    )
+                    )  # pragma: no cover
 
             except Exception as e:
                 try:
@@ -382,7 +382,9 @@ class ConjointlyValidation:
                         pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
                         pyspark_columns.append(pyspark_expr)
                     else:
-                        raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
+                        raise TypeError(
+                            f"Cannot convert {type(col_expr)} to PySpark Column"
+                        )  # pragma: no cover
                 except Exception as nested_e:
                     print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
@@ -435,7 +437,7 @@ class SpeciallyValidation:
             data_tbl = self.data_tbl
             result = expression(data_tbl)
         else:
-            # More than one parameter - this doesn't match either allowed signature
+            # More than one parameter: this doesn't match either allowed signature
             raise ValueError(
                 f"The function provided to 'specially()' should have either no parameters or a "
                 f"single 'data' parameter, but it has {len(params)} parameters: {params}"
@@ -656,7 +658,7 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
         return data_tbl.assign(pb_is_good_=expr)
 
     # For remote backends, return original table (placeholder)
-    return data_tbl
+    return data_tbl  # pragma: no cover
 
 
 def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
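
On the local pandas path above, `expr` just needs to be something `DataFrame.assign()` accepts as a column. A tiny illustration with invented data:

import pandas as pd

data_tbl = pd.DataFrame({"a": [1, 5, 10]})
expr = data_tbl["a"] > 2                      # boolean Series, usable with assign()
checked = data_tbl.assign(pb_is_good_=expr)   # mirrors the local-backend branch above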
@@ -1862,3 +1864,172 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
     result_tbl = result_tbl.drop("_any_is_null_")
 
     return result_tbl.to_native()
+
+
+def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config: dict) -> FrameT:
+    """AI-powered interrogation of rows."""
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    try:
+        # Import AI validation modules
+        from pointblank._utils_ai import (
+            _AIValidationEngine,
+            _BatchConfig,
+            _DataBatcher,
+            _LLMConfig,
+            _PromptBuilder,
+            _ValidationResponseParser,
+        )
+
+        # Extract AI configuration
+        prompt = ai_config["prompt"]
+        llm_provider = ai_config["llm_provider"]
+        llm_model = ai_config["llm_model"]
+        batch_size = ai_config.get("batch_size", 1000)
+        max_concurrent = ai_config.get("max_concurrent", 3)
+
+        # Set up LLM configuration (api_key will be loaded from environment)
+        llm_config = _LLMConfig(
+            provider=llm_provider,
+            model=llm_model,
+            api_key=None,  # Will be loaded from environment variables
+        )
+
+        # Set up batch configuration
+        batch_config = _BatchConfig(size=batch_size, max_concurrent=max_concurrent)
+
+        # Create optimized data batcher
+        batcher = _DataBatcher(data=tbl, columns=columns_subset, config=batch_config)
+
+        # Create batches with signature mapping for optimization
+        batches, signature_mapping = batcher.create_batches()
+        logger.info(f"Created {len(batches)} batches for AI validation")
+
+        # Log optimization stats
+        if hasattr(batcher, "get_reduction_stats"):
+            stats = batcher.get_reduction_stats()
+            if stats.get("reduction_percentage", 0) > 0:
+                logger.info(
+                    f"Optimization: {stats['original_rows']} → {stats['unique_rows']} rows ({stats['reduction_percentage']:.1f}% reduction)"
+                )
+
+        # Create prompt builder
+        prompt_builder = _PromptBuilder(prompt)
+
+        # Create AI validation engine
+        engine = _AIValidationEngine(llm_config)
+
+        # Run AI validation synchronously (chatlas is synchronous)
+        batch_results = engine.validate_batches(
+            batches=batches, prompt_builder=prompt_builder, max_concurrent=max_concurrent
+        )
+
+        # Parse and combine results with signature mapping optimization
+        parser = _ValidationResponseParser(total_rows=len(tbl))
+        combined_results = parser.combine_batch_results(batch_results, signature_mapping)
+
+        # Debug: Log table info and combined results
+        logger.debug("🏁 Final result conversion:")
+        logger.debug(f" - Table length: {len(tbl)}")
+        logger.debug(
+            f" - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
+        )
+
+        # Convert results to narwhals format
+        nw_tbl = nw.from_native(tbl)
+
+        # Create a boolean column for validation results
+        validation_results = []
+        for i in range(len(tbl)):
+            # Default to False if row wasn't processed
+            result = combined_results.get(i, False)
+            validation_results.append(result)
+
+            # Debug: Log first few conversions
+            if i < 5 or len(tbl) - i <= 2:
+                logger.debug(f" Row {i}: {result} (from combined_results.get({i}, False))")
+
+        logger.debug(f" - Final validation_results length: {len(validation_results)}")
+        logger.debug(f" - Final passed count: {sum(validation_results)}")
+        logger.debug(
+            f" - Final failed count: {len(validation_results) - sum(validation_results)}"
+        )
+
+        # Add the pb_is_good_ column by creating a proper boolean Series
+        # First convert to native to work with the underlying data frame
+        native_tbl = nw_tbl.to_native()
+
+        # Create the result table with the boolean column
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Generic fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        logger.info(
+            f"AI validation completed. {sum(validation_results)} rows passed out of {len(validation_results)}"
+        )
+
+        return result_tbl
+
+    except ImportError as e:
+        logger.error(f"Missing dependencies for AI validation: {e}")
+        logger.error("Install required packages: pip install openai anthropic aiohttp")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
+
+    except Exception as e:
+        logger.error(f"AI validation failed: {e}")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
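
Driving this function directly requires an `ai_config` dict with the keys read at its top. A hedged sketch (the data and model name are placeholders, real LLM calls are made and API keys must be set in the environment; the supported public entry point presumably lives in validate.py):

import polars as pl
from pointblank._interrogation import interrogate_prompt  # private module, per this diff

tbl = pl.DataFrame({"name": ["Alice", "x9!!"], "email": ["a@b.co", "not-an-email"]})

ai_config = {
    "prompt": "Each row should have a plausible person name and a valid email address.",
    "llm_provider": "openai",    # API key resolved from environment variables
    "llm_model": "gpt-4o-mini",  # placeholder model name
    "batch_size": 1000,          # optional; default shown above
    "max_concurrent": 3,         # optional; default shown above
}

result = interrogate_prompt(tbl, columns_subset=["name", "email"], ai_config=ai_config)
# Whether validation succeeds or falls back on error, `result` carries a boolean
# `pb_is_good_` column marking the rows the LLM judged valid (False on failure).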
pointblank/_utils.py CHANGED
@@ -102,7 +102,7 @@ def _get_tbl_type(data: FrameT | Any) -> str:
         if "read_parquet" in tbl_name:
             return "parquet"
 
-    else:
+    else:  # pragma: no cover
         return "duckdb"
 
     return backend
@@ -274,10 +274,10 @@ def _copy_dataframe(df):
         import copy
 
         return copy.deepcopy(df)
-    except Exception:
+    except Exception:  # pragma: no cover
         # If all else fails, return the original DataFrame
         # This is better than crashing the validation
-        return df
+        return df  # pragma: no cover
 
 
 def _convert_to_narwhals(df: FrameT) -> nw.DataFrame:
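
For context, `_copy_dataframe` is a best-effort helper and the newly excluded lines are its last-resort branch. A rough sketch of the overall shape (the native clone/copy attempts before `deepcopy` are assumed, since this hunk only shows the tail):

import copy

def copy_dataframe_sketch(df):
    try:
        if hasattr(df, "clone"):   # e.g. Polars
            return df.clone()
        if hasattr(df, "copy"):    # e.g. Pandas
            return df.copy()
        return copy.deepcopy(df)   # generic fallback
    except Exception:
        # If all else fails, return the original DataFrame:
        # better than crashing the validation run
        return df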