pointblank 0.13.4__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +54 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +182 -11
  5. pointblank/_utils.py +3 -3
  6. pointblank/_utils_ai.py +850 -0
  7. pointblank/cli.py +128 -115
  8. pointblank/column.py +1 -1
  9. pointblank/data/api-docs.txt +198 -13
  10. pointblank/data/validations/README.md +108 -0
  11. pointblank/data/validations/complex_preprocessing.json +54 -0
  12. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  13. pointblank/data/validations/generate_test_files.py +127 -0
  14. pointblank/data/validations/multiple_steps.json +83 -0
  15. pointblank/data/validations/multiple_steps.pkl +0 -0
  16. pointblank/data/validations/narwhals_function.json +28 -0
  17. pointblank/data/validations/narwhals_function.pkl +0 -0
  18. pointblank/data/validations/no_preprocessing.json +83 -0
  19. pointblank/data/validations/no_preprocessing.pkl +0 -0
  20. pointblank/data/validations/pandas_compatible.json +28 -0
  21. pointblank/data/validations/pandas_compatible.pkl +0 -0
  22. pointblank/data/validations/preprocessing_functions.py +46 -0
  23. pointblank/data/validations/simple_preprocessing.json +57 -0
  24. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  25. pointblank/datascan.py +4 -4
  26. pointblank/scan_profile.py +6 -6
  27. pointblank/schema.py +8 -82
  28. pointblank/thresholds.py +1 -1
  29. pointblank/validate.py +1233 -12
  30. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
  31. pointblank-0.14.0.dist-info/RECORD +55 -0
  32. pointblank-0.13.4.dist-info/RECORD +0 -39
  33. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
  34. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
  35. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
  36. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py CHANGED
@@ -119,8 +119,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         # The namespace is the actual module, so we check its name
         if hasattr(native_namespace, "__name__") and "ibis" in native_namespace.__name__:
             return null_check
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover
 
     # For non-Ibis backends, try to use `is_nan()` if the column type supports it
     try:
@@ -128,8 +128,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
             schema = data_frame.collect_schema()
         elif hasattr(data_frame, "schema"):
             schema = data_frame.schema
-        else:
-            schema = None
+        else:  # pragma: no cover
+            schema = None  # pragma: no cover
 
         if schema and column_name:
             column_dtype = schema.get(column_name)
@@ -148,8 +148,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         except Exception:
             # If `is_nan()` fails for any reason, fall back to Null only
             pass
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover
 
     # Fallback: just check Null values
     return null_check
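
These three hunks only add `# pragma: no cover` markers; the control flow is unchanged. For orientation, the Null/NaN pattern they annotate looks roughly like the following minimal sketch in narwhals terms (the helper name and exact structure are illustrative, not the library's code verbatim):

import narwhals as nw

def null_or_nan_expr(frame, column_name: str):
    # A plain Null check is always safe to build
    null_check = nw.col(column_name).is_null()
    try:
        # Prefer the lazy-safe schema accessor when available
        schema = frame.collect_schema() if hasattr(frame, "collect_schema") else frame.schema
        dtype = schema.get(column_name)
        if dtype in (nw.Float32, nw.Float64):
            # Float columns can also hold NaN, which is distinct from Null
            return null_check | nw.col(column_name).is_nan()
    except Exception:
        pass  # any failure falls through to the Null-only check
    return null_check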
@@ -333,7 +333,7 @@ class ConjointlyValidation:
                     ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
                     ibis_expressions.append(ibis_expr)
                 except Exception:  # pragma: no cover
-                    # Silent failure - we already tried both strategies
+                    # Silent failure where we already tried both strategies
                     pass
 
         # Combine expressions
@@ -370,7 +370,7 @@ class ConjointlyValidation:
                 else:
                     raise TypeError(
                         f"Expression returned {type(expr_result)}, expected PySpark Column"
-                    )
+                    )  # pragma: no cover
 
             except Exception as e:
                 try:
@@ -382,7 +382,9 @@ class ConjointlyValidation:
                         pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
                         pyspark_columns.append(pyspark_expr)
                     else:
-                        raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
+                        raise TypeError(
+                            f"Cannot convert {type(col_expr)} to PySpark Column"
+                        )  # pragma: no cover
                 except Exception as nested_e:
                     print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
@@ -435,7 +437,7 @@ class SpeciallyValidation:
             data_tbl = self.data_tbl
             result = expression(data_tbl)
         else:
-            # More than one parameter - this doesn't match either allowed signature
+            # More than one parameter: this doesn't match either allowed signature
             raise ValueError(
                 f"The function provided to 'specially()' should have either no parameters or a "
                 f"single 'data' parameter, but it has {len(params)} parameters: {params}"
@@ -656,7 +658,7 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
         return data_tbl.assign(pb_is_good_=expr)
 
     # For remote backends, return original table (placeholder)
-    return data_tbl
+    return data_tbl  # pragma: no cover
 
 
 def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
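
On the local pandas path above, `expr` just needs to be something `DataFrame.assign()` accepts as a column. A tiny illustration with invented data:

import pandas as pd

data_tbl = pd.DataFrame({"a": [1, 5, 10]})
expr = data_tbl["a"] > 2                      # boolean Series, usable with assign()
checked = data_tbl.assign(pb_is_good_=expr)   # mirrors the local-backend branch above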
@@ -1862,3 +1864,172 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
     result_tbl = result_tbl.drop("_any_is_null_")
 
     return result_tbl.to_native()
+
+
+def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config: dict) -> FrameT:
+    """AI-powered interrogation of rows."""
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    try:
+        # Import AI validation modules
+        from pointblank._utils_ai import (
+            _AIValidationEngine,
+            _BatchConfig,
+            _DataBatcher,
+            _LLMConfig,
+            _PromptBuilder,
+            _ValidationResponseParser,
+        )
+
+        # Extract AI configuration
+        prompt = ai_config["prompt"]
+        llm_provider = ai_config["llm_provider"]
+        llm_model = ai_config["llm_model"]
+        batch_size = ai_config.get("batch_size", 1000)
+        max_concurrent = ai_config.get("max_concurrent", 3)
+
+        # Set up LLM configuration (api_key will be loaded from environment)
+        llm_config = _LLMConfig(
+            provider=llm_provider,
+            model=llm_model,
+            api_key=None,  # Will be loaded from environment variables
+        )
+
+        # Set up batch configuration
+        batch_config = _BatchConfig(size=batch_size, max_concurrent=max_concurrent)
+
+        # Create optimized data batcher
+        batcher = _DataBatcher(data=tbl, columns=columns_subset, config=batch_config)
+
+        # Create batches with signature mapping for optimization
+        batches, signature_mapping = batcher.create_batches()
+        logger.info(f"Created {len(batches)} batches for AI validation")
+
+        # Log optimization stats
+        if hasattr(batcher, "get_reduction_stats"):
+            stats = batcher.get_reduction_stats()
+            if stats.get("reduction_percentage", 0) > 0:
+                logger.info(
+                    f"Optimization: {stats['original_rows']} → {stats['unique_rows']} rows ({stats['reduction_percentage']:.1f}% reduction)"
+                )
+
+        # Create prompt builder
+        prompt_builder = _PromptBuilder(prompt)
+
+        # Create AI validation engine
+        engine = _AIValidationEngine(llm_config)
+
+        # Run AI validation synchronously (chatlas is synchronous)
+        batch_results = engine.validate_batches(
+            batches=batches, prompt_builder=prompt_builder, max_concurrent=max_concurrent
+        )
+
+        # Parse and combine results with signature mapping optimization
+        parser = _ValidationResponseParser(total_rows=len(tbl))
+        combined_results = parser.combine_batch_results(batch_results, signature_mapping)
+
+        # Debug: Log table info and combined results
+        logger.debug("🏁 Final result conversion:")
+        logger.debug(f" - Table length: {len(tbl)}")
+        logger.debug(
+            f" - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
+        )
+
+        # Convert results to narwhals format
+        nw_tbl = nw.from_native(tbl)
+
+        # Create a boolean column for validation results
+        validation_results = []
+        for i in range(len(tbl)):
+            # Default to False if row wasn't processed
+            result = combined_results.get(i, False)
+            validation_results.append(result)
+
+            # Debug: Log first few conversions
+            if i < 5 or len(tbl) - i <= 2:
+                logger.debug(f" Row {i}: {result} (from combined_results.get({i}, False))")
+
+        logger.debug(f" - Final validation_results length: {len(validation_results)}")
+        logger.debug(f" - Final passed count: {sum(validation_results)}")
+        logger.debug(
+            f" - Final failed count: {len(validation_results) - sum(validation_results)}"
+        )
+
+        # Add the pb_is_good_ column by creating a proper boolean Series
+        # First convert to native to work with the underlying data frame
+        native_tbl = nw_tbl.to_native()
+
+        # Create the result table with the boolean column
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Generic fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        logger.info(
+            f"AI validation completed. {sum(validation_results)} rows passed out of {len(validation_results)}"
+        )
+
+        return result_tbl
+
+    except ImportError as e:
+        logger.error(f"Missing dependencies for AI validation: {e}")
+        logger.error("Install required packages: pip install openai anthropic aiohttp")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
+
+    except Exception as e:
+        logger.error(f"AI validation failed: {e}")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
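
Driving this function directly requires an `ai_config` dict with the keys read at its top. A hedged sketch (the data and model name are placeholders, real LLM calls are made and API keys must be set in the environment; the supported public entry point presumably lives in validate.py):

import polars as pl
from pointblank._interrogation import interrogate_prompt  # private module, per this diff

tbl = pl.DataFrame({"name": ["Alice", "x9!!"], "email": ["a@b.co", "not-an-email"]})

ai_config = {
    "prompt": "Each row should have a plausible person name and a valid email address.",
    "llm_provider": "openai",    # API key resolved from environment variables
    "llm_model": "gpt-4o-mini",  # placeholder model name
    "batch_size": 1000,          # optional; default shown above
    "max_concurrent": 3,         # optional; default shown above
}

result = interrogate_prompt(tbl, columns_subset=["name", "email"], ai_config=ai_config)
# Whether validation succeeds or falls back on error, `result` carries a boolean
# `pb_is_good_` column marking the rows the LLM judged valid (False on failure).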
pointblank/_utils.py CHANGED
@@ -102,7 +102,7 @@ def _get_tbl_type(data: FrameT | Any) -> str:
         if "read_parquet" in tbl_name:
             return "parquet"
 
-    else:
+    else:  # pragma: no cover
         return "duckdb"
 
     return backend
@@ -274,10 +274,10 @@ def _copy_dataframe(df):
         import copy
 
         return copy.deepcopy(df)
-    except Exception:
+    except Exception:  # pragma: no cover
         # If all else fails, return the original DataFrame
         # This is better than crashing the validation
-        return df
+        return df  # pragma: no cover
 
 
 def _convert_to_narwhals(df: FrameT) -> nw.DataFrame:
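
For context, `_copy_dataframe` is a best-effort helper and the newly excluded lines are its last-resort branch. A rough sketch of the overall shape (the native clone/copy attempts before `deepcopy` are assumed, since this hunk only shows the tail):

import copy

def copy_dataframe_sketch(df):
    try:
        if hasattr(df, "clone"):   # e.g. Polars
            return df.clone()
        if hasattr(df, "copy"):    # e.g. Pandas
            return df.copy()
        return copy.deepcopy(df)   # generic fallback
    except Exception:
        # If all else fails, return the original DataFrame:
        # better than crashing the validation run
        return df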