pointblank 0.13.3__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +54 -0
  3. pointblank/_constants_translations.py +541 -2
  4. pointblank/_interrogation.py +198 -12
  5. pointblank/_utils.py +41 -1
  6. pointblank/_utils_ai.py +850 -0
  7. pointblank/cli.py +128 -115
  8. pointblank/column.py +1 -1
  9. pointblank/data/api-docs.txt +198 -13
  10. pointblank/data/validations/README.md +108 -0
  11. pointblank/data/validations/complex_preprocessing.json +54 -0
  12. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  13. pointblank/data/validations/generate_test_files.py +127 -0
  14. pointblank/data/validations/multiple_steps.json +83 -0
  15. pointblank/data/validations/multiple_steps.pkl +0 -0
  16. pointblank/data/validations/narwhals_function.json +28 -0
  17. pointblank/data/validations/narwhals_function.pkl +0 -0
  18. pointblank/data/validations/no_preprocessing.json +83 -0
  19. pointblank/data/validations/no_preprocessing.pkl +0 -0
  20. pointblank/data/validations/pandas_compatible.json +28 -0
  21. pointblank/data/validations/pandas_compatible.pkl +0 -0
  22. pointblank/data/validations/preprocessing_functions.py +46 -0
  23. pointblank/data/validations/simple_preprocessing.json +57 -0
  24. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  25. pointblank/datascan.py +4 -4
  26. pointblank/scan_profile.py +6 -6
  27. pointblank/schema.py +8 -82
  28. pointblank/thresholds.py +1 -1
  29. pointblank/validate.py +1412 -20
  30. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
  31. pointblank-0.14.0.dist-info/RECORD +55 -0
  32. pointblank/_constants_docs.py +0 -40
  33. pointblank-0.13.3.dist-info/RECORD +0 -40
  34. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
  35. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
  36. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
  37. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py CHANGED
@@ -119,8 +119,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         # The namespace is the actual module, so we check its name
         if hasattr(native_namespace, "__name__") and "ibis" in native_namespace.__name__:
             return null_check
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover
 
     # For non-Ibis backends, try to use `is_nan()` if the column type supports it
     try:
@@ -128,8 +128,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
             schema = data_frame.collect_schema()
         elif hasattr(data_frame, "schema"):
             schema = data_frame.schema
-        else:
-            schema = None
+        else:  # pragma: no cover
+            schema = None  # pragma: no cover
 
         if schema and column_name:
             column_dtype = schema.get(column_name)
@@ -148,8 +148,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         except Exception:
             # If `is_nan()` fails for any reason, fall back to Null only
             pass
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover
 
     # Fallback: just check Null values
     return null_check
@@ -333,7 +333,7 @@ class ConjointlyValidation:
                     ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
                     ibis_expressions.append(ibis_expr)
                 except Exception:  # pragma: no cover
-                    # Silent failure - we already tried both strategies
+                    # Silent failure where we already tried both strategies
                     pass
 
         # Combine expressions
@@ -370,7 +370,7 @@ class ConjointlyValidation:
                 else:
                     raise TypeError(
                         f"Expression returned {type(expr_result)}, expected PySpark Column"
-                    )
+                    )  # pragma: no cover
 
             except Exception as e:
                 try:
@@ -382,7 +382,9 @@ class ConjointlyValidation:
                         pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
                         pyspark_columns.append(pyspark_expr)
                     else:
-                        raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
+                        raise TypeError(
+                            f"Cannot convert {type(col_expr)} to PySpark Column"
+                        )  # pragma: no cover
                 except Exception as nested_e:
                     print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
@@ -435,7 +437,7 @@ class SpeciallyValidation:
             data_tbl = self.data_tbl
             result = expression(data_tbl)
         else:
-            # More than one parameter - this doesn't match either allowed signature
+            # More than one parameter: this doesn't match either allowed signature
             raise ValueError(
                 f"The function provided to 'specially()' should have either no parameters or a "
                 f"single 'data' parameter, but it has {len(params)} parameters: {params}"
@@ -656,7 +658,7 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
         return data_tbl.assign(pb_is_good_=expr)
 
     # For remote backends, return original table (placeholder)
-    return data_tbl
+    return data_tbl  # pragma: no cover
 
 
 def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
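
The `# pragma: no cover` above marks the remote-backend placeholder path. For the local path that `col_vals_expr()` does exercise, a minimal usage sketch (assuming Polars; the data is made up, not from the diff):

    import polars as pl
    import pointblank as pb

    tbl = pl.DataFrame({"a": [1, 2, -3]})

    validation = (
        pb.Validate(data=tbl)
        .col_vals_expr(expr=pl.col("a") > 0)  # rows with a <= 0 fail the step
        .interrogate()
    )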
@@ -1688,15 +1690,30 @@ def interrogate_notin(tbl: FrameT, column: str, set_values: any) -> FrameT:
     return result_tbl.to_native()
 
 
-def interrogate_regex(tbl: FrameT, column: str, pattern: str, na_pass: bool) -> FrameT:
+def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: bool) -> FrameT:
     """Regex interrogation."""
 
+    # Handle both old and new formats for backward compatibility
+    if isinstance(values, str):
+        pattern = values
+        inverse = False
+    else:
+        pattern = values["pattern"]
+        inverse = values["inverse"]
+
     nw_tbl = nw.from_native(tbl)
     result_tbl = nw_tbl.with_columns(
         pb_is_good_1=nw.col(column).is_null() & na_pass,
         pb_is_good_2=nw.col(column).str.contains(pattern, literal=False).fill_null(False),
     )
 
+    # Apply inverse logic if needed
+    if inverse:
+        # Use explicit boolean logic instead of bitwise NOT for pandas compatibility
+        result_tbl = result_tbl.with_columns(
+            pb_is_good_2=nw.when(nw.col("pb_is_good_2")).then(False).otherwise(True)
+        )
+
     result_tbl = result_tbl.with_columns(
         pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
     ).drop("pb_is_good_1", "pb_is_good_2")
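
The new `values` argument carries the regex pattern plus an `inverse` flag, and the inversion deliberately uses `when/then/otherwise` rather than `~`, which can misbehave on pandas object-dtype boolean columns. A standalone sketch of that inversion pattern (assuming narwhals and pandas are installed; the data is made up):

    import narwhals as nw
    import pandas as pd

    df = pd.DataFrame({"v": ["abc", "123", None]})

    nw_tbl = nw.from_native(df)
    checked = nw_tbl.with_columns(
        pb_is_good_2=nw.col("v").str.contains(r"^\d+$", literal=False).fill_null(False)
    )

    # Explicit boolean inversion, mirroring the diff's approach
    inverted = checked.with_columns(
        pb_is_good_2=nw.when(nw.col("pb_is_good_2")).then(False).otherwise(True)
    )
    print(inverted.to_native())  # "abc" and None now pass; "123" fails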
@@ -1847,3 +1864,172 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
     result_tbl = result_tbl.drop("_any_is_null_")
 
     return result_tbl.to_native()
+
+
+def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config: dict) -> FrameT:
+    """AI-powered interrogation of rows."""
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    try:
+        # Import AI validation modules
+        from pointblank._utils_ai import (
+            _AIValidationEngine,
+            _BatchConfig,
+            _DataBatcher,
+            _LLMConfig,
+            _PromptBuilder,
+            _ValidationResponseParser,
+        )
+
+        # Extract AI configuration
+        prompt = ai_config["prompt"]
+        llm_provider = ai_config["llm_provider"]
+        llm_model = ai_config["llm_model"]
+        batch_size = ai_config.get("batch_size", 1000)
+        max_concurrent = ai_config.get("max_concurrent", 3)
+
+        # Set up LLM configuration (api_key will be loaded from environment)
+        llm_config = _LLMConfig(
+            provider=llm_provider,
+            model=llm_model,
+            api_key=None,  # Will be loaded from environment variables
+        )
+
+        # Set up batch configuration
+        batch_config = _BatchConfig(size=batch_size, max_concurrent=max_concurrent)
+
+        # Create optimized data batcher
+        batcher = _DataBatcher(data=tbl, columns=columns_subset, config=batch_config)
+
+        # Create batches with signature mapping for optimization
+        batches, signature_mapping = batcher.create_batches()
+        logger.info(f"Created {len(batches)} batches for AI validation")
+
+        # Log optimization stats
+        if hasattr(batcher, "get_reduction_stats"):
+            stats = batcher.get_reduction_stats()
+            if stats.get("reduction_percentage", 0) > 0:
+                logger.info(
+                    f"Optimization: {stats['original_rows']} → {stats['unique_rows']} rows ({stats['reduction_percentage']:.1f}% reduction)"
+                )
+
+        # Create prompt builder
+        prompt_builder = _PromptBuilder(prompt)
+
+        # Create AI validation engine
+        engine = _AIValidationEngine(llm_config)
+
+        # Run AI validation synchronously (chatlas is synchronous)
+        batch_results = engine.validate_batches(
+            batches=batches, prompt_builder=prompt_builder, max_concurrent=max_concurrent
+        )
+
+        # Parse and combine results with signature mapping optimization
+        parser = _ValidationResponseParser(total_rows=len(tbl))
+        combined_results = parser.combine_batch_results(batch_results, signature_mapping)
+
+        # Debug: Log table info and combined results
+        logger.debug("🏁 Final result conversion:")
+        logger.debug(f"  - Table length: {len(tbl)}")
+        logger.debug(
+            f"  - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
+        )
+
+        # Convert results to narwhals format
+        nw_tbl = nw.from_native(tbl)
+
+        # Create a boolean column for validation results
+        validation_results = []
+        for i in range(len(tbl)):
+            # Default to False if row wasn't processed
+            result = combined_results.get(i, False)
+            validation_results.append(result)
+
+            # Debug: Log first few conversions
+            if i < 5 or len(tbl) - i <= 2:
+                logger.debug(f"  Row {i}: {result} (from combined_results.get({i}, False))")
+
+        logger.debug(f"  - Final validation_results length: {len(validation_results)}")
+        logger.debug(f"  - Final passed count: {sum(validation_results)}")
+        logger.debug(
+            f"  - Final failed count: {len(validation_results) - sum(validation_results)}"
+        )
+
+        # Add the pb_is_good_ column by creating a proper boolean Series
+        # First convert to native to work with the underlying data frame
+        native_tbl = nw_tbl.to_native()
+
+        # Create the result table with the boolean column
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Generic fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        logger.info(
+            f"AI validation completed. {sum(validation_results)} rows passed out of {len(validation_results)}"
+        )
+
+        return result_tbl
+
+    except ImportError as e:
+        logger.error(f"Missing dependencies for AI validation: {e}")
+        logger.error("Install required packages: pip install openai anthropic aiohttp")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
+
+    except Exception as e:
+        logger.error(f"AI validation failed: {e}")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
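
The Polars/pandas/fallback branching above is repeated verbatim in both error handlers. Distilled into one helper for clarity (a hypothetical `_attach_bool_column`, not a function this diff adds):

    from typing import Any

    def _attach_bool_column(native_tbl: Any, name: str, values: list[bool]) -> Any:
        # Polars exposes with_columns(); checked first since pandas lacks it
        if hasattr(native_tbl, "with_columns"):
            import polars as pl
            return native_tbl.with_columns(**{name: pl.Series(values)})
        # pandas exposes assign()
        if hasattr(native_tbl, "assign"):
            import pandas as pd
            return native_tbl.assign(**{name: pd.Series(values, dtype=bool)})
        # Generic fallback: set the column on a copy when possible
        out = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
        out[name] = values
        return out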
pointblank/_utils.py CHANGED
@@ -102,7 +102,7 @@ def _get_tbl_type(data: FrameT | Any) -> str:
         if "read_parquet" in tbl_name:
             return "parquet"
 
-        else:
+        else:  # pragma: no cover
             return "duckdb"
 
     return backend
@@ -240,6 +240,46 @@ def _select_df_lib(preference: str = "polars") -> Any:
     return pl if pl is not None else pd
 
 
+def _copy_dataframe(df):
+    """
+    Create a copy of a DataFrame, handling different DataFrame types.
+
+    This function attempts to create a proper copy of the DataFrame using
+    the most appropriate method for each DataFrame type.
+    """
+    # Try standard copy methods first
+    if hasattr(df, "copy") and callable(getattr(df, "copy")):
+        try:
+            return df.copy()
+        except Exception:
+            pass
+
+    if hasattr(df, "clone") and callable(getattr(df, "clone")):
+        try:
+            return df.clone()
+        except Exception:
+            pass
+
+    # Try the select('*') approach for DataFrames that support it
+    # This works well for PySpark and other SQL-like DataFrames
+    if hasattr(df, "select") and callable(getattr(df, "select")):
+        try:
+            return df.select("*")
+        except Exception:
+            pass
+
+    # For DataFrames that can't be copied, return original
+    # This provides some protection while avoiding crashes
+    try:
+        import copy
+
+        return copy.deepcopy(df)
+    except Exception:  # pragma: no cover
+        # If all else fails, return the original DataFrame
+        # This is better than crashing the validation
+        return df  # pragma: no cover
+
+
 def _convert_to_narwhals(df: FrameT) -> nw.DataFrame:
     # Convert the DataFrame to a format that narwhals can work with
     return nw.from_native(df)
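
A quick check of the fallback chain in the new `_copy_dataframe()` (assuming pandas and Polars are installed; `_copy_dataframe` is private, so this import is for illustration only):

    import pandas as pd
    import polars as pl

    from pointblank._utils import _copy_dataframe

    pd_df = pd.DataFrame({"x": [1, 2, 3]})
    pl_df = pl.DataFrame({"x": [1, 2, 3]})

    pd_copy = _copy_dataframe(pd_df)  # satisfied by DataFrame.copy()
    pl_copy = _copy_dataframe(pl_df)  # handled by .copy()/.clone(), whichever exists first

    pd_copy.loc[0, "x"] = 99
    assert pd_df.loc[0, "x"] == 1  # the original is untouched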