pointblank 0.13.3__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +54 -0
- pointblank/_constants_translations.py +541 -2
- pointblank/_interrogation.py +198 -12
- pointblank/_utils.py +41 -1
- pointblank/_utils_ai.py +850 -0
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +198 -13
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +1412 -20
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
- pointblank-0.14.0.dist-info/RECORD +55 -0
- pointblank/_constants_docs.py +0 -40
- pointblank-0.13.3.dist-info/RECORD +0 -40
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
@@ -119,8 +119,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
             # The namespace is the actual module, so we check its name
             if hasattr(native_namespace, "__name__") and "ibis" in native_namespace.__name__:
                 return null_check
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover

     # For non-Ibis backends, try to use `is_nan()` if the column type supports it
     try:
@@ -128,8 +128,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
             schema = data_frame.collect_schema()
         elif hasattr(data_frame, "schema"):
             schema = data_frame.schema
-        else:
-            schema = None
+        else:  # pragma: no cover
+            schema = None  # pragma: no cover

         if schema and column_name:
             column_dtype = schema.get(column_name)
@@ -148,8 +148,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         except Exception:
             # If `is_nan()` fails for any reason, fall back to Null only
            pass
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover

    # Fallback: just check Null values
    return null_check
@@ -333,7 +333,7 @@ class ConjointlyValidation:
                     ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
                     ibis_expressions.append(ibis_expr)
                 except Exception:  # pragma: no cover
-                    # Silent failure
+                    # Silent failure where we already tried both strategies
                     pass

         # Combine expressions
@@ -370,7 +370,7 @@ class ConjointlyValidation:
                 else:
                     raise TypeError(
                         f"Expression returned {type(expr_result)}, expected PySpark Column"
-                    )
+                    )  # pragma: no cover

         except Exception as e:
             try:
@@ -382,7 +382,9 @@ class ConjointlyValidation:
                     pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
                     pyspark_columns.append(pyspark_expr)
                 else:
-                    raise TypeError(
+                    raise TypeError(
+                        f"Cannot convert {type(col_expr)} to PySpark Column"
+                    )  # pragma: no cover
             except Exception as nested_e:
                 print(f"Error evaluating PySpark expression: {e} -> {nested_e}")

@@ -435,7 +437,7 @@ class SpeciallyValidation:
                 data_tbl = self.data_tbl
                 result = expression(data_tbl)
             else:
-                # More than one parameter
+                # More than one parameter: this doesn't match either allowed signature
                 raise ValueError(
                     f"The function provided to 'specially()' should have either no parameters or a "
                     f"single 'data' parameter, but it has {len(params)} parameters: {params}"
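The comment change above spells out the rule enforced here: a function handed to `specially()` may take either zero parameters or exactly one `data` parameter. A minimal sketch of the two accepted shapes (frame and function names are illustrative, not from the package; exact return-value expectations are covered by the `specially()` docs):

    import polars as pl

    tbl = pl.DataFrame({"a": [2, 4, 6]})

    # One `data` parameter: the validation table is passed in
    def all_even(data):
        return bool((data["a"] % 2 == 0).all())

    # Zero parameters: the function closes over a table it already knows about
    def all_even_closure():
        return bool((tbl["a"] % 2 == 0).all())

    # Two or more parameters would trigger the ValueError in this hunk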
@@ -656,7 +658,7 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
         return data_tbl.assign(pb_is_good_=expr)

     # For remote backends, return original table (placeholder)
-    return data_tbl
+    return data_tbl  # pragma: no cover


 def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
@@ -1688,15 +1690,30 @@ def interrogate_notin(tbl: FrameT, column: str, set_values: any) -> FrameT:
     return result_tbl.to_native()


-def interrogate_regex(tbl: FrameT, column: str,
+def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: bool) -> FrameT:
     """Regex interrogation."""

+    # Handle both old and new formats for backward compatibility
+    if isinstance(values, str):
+        pattern = values
+        inverse = False
+    else:
+        pattern = values["pattern"]
+        inverse = values["inverse"]
+
     nw_tbl = nw.from_native(tbl)
     result_tbl = nw_tbl.with_columns(
         pb_is_good_1=nw.col(column).is_null() & na_pass,
         pb_is_good_2=nw.col(column).str.contains(pattern, literal=False).fill_null(False),
     )

+    # Apply inverse logic if needed
+    if inverse:
+        # Use explicit boolean logic instead of bitwise NOT for pandas compatibility
+        result_tbl = result_tbl.with_columns(
+            pb_is_good_2=nw.when(nw.col("pb_is_good_2")).then(False).otherwise(True)
+        )
+
     result_tbl = result_tbl.with_columns(
         pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
     ).drop("pb_is_good_1", "pb_is_good_2")
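Two things change in `interrogate_regex()`: the `values` payload may now be a dict carrying a `pattern` plus an `inverse` flag (a bare pattern string still works for backward compatibility), and the inverse branch negates matches with `when/then/otherwise` rather than `~`, since bitwise NOT can misbehave on pandas boolean columns. A standalone sketch of that negation pattern (frame and column names are illustrative):

    import narwhals as nw
    import pandas as pd

    df = nw.from_native(pd.DataFrame({"matched": [True, False, True]}))

    # Explicit boolean inversion, as in the hunk above, instead of `~nw.col("matched")`
    inverted = df.with_columns(
        matched=nw.when(nw.col("matched")).then(False).otherwise(True)
    )
    print(inverted.to_native())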
@@ -1847,3 +1864,172 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
     result_tbl = result_tbl.drop("_any_is_null_")

     return result_tbl.to_native()
+
+
+def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config: dict) -> FrameT:
+    """AI-powered interrogation of rows."""
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    try:
+        # Import AI validation modules
+        from pointblank._utils_ai import (
+            _AIValidationEngine,
+            _BatchConfig,
+            _DataBatcher,
+            _LLMConfig,
+            _PromptBuilder,
+            _ValidationResponseParser,
+        )
+
+        # Extract AI configuration
+        prompt = ai_config["prompt"]
+        llm_provider = ai_config["llm_provider"]
+        llm_model = ai_config["llm_model"]
+        batch_size = ai_config.get("batch_size", 1000)
+        max_concurrent = ai_config.get("max_concurrent", 3)
+
+        # Set up LLM configuration (api_key will be loaded from environment)
+        llm_config = _LLMConfig(
+            provider=llm_provider,
+            model=llm_model,
+            api_key=None,  # Will be loaded from environment variables
+        )
+
+        # Set up batch configuration
+        batch_config = _BatchConfig(size=batch_size, max_concurrent=max_concurrent)
+
+        # Create optimized data batcher
+        batcher = _DataBatcher(data=tbl, columns=columns_subset, config=batch_config)
+
+        # Create batches with signature mapping for optimization
+        batches, signature_mapping = batcher.create_batches()
+        logger.info(f"Created {len(batches)} batches for AI validation")
+
+        # Log optimization stats
+        if hasattr(batcher, "get_reduction_stats"):
+            stats = batcher.get_reduction_stats()
+            if stats.get("reduction_percentage", 0) > 0:
+                logger.info(
+                    f"Optimization: {stats['original_rows']} → {stats['unique_rows']} rows ({stats['reduction_percentage']:.1f}% reduction)"
+                )
+
+        # Create prompt builder
+        prompt_builder = _PromptBuilder(prompt)
+
+        # Create AI validation engine
+        engine = _AIValidationEngine(llm_config)
+
+        # Run AI validation synchronously (chatlas is synchronous)
+        batch_results = engine.validate_batches(
+            batches=batches, prompt_builder=prompt_builder, max_concurrent=max_concurrent
+        )
+
+        # Parse and combine results with signature mapping optimization
+        parser = _ValidationResponseParser(total_rows=len(tbl))
+        combined_results = parser.combine_batch_results(batch_results, signature_mapping)
+
+        # Debug: Log table info and combined results
+        logger.debug("🏁 Final result conversion:")
+        logger.debug(f" - Table length: {len(tbl)}")
+        logger.debug(
+            f" - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
+        )
+
+        # Convert results to narwhals format
+        nw_tbl = nw.from_native(tbl)
+
+        # Create a boolean column for validation results
+        validation_results = []
+        for i in range(len(tbl)):
+            # Default to False if row wasn't processed
+            result = combined_results.get(i, False)
+            validation_results.append(result)
+
+            # Debug: Log first few conversions
+            if i < 5 or len(tbl) - i <= 2:
+                logger.debug(f" Row {i}: {result} (from combined_results.get({i}, False))")
+
+        logger.debug(f" - Final validation_results length: {len(validation_results)}")
+        logger.debug(f" - Final passed count: {sum(validation_results)}")
+        logger.debug(
+            f" - Final failed count: {len(validation_results) - sum(validation_results)}"
+        )
+
+        # Add the pb_is_good_ column by creating a proper boolean Series
+        # First convert to native to work with the underlying data frame
+        native_tbl = nw_tbl.to_native()
+
+        # Create the result table with the boolean column
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Generic fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        logger.info(
+            f"AI validation completed. {sum(validation_results)} rows passed out of {len(validation_results)}"
+        )
+
+        return result_tbl
+
+    except ImportError as e:
+        logger.error(f"Missing dependencies for AI validation: {e}")
+        logger.error("Install required packages: pip install openai anthropic aiohttp")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
+
+    except Exception as e:
+        logger.error(f"AI validation failed: {e}")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
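`interrogate_prompt()` drives the new `_utils_ai` machinery added in this release: rows are deduplicated and batched, each batch is sent to the configured LLM, and responses are mapped back to a boolean `pb_is_good_` column, defaulting to False for unprocessed rows or on any failure. Per the keys accessed above, the `ai_config` dict it consumes has this shape (provider and model values are placeholders, not defaults from the package; the batch defaults are from the code):

    ai_config = {
        "prompt": "Flag rows whose 'email' value is not a plausible address.",
        "llm_provider": "anthropic",        # forwarded to _LLMConfig(provider=...)
        "llm_model": "claude-3-5-sonnet",   # forwarded to _LLMConfig(model=...)
        "batch_size": 1000,                 # optional; defaults to 1000
        "max_concurrent": 3,                # optional; defaults to 3
    }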
pointblank/_utils.py
CHANGED
@@ -102,7 +102,7 @@ def _get_tbl_type(data: FrameT | Any) -> str:
         if "read_parquet" in tbl_name:
             return "parquet"

-        else:
+        else:  # pragma: no cover
             return "duckdb"

     return backend
@@ -240,6 +240,46 @@ def _select_df_lib(preference: str = "polars") -> Any:
     return pl if pl is not None else pd


+def _copy_dataframe(df):
+    """
+    Create a copy of a DataFrame, handling different DataFrame types.
+
+    This function attempts to create a proper copy of the DataFrame using
+    the most appropriate method for each DataFrame type.
+    """
+    # Try standard copy methods first
+    if hasattr(df, "copy") and callable(getattr(df, "copy")):
+        try:
+            return df.copy()
+        except Exception:
+            pass
+
+    if hasattr(df, "clone") and callable(getattr(df, "clone")):
+        try:
+            return df.clone()
+        except Exception:
+            pass
+
+    # Try the select('*') approach for DataFrames that support it
+    # This works well for PySpark and other SQL-like DataFrames
+    if hasattr(df, "select") and callable(getattr(df, "select")):
+        try:
+            return df.select("*")
+        except Exception:
+            pass
+
+    # For DataFrames that can't be copied, return original
+    # This provides some protection while avoiding crashes
+    try:
+        import copy
+
+        return copy.deepcopy(df)
+    except Exception:  # pragma: no cover
+        # If all else fails, return the original DataFrame
+        # This is better than crashing the validation
+        return df  # pragma: no cover
+
+
 def _convert_to_narwhals(df: FrameT) -> nw.DataFrame:
     # Convert the DataFrame to a format that narwhals can work with
     return nw.from_native(df)
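`_copy_dataframe()` tries progressively more generic strategies: `.copy()` (pandas-style), `.clone()` (Polars), `select("*")` (PySpark and other SQL-like frames), then `copy.deepcopy()`, returning the original frame only if everything fails. A quick sketch of which branch fires for the two common local backends (assuming both libraries are installed):

    import pandas as pd
    import polars as pl

    from pointblank._utils import _copy_dataframe  # added in 0.14.0

    pd_copy = _copy_dataframe(pd.DataFrame({"a": [1, 2]}))  # served by df.copy()
    pl_copy = _copy_dataframe(pl.DataFrame({"a": [1, 2]}))  # no .copy() method, so df.clone()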