pointblank 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +192 -5
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +202 -149
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +40 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2695 -49
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +780 -231
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +10 -6
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/METADATA +2 -2
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/RECORD +29 -27
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/WHEEL +0 -0
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.17.0.dist-info → pointblank-0.18.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
|
@@ -3,11 +3,15 @@ from __future__ import annotations
|
|
|
3
3
|
import functools
|
|
4
4
|
from collections.abc import Callable
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
7
|
|
|
8
8
|
import narwhals as nw
|
|
9
|
-
from narwhals.dependencies import
|
|
10
|
-
|
|
9
|
+
from narwhals.dependencies import (
|
|
10
|
+
is_narwhals_dataframe,
|
|
11
|
+
is_narwhals_lazyframe,
|
|
12
|
+
is_pandas_dataframe,
|
|
13
|
+
is_polars_dataframe,
|
|
14
|
+
)
|
|
11
15
|
|
|
12
16
|
from pointblank._constants import IBIS_BACKENDS
|
|
13
17
|
from pointblank._spec_utils import (
|
|
@@ -25,6 +29,9 @@ from pointblank._utils import (
|
|
|
25
29
|
)
|
|
26
30
|
from pointblank.column import Column
|
|
27
31
|
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from narwhals.typing import IntoFrame
|
|
34
|
+
|
|
28
35
|
|
|
29
36
|
def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
|
|
30
37
|
"""
|
|
@@ -94,7 +101,9 @@ def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val:
|
|
|
94
101
|
return compare_val
|
|
95
102
|
|
|
96
103
|
|
|
97
|
-
def _safe_is_nan_or_null_expr(
|
|
104
|
+
def _safe_is_nan_or_null_expr(
|
|
105
|
+
data_frame: Any, column_expr: Any, column_name: str | None = None
|
|
106
|
+
) -> Any:
|
|
98
107
|
"""
|
|
99
108
|
Create an expression that safely checks for both Null and NaN values.
|
|
100
109
|
|
|
@@ -425,7 +434,7 @@ class SpeciallyValidation:
|
|
|
425
434
|
else:
|
|
426
435
|
self.tbl_type = tbl_type
|
|
427
436
|
|
|
428
|
-
def get_test_results(self) ->
|
|
437
|
+
def get_test_results(self) -> Any | list[bool]:
|
|
429
438
|
"""Evaluate the expression get either a list of booleans or a results table."""
|
|
430
439
|
|
|
431
440
|
# Get the expression and inspect whether there is a `data` argument
|
|
@@ -519,7 +528,7 @@ class NumberOfTestUnits:
|
|
|
519
528
|
Count the number of test units in a column.
|
|
520
529
|
"""
|
|
521
530
|
|
|
522
|
-
df:
|
|
531
|
+
df: Any # Can be IntoFrame or Ibis table
|
|
523
532
|
column: str
|
|
524
533
|
|
|
525
534
|
def get_test_units(self, tbl_type: str) -> int:
|
|
@@ -536,15 +545,18 @@ class NumberOfTestUnits:
|
|
|
536
545
|
)
|
|
537
546
|
|
|
538
547
|
# Handle LazyFrames which don't have len()
|
|
539
|
-
if
|
|
548
|
+
if is_narwhals_lazyframe(dfn):
|
|
540
549
|
dfn = dfn.collect()
|
|
541
550
|
|
|
551
|
+
assert is_narwhals_dataframe(dfn)
|
|
542
552
|
return len(dfn)
|
|
543
553
|
|
|
544
554
|
if tbl_type in IBIS_BACKENDS:
|
|
545
555
|
# Get the count of test units and convert to a native format
|
|
546
556
|
# TODO: check whether pandas or polars is available
|
|
547
|
-
return self.df.count().to_polars()
|
|
557
|
+
return self.df.count().to_polars() # type: ignore[union-attr]
|
|
558
|
+
|
|
559
|
+
raise ValueError(f"Unsupported table type: {tbl_type}")
|
|
548
560
|
|
|
549
561
|
|
|
550
562
|
def _get_compare_expr_nw(compare: Any) -> Any:
|
|
@@ -555,28 +567,25 @@ def _get_compare_expr_nw(compare: Any) -> Any:
|
|
|
555
567
|
return compare
|
|
556
568
|
|
|
557
569
|
|
|
558
|
-
def _column_has_null_values(table:
|
|
570
|
+
def _column_has_null_values(table: nw.DataFrame[Any] | nw.LazyFrame[Any], column: str) -> bool:
|
|
559
571
|
try:
|
|
560
|
-
# Try the standard null_count() method
|
|
561
|
-
null_count = (table.select(column).null_count())[column][0]
|
|
572
|
+
# Try the standard null_count() method (DataFrame)
|
|
573
|
+
null_count = (table.select(column).null_count())[column][0] # type: ignore[union-attr]
|
|
562
574
|
except AttributeError:
|
|
563
575
|
# For LazyFrames, collect first then get null count
|
|
564
576
|
try:
|
|
565
|
-
collected = table.select(column).collect()
|
|
577
|
+
collected = table.select(column).collect() # type: ignore[union-attr]
|
|
566
578
|
null_count = (collected.null_count())[column][0]
|
|
567
579
|
except Exception:
|
|
568
580
|
# Fallback: check if any values are null
|
|
569
581
|
try:
|
|
570
|
-
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
|
|
582
|
+
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect() # type: ignore[union-attr]
|
|
571
583
|
null_count = result["null_count"][0]
|
|
572
584
|
except Exception:
|
|
573
585
|
# Last resort: return False (assume no nulls)
|
|
574
586
|
return False
|
|
575
587
|
|
|
576
|
-
|
|
577
|
-
return False
|
|
578
|
-
|
|
579
|
-
return True
|
|
588
|
+
return null_count is not None and null_count > 0
|
|
580
589
|
|
|
581
590
|
|
|
582
591
|
def _check_nulls_across_columns_nw(table, columns_subset):
|
|
@@ -596,7 +605,7 @@ def _check_nulls_across_columns_nw(table, columns_subset):
|
|
|
596
605
|
return result
|
|
597
606
|
|
|
598
607
|
|
|
599
|
-
def _modify_datetime_compare_val(tgt_column:
|
|
608
|
+
def _modify_datetime_compare_val(tgt_column: Any, compare_val: Any) -> Any:
|
|
600
609
|
tgt_col_dtype_str = str(tgt_column.dtype).lower()
|
|
601
610
|
|
|
602
611
|
if compare_val is isinstance(compare_val, Column): # pragma: no cover
|
|
@@ -640,7 +649,7 @@ def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any:
|
|
|
640
649
|
return compare_expr
|
|
641
650
|
|
|
642
651
|
|
|
643
|
-
def col_vals_expr(data_tbl:
|
|
652
|
+
def col_vals_expr(data_tbl: Any, expr: Any, tbl_type: str = "local") -> Any:
|
|
644
653
|
"""Check if values in a column evaluate to True for a given predicate expression."""
|
|
645
654
|
if tbl_type == "local":
|
|
646
655
|
# Check the type of expression provided
|
|
@@ -670,21 +679,19 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
|
|
|
670
679
|
return data_tbl # pragma: no cover
|
|
671
680
|
|
|
672
681
|
|
|
673
|
-
def rows_complete(data_tbl:
|
|
682
|
+
def rows_complete(data_tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
674
683
|
"""
|
|
675
684
|
Check if rows in a DataFrame are complete (no null values).
|
|
676
685
|
|
|
677
686
|
This function replaces the RowsComplete dataclass for direct usage.
|
|
678
687
|
"""
|
|
679
|
-
tbl = _convert_to_narwhals(df=data_tbl)
|
|
680
|
-
|
|
681
688
|
return interrogate_rows_complete(
|
|
682
|
-
tbl=
|
|
689
|
+
tbl=data_tbl,
|
|
683
690
|
columns_subset=columns_subset,
|
|
684
691
|
)
|
|
685
692
|
|
|
686
693
|
|
|
687
|
-
def col_exists(data_tbl:
|
|
694
|
+
def col_exists(data_tbl: IntoFrame, column: str) -> bool:
|
|
688
695
|
"""
|
|
689
696
|
Check if a column exists in a DataFrame.
|
|
690
697
|
|
|
@@ -705,8 +712,8 @@ def col_exists(data_tbl: FrameT, column: str) -> bool:
|
|
|
705
712
|
|
|
706
713
|
|
|
707
714
|
def col_schema_match(
|
|
708
|
-
data_tbl:
|
|
709
|
-
schema,
|
|
715
|
+
data_tbl: IntoFrame,
|
|
716
|
+
schema: Any,
|
|
710
717
|
complete: bool,
|
|
711
718
|
in_order: bool,
|
|
712
719
|
case_sensitive_colnames: bool,
|
|
@@ -730,7 +737,9 @@ def col_schema_match(
|
|
|
730
737
|
)
|
|
731
738
|
|
|
732
739
|
|
|
733
|
-
def row_count_match(
|
|
740
|
+
def row_count_match(
|
|
741
|
+
data_tbl: IntoFrame, count: Any, inverse: bool, abs_tol_bounds: AbsoluteBounds
|
|
742
|
+
) -> bool:
|
|
734
743
|
"""
|
|
735
744
|
Check if DataFrame row count matches expected count.
|
|
736
745
|
"""
|
|
@@ -748,28 +757,33 @@ def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> b
|
|
|
748
757
|
|
|
749
758
|
|
|
750
759
|
def col_pct_null(
|
|
751
|
-
data_tbl:
|
|
760
|
+
data_tbl: IntoFrame, column: str, p: float, bound_finder: Callable[[int], AbsoluteBounds]
|
|
752
761
|
) -> bool:
|
|
753
762
|
"""Check if the percentage of null vales are within p given the absolute bounds."""
|
|
754
|
-
|
|
755
|
-
nw_tbl = nw.from_native(data_tbl)
|
|
756
|
-
|
|
763
|
+
nw_frame = nw.from_native(data_tbl)
|
|
757
764
|
# Handle LazyFrames by collecting them first
|
|
758
|
-
if
|
|
759
|
-
|
|
765
|
+
if is_narwhals_lazyframe(nw_frame):
|
|
766
|
+
nw_frame = nw_frame.collect()
|
|
767
|
+
|
|
768
|
+
assert is_narwhals_dataframe(nw_frame)
|
|
769
|
+
|
|
770
|
+
# We cast as int because it could come back as an arbitary type. For example if the backend
|
|
771
|
+
# is numpy-like, we might get a scalar from `item()`. `int()` expects a certain signature though
|
|
772
|
+
# and `object` does not satisfy so we have to go with the type ignore.
|
|
773
|
+
total_rows: object = nw_frame.select(nw.len()).item()
|
|
774
|
+
total_rows: int = int(total_rows) # type: ignore
|
|
760
775
|
|
|
761
|
-
# Get total rows using narwhals
|
|
762
|
-
total_rows: int = nw_tbl.select(nw.len()).item()
|
|
763
776
|
abs_target: float = round(total_rows * p)
|
|
764
777
|
lower_bound, upper_bound = bound_finder(abs_target)
|
|
765
778
|
|
|
766
|
-
# Count null values
|
|
767
|
-
n_null:
|
|
779
|
+
# Count null values (see above comment on typing shenanigans)
|
|
780
|
+
n_null: object = nw_frame.select(nw.col(column).is_null().sum()).item()
|
|
781
|
+
n_null: int = int(n_null) # type: ignore
|
|
768
782
|
|
|
769
783
|
return n_null >= (abs_target - lower_bound) and n_null <= (abs_target + upper_bound)
|
|
770
784
|
|
|
771
785
|
|
|
772
|
-
def col_count_match(data_tbl:
|
|
786
|
+
def col_count_match(data_tbl: IntoFrame, count: Any, inverse: bool) -> bool:
|
|
773
787
|
"""
|
|
774
788
|
Check if DataFrame column count matches expected count.
|
|
775
789
|
"""
|
|
@@ -781,7 +795,7 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
|
|
|
781
795
|
return get_column_count(data=data_tbl) != count
|
|
782
796
|
|
|
783
797
|
|
|
784
|
-
def _coerce_to_common_backend(data_tbl:
|
|
798
|
+
def _coerce_to_common_backend(data_tbl: Any, tbl_compare: Any) -> tuple[Any, Any]:
|
|
785
799
|
"""
|
|
786
800
|
Coerce two tables to the same backend if they differ.
|
|
787
801
|
|
|
@@ -798,7 +812,7 @@ def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[Fr
|
|
|
798
812
|
|
|
799
813
|
Returns
|
|
800
814
|
-------
|
|
801
|
-
tuple[
|
|
815
|
+
tuple[Any, Any]
|
|
802
816
|
Both tables, with tbl_compare potentially converted to data_tbl's backend.
|
|
803
817
|
"""
|
|
804
818
|
# Get backend types for both tables
|
|
@@ -884,7 +898,7 @@ def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[Fr
|
|
|
884
898
|
return data_tbl, tbl_compare
|
|
885
899
|
|
|
886
900
|
|
|
887
|
-
def tbl_match(data_tbl:
|
|
901
|
+
def tbl_match(data_tbl: IntoFrame, tbl_compare: IntoFrame) -> bool:
|
|
888
902
|
"""
|
|
889
903
|
Check if two tables match exactly in schema, row count, and data.
|
|
890
904
|
|
|
@@ -998,33 +1012,37 @@ def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
|
|
|
998
1012
|
|
|
999
1013
|
# Convert to native format for comparison
|
|
1000
1014
|
# We need to collect if lazy frames
|
|
1001
|
-
if
|
|
1015
|
+
if is_narwhals_lazyframe(col_data_1):
|
|
1002
1016
|
col_data_1 = col_data_1.collect()
|
|
1003
1017
|
|
|
1004
|
-
if
|
|
1018
|
+
if is_narwhals_lazyframe(col_data_2):
|
|
1005
1019
|
col_data_2 = col_data_2.collect()
|
|
1006
1020
|
|
|
1007
1021
|
# Convert to native and then to lists for comparison
|
|
1008
|
-
|
|
1009
|
-
|
|
1022
|
+
# Native frames could be Polars, Pandas, or Ibis - use Any for dynamic access
|
|
1023
|
+
col_1_native: Any = col_data_1.to_native()
|
|
1024
|
+
col_2_native: Any = col_data_2.to_native()
|
|
1010
1025
|
|
|
1011
1026
|
# Extract values as lists for comparison
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1027
|
+
# Note: We use hasattr for runtime detection but maintain Any typing
|
|
1028
|
+
values_1: list[Any]
|
|
1029
|
+
values_2: list[Any]
|
|
1030
|
+
if hasattr(col_1_native, "to_list"): # Polars DataFrame
|
|
1031
|
+
values_1 = col_1_native[col_name].to_list() # type: ignore[index]
|
|
1032
|
+
values_2 = col_2_native[col_name].to_list() # type: ignore[index]
|
|
1015
1033
|
|
|
1016
|
-
elif hasattr(col_1_native, "tolist"): # Pandas
|
|
1017
|
-
values_1 = col_1_native[col_name].tolist()
|
|
1018
|
-
values_2 = col_2_native[col_name].tolist()
|
|
1034
|
+
elif hasattr(col_1_native, "tolist"): # Pandas DataFrame
|
|
1035
|
+
values_1 = col_1_native[col_name].tolist() # type: ignore[index]
|
|
1036
|
+
values_2 = col_2_native[col_name].tolist() # type: ignore[index]
|
|
1019
1037
|
|
|
1020
1038
|
elif hasattr(col_1_native, "collect"): # Ibis
|
|
1021
|
-
values_1 = col_1_native[col_name].to_pandas().tolist()
|
|
1022
|
-
values_2 = col_2_native[col_name].to_pandas().tolist()
|
|
1039
|
+
values_1 = col_1_native[col_name].to_pandas().tolist() # type: ignore[index]
|
|
1040
|
+
values_2 = col_2_native[col_name].to_pandas().tolist() # type: ignore[index]
|
|
1023
1041
|
|
|
1024
1042
|
else:
|
|
1025
1043
|
# Fallback: try direct comparison
|
|
1026
|
-
values_1 = list(col_1_native[col_name])
|
|
1027
|
-
values_2 = list(col_2_native[col_name])
|
|
1044
|
+
values_1 = list(col_1_native[col_name]) # type: ignore[index]
|
|
1045
|
+
values_2 = list(col_2_native[col_name]) # type: ignore[index]
|
|
1028
1046
|
|
|
1029
1047
|
# Compare the two lists element by element, handling NaN/None
|
|
1030
1048
|
if len(values_1) != len(values_2):
|
|
@@ -1086,7 +1104,9 @@ def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
|
|
|
1086
1104
|
return True
|
|
1087
1105
|
|
|
1088
1106
|
|
|
1089
|
-
def conjointly_validation(
|
|
1107
|
+
def conjointly_validation(
|
|
1108
|
+
data_tbl: IntoFrame, expressions: Any, threshold: int, tbl_type: str = "local"
|
|
1109
|
+
) -> Any:
|
|
1090
1110
|
"""
|
|
1091
1111
|
Perform conjoint validation using multiple expressions.
|
|
1092
1112
|
"""
|
|
@@ -1101,30 +1121,32 @@ def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_typ
|
|
|
1101
1121
|
return conjointly_instance.get_test_results()
|
|
1102
1122
|
|
|
1103
1123
|
|
|
1104
|
-
|
|
1124
|
+
# TODO: we can certainly simplify this
|
|
1125
|
+
def interrogate_gt(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1105
1126
|
"""Greater than interrogation."""
|
|
1106
1127
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "gt")
|
|
1107
1128
|
|
|
1108
1129
|
|
|
1109
|
-
def interrogate_lt(tbl:
|
|
1130
|
+
def interrogate_lt(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1110
1131
|
"""Less than interrogation."""
|
|
1111
1132
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "lt")
|
|
1112
1133
|
|
|
1113
1134
|
|
|
1114
|
-
def interrogate_ge(tbl:
|
|
1135
|
+
def interrogate_ge(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1115
1136
|
"""Greater than or equal interrogation."""
|
|
1116
1137
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "ge")
|
|
1117
1138
|
|
|
1118
1139
|
|
|
1119
|
-
def interrogate_le(tbl:
|
|
1140
|
+
def interrogate_le(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1120
1141
|
"""Less than or equal interrogation."""
|
|
1121
1142
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "le")
|
|
1122
1143
|
|
|
1123
1144
|
|
|
1124
|
-
def interrogate_eq(tbl:
|
|
1145
|
+
def interrogate_eq(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1125
1146
|
"""Equal interrogation."""
|
|
1126
1147
|
|
|
1127
1148
|
nw_tbl = nw.from_native(tbl)
|
|
1149
|
+
assert is_narwhals_dataframe(nw_tbl) or is_narwhals_lazyframe(nw_tbl)
|
|
1128
1150
|
|
|
1129
1151
|
if isinstance(compare, Column):
|
|
1130
1152
|
compare_expr = _get_compare_expr_nw(compare=compare)
|
|
@@ -1170,10 +1192,10 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1170
1192
|
)
|
|
1171
1193
|
result_tbl = result_tbl.rename({"pb_is_good_4_tmp": "pb_is_good_4"})
|
|
1172
1194
|
elif "cannot compare" in str(e).lower():
|
|
1173
|
-
# Handle genuine type incompatibility
|
|
1195
|
+
# Handle genuine type incompatibility - native_df type varies by backend
|
|
1174
1196
|
native_df = result_tbl.to_native()
|
|
1175
|
-
col_dtype = str(native_df[column].dtype)
|
|
1176
|
-
compare_dtype = str(native_df[compare.name].dtype)
|
|
1197
|
+
col_dtype = str(native_df[column].dtype) # type: ignore[index]
|
|
1198
|
+
compare_dtype = str(native_df[compare.name].dtype) # type: ignore[index]
|
|
1177
1199
|
|
|
1178
1200
|
raise TypeError(
|
|
1179
1201
|
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
@@ -1208,21 +1230,19 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1208
1230
|
or "conversion" in error_msg
|
|
1209
1231
|
and "failed" in error_msg
|
|
1210
1232
|
):
|
|
1211
|
-
# Get column types for a descriptive error message
|
|
1233
|
+
# Get column types for a descriptive error message - native type varies by backend
|
|
1234
|
+
col_dtype = "unknown"
|
|
1235
|
+
compare_dtype = "unknown"
|
|
1212
1236
|
try:
|
|
1213
1237
|
native_df = result_tbl.to_native()
|
|
1214
1238
|
if hasattr(native_df, "dtypes"):
|
|
1215
|
-
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1216
|
-
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
|
|
1239
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown")) # type: ignore[union-attr]
|
|
1240
|
+
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown")) # type: ignore[union-attr]
|
|
1217
1241
|
elif hasattr(native_df, "schema"):
|
|
1218
|
-
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1219
|
-
compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
|
|
1220
|
-
else:
|
|
1221
|
-
col_dtype = "unknown"
|
|
1222
|
-
compare_dtype = "unknown"
|
|
1242
|
+
col_dtype = str(native_df.schema.get(column, "unknown")) # type: ignore[union-attr]
|
|
1243
|
+
compare_dtype = str(native_df.schema.get(compare.name, "unknown")) # type: ignore[union-attr]
|
|
1223
1244
|
except Exception:
|
|
1224
|
-
|
|
1225
|
-
compare_dtype = "unknown"
|
|
1245
|
+
pass
|
|
1226
1246
|
|
|
1227
1247
|
raise TypeError(
|
|
1228
1248
|
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
@@ -1271,17 +1291,16 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1271
1291
|
or "conversion" in error_msg
|
|
1272
1292
|
and "failed" in error_msg
|
|
1273
1293
|
):
|
|
1274
|
-
# Get column type for a descriptive error message
|
|
1294
|
+
# Get column type for a descriptive error message - native type varies by backend
|
|
1295
|
+
col_dtype = "unknown"
|
|
1275
1296
|
try:
|
|
1276
1297
|
native_df = result_tbl.to_native()
|
|
1277
1298
|
if hasattr(native_df, "dtypes"):
|
|
1278
|
-
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1299
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown")) # type: ignore[union-attr]
|
|
1279
1300
|
elif hasattr(native_df, "schema"):
|
|
1280
|
-
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1281
|
-
else:
|
|
1282
|
-
col_dtype = "unknown"
|
|
1301
|
+
col_dtype = str(native_df.schema.get(column, "unknown")) # type: ignore[union-attr]
|
|
1283
1302
|
except Exception:
|
|
1284
|
-
|
|
1303
|
+
pass
|
|
1285
1304
|
|
|
1286
1305
|
compare_type = type(compare).__name__
|
|
1287
1306
|
compare_value = str(compare)
|
|
@@ -1311,10 +1330,11 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1311
1330
|
return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
|
|
1312
1331
|
|
|
1313
1332
|
|
|
1314
|
-
def interrogate_ne(tbl:
|
|
1333
|
+
def interrogate_ne(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1315
1334
|
"""Not equal interrogation."""
|
|
1316
1335
|
|
|
1317
1336
|
nw_tbl = nw.from_native(tbl)
|
|
1337
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1318
1338
|
|
|
1319
1339
|
# Determine if the reference and comparison columns have any null values
|
|
1320
1340
|
ref_col_has_null_vals = _column_has_null_values(table=nw_tbl, column=column)
|
|
@@ -1867,14 +1887,15 @@ def interrogate_ne(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1867
1887
|
|
|
1868
1888
|
|
|
1869
1889
|
def interrogate_between(
|
|
1870
|
-
tbl:
|
|
1871
|
-
) ->
|
|
1890
|
+
tbl: IntoFrame, column: str, low: Any, high: Any, inclusive: tuple[bool, bool], na_pass: bool
|
|
1891
|
+
) -> Any:
|
|
1872
1892
|
"""Between interrogation."""
|
|
1873
1893
|
|
|
1874
1894
|
low_val = _get_compare_expr_nw(compare=low)
|
|
1875
1895
|
high_val = _get_compare_expr_nw(compare=high)
|
|
1876
1896
|
|
|
1877
1897
|
nw_tbl = nw.from_native(tbl)
|
|
1898
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1878
1899
|
low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
|
|
1879
1900
|
high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)
|
|
1880
1901
|
|
|
@@ -1936,14 +1957,15 @@ def interrogate_between(
|
|
|
1936
1957
|
|
|
1937
1958
|
|
|
1938
1959
|
def interrogate_outside(
|
|
1939
|
-
tbl:
|
|
1940
|
-
) ->
|
|
1960
|
+
tbl: IntoFrame, column: str, low: Any, high: Any, inclusive: tuple[bool, bool], na_pass: bool
|
|
1961
|
+
) -> Any:
|
|
1941
1962
|
"""Outside range interrogation."""
|
|
1942
1963
|
|
|
1943
1964
|
low_val = _get_compare_expr_nw(compare=low)
|
|
1944
1965
|
high_val = _get_compare_expr_nw(compare=high)
|
|
1945
1966
|
|
|
1946
1967
|
nw_tbl = nw.from_native(tbl)
|
|
1968
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1947
1969
|
low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
|
|
1948
1970
|
high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)
|
|
1949
1971
|
|
|
@@ -2002,10 +2024,11 @@ def interrogate_outside(
|
|
|
2002
2024
|
return result_tbl.to_native()
|
|
2003
2025
|
|
|
2004
2026
|
|
|
2005
|
-
def interrogate_isin(tbl:
|
|
2027
|
+
def interrogate_isin(tbl: IntoFrame, column: str, set_values: Any) -> Any:
|
|
2006
2028
|
"""In set interrogation."""
|
|
2007
2029
|
|
|
2008
2030
|
nw_tbl = nw.from_native(tbl)
|
|
2031
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2009
2032
|
|
|
2010
2033
|
can_be_null: bool = None in set_values
|
|
2011
2034
|
base_expr: nw.Expr = nw.col(column).is_in(set_values)
|
|
@@ -2016,17 +2039,20 @@ def interrogate_isin(tbl: FrameT, column: str, set_values: any) -> FrameT:
|
|
|
2016
2039
|
return result_tbl.to_native()
|
|
2017
2040
|
|
|
2018
2041
|
|
|
2019
|
-
def interrogate_notin(tbl:
|
|
2042
|
+
def interrogate_notin(tbl: IntoFrame, column: str, set_values: Any) -> Any:
|
|
2020
2043
|
"""Not in set interrogation."""
|
|
2021
2044
|
|
|
2022
2045
|
nw_tbl = nw.from_native(tbl)
|
|
2046
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2023
2047
|
result_tbl = nw_tbl.with_columns(
|
|
2024
2048
|
pb_is_good_=nw.col(column).is_in(set_values),
|
|
2025
2049
|
).with_columns(pb_is_good_=~nw.col("pb_is_good_"))
|
|
2026
2050
|
return result_tbl.to_native()
|
|
2027
2051
|
|
|
2028
2052
|
|
|
2029
|
-
def interrogate_regex(
|
|
2053
|
+
def interrogate_regex(
|
|
2054
|
+
tbl: IntoFrame, column: str, values: dict[str, Any] | str, na_pass: bool
|
|
2055
|
+
) -> Any:
|
|
2030
2056
|
"""Regex interrogation."""
|
|
2031
2057
|
|
|
2032
2058
|
# Handle both old and new formats for backward compatibility
|
|
@@ -2038,6 +2064,7 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
|
|
|
2038
2064
|
inverse = values["inverse"]
|
|
2039
2065
|
|
|
2040
2066
|
nw_tbl = nw.from_native(tbl)
|
|
2067
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2041
2068
|
result_tbl = nw_tbl.with_columns(
|
|
2042
2069
|
pb_is_good_1=nw.col(column).is_null() & na_pass,
|
|
2043
2070
|
pb_is_good_2=nw.col(column).str.contains(pattern, literal=False).fill_null(False),
|
|
@@ -2057,7 +2084,9 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
|
|
|
2057
2084
|
return result_tbl.to_native()
|
|
2058
2085
|
|
|
2059
2086
|
|
|
2060
|
-
def interrogate_within_spec(
|
|
2087
|
+
def interrogate_within_spec(
|
|
2088
|
+
tbl: IntoFrame, column: str, values: dict[str, Any], na_pass: bool
|
|
2089
|
+
) -> Any:
|
|
2061
2090
|
"""Within specification interrogation."""
|
|
2062
2091
|
from pointblank._spec_utils import (
|
|
2063
2092
|
regex_email,
|
|
@@ -2082,6 +2111,7 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2082
2111
|
|
|
2083
2112
|
# Convert to Narwhals for cross-backend compatibility
|
|
2084
2113
|
nw_tbl = nw.from_native(tbl)
|
|
2114
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2085
2115
|
|
|
2086
2116
|
# Regex-based specifications can use Narwhals directly (no materialization needed)
|
|
2087
2117
|
regex_specs = {
|
|
@@ -2135,18 +2165,18 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2135
2165
|
|
|
2136
2166
|
# For non-Ibis tables or other specs, materialize data and use Python validation
|
|
2137
2167
|
# Get the column data as a list
|
|
2138
|
-
col_data = nw_tbl.select(column).to_native()
|
|
2168
|
+
col_data: Any = nw_tbl.select(column).to_native()
|
|
2139
2169
|
|
|
2140
|
-
# Convert to list based on backend
|
|
2170
|
+
# Convert to list based on backend - type varies so use duck typing
|
|
2141
2171
|
if hasattr(col_data, "to_list"): # Polars
|
|
2142
|
-
col_list = col_data[column].to_list()
|
|
2172
|
+
col_list = col_data[column].to_list() # type: ignore[index]
|
|
2143
2173
|
elif hasattr(col_data, "tolist"): # Pandas
|
|
2144
|
-
col_list = col_data[column].tolist()
|
|
2174
|
+
col_list = col_data[column].tolist() # type: ignore[index]
|
|
2145
2175
|
else: # For Ibis tables, we need to execute the query first
|
|
2146
2176
|
try:
|
|
2147
2177
|
# Try to execute if it's an Ibis table
|
|
2148
2178
|
if hasattr(col_data, "execute"):
|
|
2149
|
-
col_data_exec = col_data.execute()
|
|
2179
|
+
col_data_exec = col_data.execute() # type: ignore[operator]
|
|
2150
2180
|
if hasattr(col_data_exec, "to_list"): # Polars result
|
|
2151
2181
|
col_list = col_data_exec[column].to_list()
|
|
2152
2182
|
elif hasattr(col_data_exec, "tolist"): # Pandas result
|
|
@@ -2159,6 +2189,8 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2159
2189
|
# Fallback to direct list conversion
|
|
2160
2190
|
col_list = list(col_data[column])
|
|
2161
2191
|
|
|
2192
|
+
assert isinstance(col_list, list)
|
|
2193
|
+
|
|
2162
2194
|
# Validate based on spec type (checksum-based validations)
|
|
2163
2195
|
if spec_lower in ("isbn", "isbn-10", "isbn-13"):
|
|
2164
2196
|
is_valid_list = check_isbn(col_list)
|
|
@@ -2205,7 +2237,9 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2205
2237
|
return result_tbl.to_native()
|
|
2206
2238
|
|
|
2207
2239
|
|
|
2208
|
-
def interrogate_within_spec_db(
|
|
2240
|
+
def interrogate_within_spec_db(
|
|
2241
|
+
tbl: IntoFrame, column: str, values: dict[str, Any], na_pass: bool
|
|
2242
|
+
) -> Any:
|
|
2209
2243
|
"""
|
|
2210
2244
|
Database-native specification validation (proof of concept).
|
|
2211
2245
|
|
|
@@ -2226,7 +2260,7 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2226
2260
|
|
|
2227
2261
|
Returns
|
|
2228
2262
|
-------
|
|
2229
|
-
|
|
2263
|
+
Any
|
|
2230
2264
|
Result table with pb_is_good_ column indicating validation results.
|
|
2231
2265
|
|
|
2232
2266
|
Notes
|
|
@@ -2239,9 +2273,9 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2239
2273
|
spec_lower = spec.lower()
|
|
2240
2274
|
|
|
2241
2275
|
# Check if this is an Ibis table
|
|
2242
|
-
native_tbl = tbl
|
|
2243
|
-
if
|
|
2244
|
-
native_tbl = tbl.to_native()
|
|
2276
|
+
native_tbl: Any = tbl
|
|
2277
|
+
if is_narwhals_dataframe(tbl) or is_narwhals_lazyframe(tbl):
|
|
2278
|
+
native_tbl = tbl.to_native()
|
|
2245
2279
|
|
|
2246
2280
|
is_ibis = hasattr(native_tbl, "execute")
|
|
2247
2281
|
|
|
@@ -2308,7 +2342,7 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2308
2342
|
weights = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]
|
|
2309
2343
|
|
|
2310
2344
|
# Get the column as an Ibis expression
|
|
2311
|
-
col_expr = native_tbl[column]
|
|
2345
|
+
col_expr = native_tbl[column] # type: ignore[index]
|
|
2312
2346
|
|
|
2313
2347
|
# Basic checks: length must be 17, no invalid characters (I, O, Q)
|
|
2314
2348
|
valid_length = col_expr.length() == 17
|
|
@@ -2335,11 +2369,11 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2335
2369
|
value = ibis.cases(*conditions, else_=0) # Default: invalid char = 0 (will fail validation)
|
|
2336
2370
|
|
|
2337
2371
|
# Multiply by weight and add to checksum
|
|
2338
|
-
checksum = checksum + (value * weights[pos])
|
|
2372
|
+
checksum = checksum + (value * weights[pos]) # type: ignore[operator]
|
|
2339
2373
|
|
|
2340
2374
|
# Check digit calculation: checksum % 11
|
|
2341
2375
|
# If result is 10, check digit should be 'X', otherwise it's the digit itself
|
|
2342
|
-
expected_check = checksum % 11
|
|
2376
|
+
expected_check = checksum % 11 # type: ignore[operator]
|
|
2343
2377
|
actual_check_char = col_expr.upper().substr(8, 1) # Position 9 (0-indexed 8)
|
|
2344
2378
|
|
|
2345
2379
|
# Validate check digit using ibis.cases()
|
|
@@ -2362,14 +2396,14 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2362
2396
|
is_valid = is_valid.fill_null(False)
|
|
2363
2397
|
|
|
2364
2398
|
# Add validation column to table
|
|
2365
|
-
result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
|
|
2399
|
+
result_tbl = native_tbl.mutate(pb_is_good_=is_valid) # type: ignore[union-attr]
|
|
2366
2400
|
|
|
2367
2401
|
return result_tbl
|
|
2368
2402
|
|
|
2369
2403
|
|
|
2370
2404
|
def interrogate_credit_card_db(
|
|
2371
|
-
tbl:
|
|
2372
|
-
) ->
|
|
2405
|
+
tbl: IntoFrame, column: str, values: dict[str, str], na_pass: bool
|
|
2406
|
+
) -> Any:
|
|
2373
2407
|
"""
|
|
2374
2408
|
Database-native credit card validation using Luhn algorithm in SQL.
|
|
2375
2409
|
|
|
@@ -2391,7 +2425,7 @@ def interrogate_credit_card_db(
|
|
|
2391
2425
|
|
|
2392
2426
|
Returns
|
|
2393
2427
|
-------
|
|
2394
|
-
|
|
2428
|
+
Any
|
|
2395
2429
|
Result table with pb_is_good_ column indicating validation results.
|
|
2396
2430
|
|
|
2397
2431
|
Notes
|
|
@@ -2408,7 +2442,7 @@ def interrogate_credit_card_db(
|
|
|
2408
2442
|
# Check if this is an Ibis table
|
|
2409
2443
|
native_tbl = tbl
|
|
2410
2444
|
if hasattr(tbl, "to_native"):
|
|
2411
|
-
native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl
|
|
2445
|
+
native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl # type: ignore[operator]
|
|
2412
2446
|
|
|
2413
2447
|
is_ibis = hasattr(native_tbl, "execute")
|
|
2414
2448
|
|
|
@@ -2422,7 +2456,7 @@ def interrogate_credit_card_db(
|
|
|
2422
2456
|
raise ImportError("Ibis is required for database-native validation")
|
|
2423
2457
|
|
|
2424
2458
|
# Get the column as an Ibis expression
|
|
2425
|
-
col_expr = native_tbl[column]
|
|
2459
|
+
col_expr = native_tbl[column] # type: ignore[index]
|
|
2426
2460
|
|
|
2427
2461
|
# Step 1: Clean the input and remove spaces and hyphens
|
|
2428
2462
|
# First check format: only digits, spaces, and hyphens allowed
|
|
@@ -2475,7 +2509,7 @@ def interrogate_credit_card_db(
|
|
|
2475
2509
|
|
|
2476
2510
|
# Calculate contribution to checksum
|
|
2477
2511
|
# If should_double: double the digit, then if > 9 subtract 9
|
|
2478
|
-
doubled = digit_val * 2
|
|
2512
|
+
doubled = digit_val * 2 # type: ignore[operator]
|
|
2479
2513
|
adjusted = ibis.cases(
|
|
2480
2514
|
(should_double & (doubled > 9), doubled - 9),
|
|
2481
2515
|
(should_double, doubled),
|
|
@@ -2488,10 +2522,10 @@ def interrogate_credit_card_db(
|
|
|
2488
2522
|
else_=0,
|
|
2489
2523
|
)
|
|
2490
2524
|
|
|
2491
|
-
checksum = checksum + contribution
|
|
2525
|
+
checksum = checksum + contribution # type: ignore[operator]
|
|
2492
2526
|
|
|
2493
2527
|
# Step 4: Valid if checksum % 10 == 0
|
|
2494
|
-
luhn_valid = (checksum % 10) == 0
|
|
2528
|
+
luhn_valid = (checksum % 10) == 0 # type: ignore[operator]
|
|
2495
2529
|
|
|
2496
2530
|
# Combine all validation checks
|
|
2497
2531
|
is_valid = valid_chars & valid_length & luhn_valid
|
|
@@ -2505,30 +2539,32 @@ def interrogate_credit_card_db(
|
|
|
2505
2539
|
is_valid = is_valid.fill_null(False)
|
|
2506
2540
|
|
|
2507
2541
|
# Add validation column to table
|
|
2508
|
-
result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
|
|
2542
|
+
result_tbl = native_tbl.mutate(pb_is_good_=is_valid) # type: ignore[union-attr]
|
|
2509
2543
|
|
|
2510
2544
|
return result_tbl
|
|
2511
2545
|
|
|
2512
2546
|
|
|
2513
|
-
def interrogate_null(tbl:
|
|
2547
|
+
def interrogate_null(tbl: IntoFrame, column: str) -> Any:
|
|
2514
2548
|
"""Null interrogation."""
|
|
2515
2549
|
|
|
2516
2550
|
nw_tbl = nw.from_native(tbl)
|
|
2551
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2517
2552
|
result_tbl = nw_tbl.with_columns(pb_is_good_=nw.col(column).is_null())
|
|
2518
2553
|
return result_tbl.to_native()
|
|
2519
2554
|
|
|
2520
2555
|
|
|
2521
|
-
def interrogate_not_null(tbl:
|
|
2556
|
+
def interrogate_not_null(tbl: IntoFrame, column: str) -> Any:
|
|
2522
2557
|
"""Not null interrogation."""
|
|
2523
2558
|
|
|
2524
2559
|
nw_tbl = nw.from_native(tbl)
|
|
2560
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2525
2561
|
result_tbl = nw_tbl.with_columns(pb_is_good_=~nw.col(column).is_null())
|
|
2526
2562
|
return result_tbl.to_native()
|
|
2527
2563
|
|
|
2528
2564
|
|
|
2529
2565
|
def interrogate_increasing(
|
|
2530
|
-
tbl:
|
|
2531
|
-
) ->
|
|
2566
|
+
tbl: IntoFrame, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool
|
|
2567
|
+
) -> Any:
|
|
2532
2568
|
"""
|
|
2533
2569
|
Increasing interrogation.
|
|
2534
2570
|
|
|
@@ -2549,10 +2585,11 @@ def interrogate_increasing(
|
|
|
2549
2585
|
|
|
2550
2586
|
Returns
|
|
2551
2587
|
-------
|
|
2552
|
-
|
|
2588
|
+
Any
|
|
2553
2589
|
The table with a `pb_is_good_` column indicating pass/fail for each row.
|
|
2554
2590
|
"""
|
|
2555
2591
|
nw_tbl = nw.from_native(tbl)
|
|
2592
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2556
2593
|
|
|
2557
2594
|
# Create a lagged difference column
|
|
2558
2595
|
result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
|
|
@@ -2585,8 +2622,8 @@ def interrogate_increasing(
|
|
|
2585
2622
|
|
|
2586
2623
|
|
|
2587
2624
|
def interrogate_decreasing(
|
|
2588
|
-
tbl:
|
|
2589
|
-
) ->
|
|
2625
|
+
tbl: IntoFrame, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool
|
|
2626
|
+
) -> Any:
|
|
2590
2627
|
"""
|
|
2591
2628
|
Decreasing interrogation.
|
|
2592
2629
|
|
|
@@ -2607,10 +2644,11 @@ def interrogate_decreasing(
|
|
|
2607
2644
|
|
|
2608
2645
|
Returns
|
|
2609
2646
|
-------
|
|
2610
|
-
|
|
2647
|
+
Any
|
|
2611
2648
|
The table with a `pb_is_good_` column indicating pass/fail for each row.
|
|
2612
2649
|
"""
|
|
2613
2650
|
nw_tbl = nw.from_native(tbl)
|
|
2651
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2614
2652
|
|
|
2615
2653
|
# Create a lagged difference column
|
|
2616
2654
|
result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
|
|
@@ -2643,8 +2681,8 @@ def interrogate_decreasing(
|
|
|
2643
2681
|
|
|
2644
2682
|
|
|
2645
2683
|
def _interrogate_comparison_base(
|
|
2646
|
-
tbl:
|
|
2647
|
-
) ->
|
|
2684
|
+
tbl: IntoFrame, column: str, compare: Any, na_pass: bool, operator: str
|
|
2685
|
+
) -> Any:
|
|
2648
2686
|
"""
|
|
2649
2687
|
Unified base function for comparison operations (gt, ge, lt, le, eq, ne).
|
|
2650
2688
|
|
|
@@ -2663,13 +2701,14 @@ def _interrogate_comparison_base(
|
|
|
2663
2701
|
|
|
2664
2702
|
Returns
|
|
2665
2703
|
-------
|
|
2666
|
-
|
|
2704
|
+
Any
|
|
2667
2705
|
The result table with `pb_is_good_` column indicating the passing test units.
|
|
2668
2706
|
"""
|
|
2669
2707
|
|
|
2670
2708
|
compare_expr = _get_compare_expr_nw(compare=compare)
|
|
2671
2709
|
|
|
2672
2710
|
nw_tbl = nw.from_native(tbl)
|
|
2711
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2673
2712
|
compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare_expr)
|
|
2674
2713
|
|
|
2675
2714
|
# Create the comparison expression based on the operator
|
|
@@ -2716,7 +2755,7 @@ def _interrogate_comparison_base(
|
|
|
2716
2755
|
return result_tbl.to_native()
|
|
2717
2756
|
|
|
2718
2757
|
|
|
2719
|
-
def interrogate_rows_distinct(data_tbl:
|
|
2758
|
+
def interrogate_rows_distinct(data_tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
2720
2759
|
"""
|
|
2721
2760
|
Check if rows in a DataFrame are distinct.
|
|
2722
2761
|
|
|
@@ -2733,10 +2772,11 @@ def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None
|
|
|
2733
2772
|
|
|
2734
2773
|
Returns
|
|
2735
2774
|
-------
|
|
2736
|
-
|
|
2775
|
+
Any
|
|
2737
2776
|
A DataFrame with a `pb_is_good_` column indicating which rows pass the test.
|
|
2738
2777
|
"""
|
|
2739
2778
|
tbl = nw.from_native(data_tbl)
|
|
2779
|
+
assert is_narwhals_dataframe(tbl) or is_narwhals_lazyframe(tbl)
|
|
2740
2780
|
|
|
2741
2781
|
# Get the column subset to use for the test
|
|
2742
2782
|
if columns_subset is None:
|
|
@@ -2744,18 +2784,23 @@ def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None
|
|
|
2744
2784
|
|
|
2745
2785
|
# Create a count of duplicates using group_by approach
|
|
2746
2786
|
# Group by the columns of interest and count occurrences
|
|
2747
|
-
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
|
|
2752
|
-
|
|
2753
|
-
|
|
2754
|
-
|
|
2755
|
-
|
|
2787
|
+
# Handle DataFrame and LazyFrame separately for proper type narrowing
|
|
2788
|
+
if is_narwhals_dataframe(tbl):
|
|
2789
|
+
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
2790
|
+
result = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
2791
|
+
result = result.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
2792
|
+
return result.to_native()
|
|
2793
|
+
elif is_narwhals_lazyframe(tbl):
|
|
2794
|
+
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
2795
|
+
result = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
2796
|
+
result = result.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
2797
|
+
return result.to_native()
|
|
2798
|
+
else:
|
|
2799
|
+
msg = f"Expected DataFrame or LazyFrame, got {type(tbl)}"
|
|
2800
|
+
raise TypeError(msg)
|
|
2756
2801
|
|
|
2757
2802
|
|
|
2758
|
-
def interrogate_rows_complete(tbl:
|
|
2803
|
+
def interrogate_rows_complete(tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
2759
2804
|
"""Rows complete interrogation."""
|
|
2760
2805
|
nw_tbl = nw.from_native(tbl)
|
|
2761
2806
|
|
|
@@ -2771,12 +2816,25 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
|
|
|
2771
2816
|
return result_tbl.to_native()
|
|
2772
2817
|
|
|
2773
2818
|
|
|
2774
|
-
def interrogate_prompt(
|
|
2819
|
+
def interrogate_prompt(
|
|
2820
|
+
tbl: IntoFrame, columns_subset: list[str] | None, ai_config: dict[str, Any]
|
|
2821
|
+
) -> Any:
|
|
2775
2822
|
"""AI-powered interrogation of rows."""
|
|
2776
2823
|
import logging
|
|
2777
2824
|
|
|
2778
2825
|
logger = logging.getLogger(__name__)
|
|
2779
2826
|
|
|
2827
|
+
# Convert to narwhals early for consistent row counting
|
|
2828
|
+
nw_tbl = nw.from_native(tbl)
|
|
2829
|
+
# Get row count - for LazyFrame we need to use select/collect
|
|
2830
|
+
if is_narwhals_lazyframe(nw_tbl):
|
|
2831
|
+
row_count = nw_tbl.select(nw.len()).collect().item()
|
|
2832
|
+
assert isinstance(row_count, int)
|
|
2833
|
+
total_rows = row_count
|
|
2834
|
+
else:
|
|
2835
|
+
assert is_narwhals_dataframe(nw_tbl)
|
|
2836
|
+
total_rows = len(nw_tbl)
|
|
2837
|
+
|
|
2780
2838
|
try:
|
|
2781
2839
|
# Import AI validation modules
|
|
2782
2840
|
from pointblank._utils_ai import (
|
|
@@ -2833,28 +2891,25 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2833
2891
|
)
|
|
2834
2892
|
|
|
2835
2893
|
# Parse and combine results with signature mapping optimization
|
|
2836
|
-
parser = _ValidationResponseParser(total_rows=
|
|
2894
|
+
parser = _ValidationResponseParser(total_rows=total_rows)
|
|
2837
2895
|
combined_results = parser.combine_batch_results(batch_results, signature_mapping)
|
|
2838
2896
|
|
|
2839
2897
|
# Debug: Log table info and combined results
|
|
2840
2898
|
logger.debug("🏁 Final result conversion:")
|
|
2841
|
-
logger.debug(f" - Table length: {
|
|
2899
|
+
logger.debug(f" - Table length: {total_rows}")
|
|
2842
2900
|
logger.debug(
|
|
2843
2901
|
f" - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
|
|
2844
2902
|
)
|
|
2845
2903
|
|
|
2846
|
-
# Convert results to narwhals format
|
|
2847
|
-
nw_tbl = nw.from_native(tbl)
|
|
2848
|
-
|
|
2849
2904
|
# Create a boolean column for validation results
|
|
2850
2905
|
validation_results = []
|
|
2851
|
-
for i in range(
|
|
2906
|
+
for i in range(total_rows):
|
|
2852
2907
|
# Default to False if row wasn't processed
|
|
2853
2908
|
result = combined_results.get(i, False)
|
|
2854
2909
|
validation_results.append(result)
|
|
2855
2910
|
|
|
2856
2911
|
# Debug: Log first few conversions
|
|
2857
|
-
if i < 5 or
|
|
2912
|
+
if i < 5 or total_rows - i <= 2:
|
|
2858
2913
|
logger.debug(f" Row {i}: {result} (from combined_results.get({i}, False))")
|
|
2859
2914
|
|
|
2860
2915
|
logger.debug(f" - Final validation_results length: {len(validation_results)}")
|
|
@@ -2893,10 +2948,9 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2893
2948
|
logger.error(f"Missing dependencies for AI validation: {e}")
|
|
2894
2949
|
logger.error("Install required packages: pip install openai anthropic aiohttp")
|
|
2895
2950
|
|
|
2896
|
-
# Return all False results as fallback
|
|
2897
|
-
nw_tbl = nw.from_native(tbl)
|
|
2951
|
+
# Return all False results as fallback (nw_tbl and total_rows defined at function start)
|
|
2898
2952
|
native_tbl = nw_tbl.to_native()
|
|
2899
|
-
validation_results = [False] *
|
|
2953
|
+
validation_results = [False] * total_rows
|
|
2900
2954
|
|
|
2901
2955
|
if hasattr(native_tbl, "with_columns"): # Polars
|
|
2902
2956
|
import polars as pl
|
|
@@ -2918,10 +2972,9 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2918
2972
|
except Exception as e:
|
|
2919
2973
|
logger.error(f"AI validation failed: {e}")
|
|
2920
2974
|
|
|
2921
|
-
# Return all False results as fallback
|
|
2922
|
-
nw_tbl = nw.from_native(tbl)
|
|
2975
|
+
# Return all False results as fallback (nw_tbl and total_rows defined at function start)
|
|
2923
2976
|
native_tbl = nw_tbl.to_native()
|
|
2924
|
-
validation_results = [False] *
|
|
2977
|
+
validation_results = [False] * total_rows
|
|
2925
2978
|
|
|
2926
2979
|
if hasattr(native_tbl, "with_columns"): # Polars
|
|
2927
2980
|
import polars as pl
|