pointblank 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +207 -6
- pointblank/_constants_translations.py +1302 -0
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +216 -139
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +41 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2957 -50
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +2280 -410
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +15 -8
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/METADATA +7 -2
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/RECORD +30 -28
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/WHEEL +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
|
@@ -1,12 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import functools
|
|
4
|
+
from collections.abc import Callable
|
|
4
5
|
from dataclasses import dataclass
|
|
5
|
-
from typing import Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
6
7
|
|
|
7
8
|
import narwhals as nw
|
|
8
|
-
from narwhals.dependencies import
|
|
9
|
-
|
|
9
|
+
from narwhals.dependencies import (
|
|
10
|
+
is_narwhals_dataframe,
|
|
11
|
+
is_narwhals_lazyframe,
|
|
12
|
+
is_pandas_dataframe,
|
|
13
|
+
is_polars_dataframe,
|
|
14
|
+
)
|
|
10
15
|
|
|
11
16
|
from pointblank._constants import IBIS_BACKENDS
|
|
12
17
|
from pointblank._spec_utils import (
|
|
@@ -16,6 +21,7 @@ from pointblank._spec_utils import (
|
|
|
16
21
|
check_postal_code,
|
|
17
22
|
check_vin,
|
|
18
23
|
)
|
|
24
|
+
from pointblank._typing import AbsoluteBounds
|
|
19
25
|
from pointblank._utils import (
|
|
20
26
|
_column_test_prep,
|
|
21
27
|
_convert_to_narwhals,
|
|
@@ -23,6 +29,9 @@ from pointblank._utils import (
|
|
|
23
29
|
)
|
|
24
30
|
from pointblank.column import Column
|
|
25
31
|
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from narwhals.typing import IntoFrame
|
|
34
|
+
|
|
26
35
|
|
|
27
36
|
def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
|
|
28
37
|
"""
|
|
@@ -92,7 +101,9 @@ def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val:
|
|
|
92
101
|
return compare_val
|
|
93
102
|
|
|
94
103
|
|
|
95
|
-
def _safe_is_nan_or_null_expr(
|
|
104
|
+
def _safe_is_nan_or_null_expr(
|
|
105
|
+
data_frame: Any, column_expr: Any, column_name: str | None = None
|
|
106
|
+
) -> Any:
|
|
96
107
|
"""
|
|
97
108
|
Create an expression that safely checks for both Null and NaN values.
|
|
98
109
|
|
|
@@ -423,7 +434,7 @@ class SpeciallyValidation:
|
|
|
423
434
|
else:
|
|
424
435
|
self.tbl_type = tbl_type
|
|
425
436
|
|
|
426
|
-
def get_test_results(self) ->
|
|
437
|
+
def get_test_results(self) -> Any | list[bool]:
|
|
427
438
|
"""Evaluate the expression to get either a list of booleans or a results table."""
|
|
428
439
|
|
|
429
440
|
# Get the expression and inspect whether there is a `data` argument
|
|
@@ -517,7 +528,7 @@ class NumberOfTestUnits:
|
|
|
517
528
|
Count the number of test units in a column.
|
|
518
529
|
"""
|
|
519
530
|
|
|
520
|
-
df:
|
|
531
|
+
df: Any # Can be IntoFrame or Ibis table
|
|
521
532
|
column: str
|
|
522
533
|
|
|
523
534
|
def get_test_units(self, tbl_type: str) -> int:
|
|
@@ -534,15 +545,18 @@ class NumberOfTestUnits:
|
|
|
534
545
|
)
|
|
535
546
|
|
|
536
547
|
# Handle LazyFrames which don't have len()
|
|
537
|
-
if
|
|
548
|
+
if is_narwhals_lazyframe(dfn):
|
|
538
549
|
dfn = dfn.collect()
|
|
539
550
|
|
|
551
|
+
assert is_narwhals_dataframe(dfn)
|
|
540
552
|
return len(dfn)
|
|
541
553
|
|
|
542
554
|
if tbl_type in IBIS_BACKENDS:
|
|
543
555
|
# Get the count of test units and convert to a native format
|
|
544
556
|
# TODO: check whether pandas or polars is available
|
|
545
|
-
return self.df.count().to_polars()
|
|
557
|
+
return self.df.count().to_polars() # type: ignore[union-attr]
|
|
558
|
+
|
|
559
|
+
raise ValueError(f"Unsupported table type: {tbl_type}")
|
|
546
560
|
|
|
547
561
|
|
|
548
562
|
def _get_compare_expr_nw(compare: Any) -> Any:
|
|
@@ -553,28 +567,25 @@ def _get_compare_expr_nw(compare: Any) -> Any:
|
|
|
553
567
|
return compare
|
|
554
568
|
|
|
555
569
|
|
|
556
|
-
def _column_has_null_values(table:
|
|
570
|
+
def _column_has_null_values(table: nw.DataFrame[Any] | nw.LazyFrame[Any], column: str) -> bool:
|
|
557
571
|
try:
|
|
558
|
-
# Try the standard null_count() method
|
|
559
|
-
null_count = (table.select(column).null_count())[column][0]
|
|
572
|
+
# Try the standard null_count() method (DataFrame)
|
|
573
|
+
null_count = (table.select(column).null_count())[column][0] # type: ignore[union-attr]
|
|
560
574
|
except AttributeError:
|
|
561
575
|
# For LazyFrames, collect first then get null count
|
|
562
576
|
try:
|
|
563
|
-
collected = table.select(column).collect()
|
|
577
|
+
collected = table.select(column).collect() # type: ignore[union-attr]
|
|
564
578
|
null_count = (collected.null_count())[column][0]
|
|
565
579
|
except Exception:
|
|
566
580
|
# Fallback: check if any values are null
|
|
567
581
|
try:
|
|
568
|
-
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
|
|
582
|
+
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect() # type: ignore[union-attr]
|
|
569
583
|
null_count = result["null_count"][0]
|
|
570
584
|
except Exception:
|
|
571
585
|
# Last resort: return False (assume no nulls)
|
|
572
586
|
return False
|
|
573
587
|
|
|
574
|
-
|
|
575
|
-
return False
|
|
576
|
-
|
|
577
|
-
return True
|
|
588
|
+
return null_count is not None and null_count > 0
|
|
578
589
|
|
|
579
590
|
|
|
580
591
|
def _check_nulls_across_columns_nw(table, columns_subset):
|
|
@@ -594,7 +605,7 @@ def _check_nulls_across_columns_nw(table, columns_subset):
|
|
|
594
605
|
return result
|
|
595
606
|
|
|
596
607
|
|
|
597
|
-
def _modify_datetime_compare_val(tgt_column:
|
|
608
|
+
def _modify_datetime_compare_val(tgt_column: Any, compare_val: Any) -> Any:
|
|
598
609
|
tgt_col_dtype_str = str(tgt_column.dtype).lower()
|
|
599
610
|
|
|
600
611
|
if compare_val is isinstance(compare_val, Column): # pragma: no cover
|
|
@@ -638,7 +649,7 @@ def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any:
|
|
|
638
649
|
return compare_expr
|
|
639
650
|
|
|
640
651
|
|
|
641
|
-
def col_vals_expr(data_tbl:
|
|
652
|
+
def col_vals_expr(data_tbl: Any, expr: Any, tbl_type: str = "local") -> Any:
|
|
642
653
|
"""Check if values in a column evaluate to True for a given predicate expression."""
|
|
643
654
|
if tbl_type == "local":
|
|
644
655
|
# Check the type of expression provided
|
|
@@ -668,21 +679,19 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
|
|
|
668
679
|
return data_tbl # pragma: no cover
|
|
669
680
|
|
|
670
681
|
|
|
671
|
-
def rows_complete(data_tbl:
|
|
682
|
+
def rows_complete(data_tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
672
683
|
"""
|
|
673
684
|
Check if rows in a DataFrame are complete (no null values).
|
|
674
685
|
|
|
675
686
|
This function replaces the RowsComplete dataclass for direct usage.
|
|
676
687
|
"""
|
|
677
|
-
tbl = _convert_to_narwhals(df=data_tbl)
|
|
678
|
-
|
|
679
688
|
return interrogate_rows_complete(
|
|
680
|
-
tbl=
|
|
689
|
+
tbl=data_tbl,
|
|
681
690
|
columns_subset=columns_subset,
|
|
682
691
|
)
|
|
683
692
|
|
|
684
693
|
|
|
685
|
-
def col_exists(data_tbl:
|
|
694
|
+
def col_exists(data_tbl: IntoFrame, column: str) -> bool:
|
|
686
695
|
"""
|
|
687
696
|
Check if a column exists in a DataFrame.
|
|
688
697
|
|
|
@@ -703,8 +712,8 @@ def col_exists(data_tbl: FrameT, column: str) -> bool:
|
|
|
703
712
|
|
|
704
713
|
|
|
705
714
|
def col_schema_match(
|
|
706
|
-
data_tbl:
|
|
707
|
-
schema,
|
|
715
|
+
data_tbl: IntoFrame,
|
|
716
|
+
schema: Any,
|
|
708
717
|
complete: bool,
|
|
709
718
|
in_order: bool,
|
|
710
719
|
case_sensitive_colnames: bool,
|
|
@@ -728,7 +737,9 @@ def col_schema_match(
|
|
|
728
737
|
)
|
|
729
738
|
|
|
730
739
|
|
|
731
|
-
def row_count_match(
|
|
740
|
+
def row_count_match(
|
|
741
|
+
data_tbl: IntoFrame, count: Any, inverse: bool, abs_tol_bounds: AbsoluteBounds
|
|
742
|
+
) -> bool:
|
|
732
743
|
"""
|
|
733
744
|
Check if DataFrame row count matches expected count.
|
|
734
745
|
"""
|
|
@@ -745,7 +756,34 @@ def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> b
|
|
|
745
756
|
return row_count >= min_val and row_count <= max_val
|
|
746
757
|
|
|
747
758
|
|
|
748
|
-
def
|
|
759
|
+
def col_pct_null(
|
|
760
|
+
data_tbl: IntoFrame, column: str, p: float, bound_finder: Callable[[int], AbsoluteBounds]
|
|
761
|
+
) -> bool:
|
|
762
|
+
"""Check if the percentage of null values is within p given the absolute bounds."""
|
|
763
|
+
nw_frame = nw.from_native(data_tbl)
|
|
764
|
+
# Handle LazyFrames by collecting them first
|
|
765
|
+
if is_narwhals_lazyframe(nw_frame):
|
|
766
|
+
nw_frame = nw_frame.collect()
|
|
767
|
+
|
|
768
|
+
assert is_narwhals_dataframe(nw_frame)
|
|
769
|
+
|
|
770
|
+
# We cast as int because it could come back as an arbitrary type. For example, if the backend
|
|
771
|
+
# is numpy-like, we might get a scalar from `item()`. `int()` expects a certain signature though
|
|
772
|
+
# and `object` does not satisfy so we have to go with the type ignore.
|
|
773
|
+
total_rows: object = nw_frame.select(nw.len()).item()
|
|
774
|
+
total_rows: int = int(total_rows) # type: ignore
|
|
775
|
+
|
|
776
|
+
abs_target: float = round(total_rows * p)
|
|
777
|
+
lower_bound, upper_bound = bound_finder(abs_target)
|
|
778
|
+
|
|
779
|
+
# Count null values (see above comment on typing shenanigans)
|
|
780
|
+
n_null: object = nw_frame.select(nw.col(column).is_null().sum()).item()
|
|
781
|
+
n_null: int = int(n_null) # type: ignore
|
|
782
|
+
|
|
783
|
+
return n_null >= (abs_target - lower_bound) and n_null <= (abs_target + upper_bound)
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def col_count_match(data_tbl: IntoFrame, count: Any, inverse: bool) -> bool:
|
|
749
787
|
"""
|
|
750
788
|
Check if DataFrame column count matches expected count.
|
|
751
789
|
"""
|
|
@@ -757,7 +795,7 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
|
|
|
757
795
|
return get_column_count(data=data_tbl) != count
|
|
758
796
|
|
|
759
797
|
|
|
760
|
-
def _coerce_to_common_backend(data_tbl:
|
|
798
|
+
def _coerce_to_common_backend(data_tbl: Any, tbl_compare: Any) -> tuple[Any, Any]:
|
|
761
799
|
"""
|
|
762
800
|
Coerce two tables to the same backend if they differ.
|
|
763
801
|
|
|
@@ -774,7 +812,7 @@ def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[Fr
|
|
|
774
812
|
|
|
775
813
|
Returns
|
|
776
814
|
-------
|
|
777
|
-
tuple[
|
|
815
|
+
tuple[Any, Any]
|
|
778
816
|
Both tables, with tbl_compare potentially converted to data_tbl's backend.
|
|
779
817
|
"""
|
|
780
818
|
# Get backend types for both tables
|
|
@@ -860,7 +898,7 @@ def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[Fr
|
|
|
860
898
|
return data_tbl, tbl_compare
|
|
861
899
|
|
|
862
900
|
|
|
863
|
-
def tbl_match(data_tbl:
|
|
901
|
+
def tbl_match(data_tbl: IntoFrame, tbl_compare: IntoFrame) -> bool:
|
|
864
902
|
"""
|
|
865
903
|
Check if two tables match exactly in schema, row count, and data.
|
|
866
904
|
|
|
@@ -974,33 +1012,37 @@ def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
|
|
|
974
1012
|
|
|
975
1013
|
# Convert to native format for comparison
|
|
976
1014
|
# We need to collect if lazy frames
|
|
977
|
-
if
|
|
1015
|
+
if is_narwhals_lazyframe(col_data_1):
|
|
978
1016
|
col_data_1 = col_data_1.collect()
|
|
979
1017
|
|
|
980
|
-
if
|
|
1018
|
+
if is_narwhals_lazyframe(col_data_2):
|
|
981
1019
|
col_data_2 = col_data_2.collect()
|
|
982
1020
|
|
|
983
1021
|
# Convert to native and then to lists for comparison
|
|
984
|
-
|
|
985
|
-
|
|
1022
|
+
# Native frames could be Polars, Pandas, or Ibis - use Any for dynamic access
|
|
1023
|
+
col_1_native: Any = col_data_1.to_native()
|
|
1024
|
+
col_2_native: Any = col_data_2.to_native()
|
|
986
1025
|
|
|
987
1026
|
# Extract values as lists for comparison
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
1027
|
+
# Note: We use hasattr for runtime detection but maintain Any typing
|
|
1028
|
+
values_1: list[Any]
|
|
1029
|
+
values_2: list[Any]
|
|
1030
|
+
if hasattr(col_1_native, "to_list"): # Polars DataFrame
|
|
1031
|
+
values_1 = col_1_native[col_name].to_list() # type: ignore[index]
|
|
1032
|
+
values_2 = col_2_native[col_name].to_list() # type: ignore[index]
|
|
991
1033
|
|
|
992
|
-
elif hasattr(col_1_native, "tolist"): # Pandas
|
|
993
|
-
values_1 = col_1_native[col_name].tolist()
|
|
994
|
-
values_2 = col_2_native[col_name].tolist()
|
|
1034
|
+
elif hasattr(col_1_native, "tolist"): # Pandas DataFrame
|
|
1035
|
+
values_1 = col_1_native[col_name].tolist() # type: ignore[index]
|
|
1036
|
+
values_2 = col_2_native[col_name].tolist() # type: ignore[index]
|
|
995
1037
|
|
|
996
1038
|
elif hasattr(col_1_native, "collect"): # Ibis
|
|
997
|
-
values_1 = col_1_native[col_name].to_pandas().tolist()
|
|
998
|
-
values_2 = col_2_native[col_name].to_pandas().tolist()
|
|
1039
|
+
values_1 = col_1_native[col_name].to_pandas().tolist() # type: ignore[index]
|
|
1040
|
+
values_2 = col_2_native[col_name].to_pandas().tolist() # type: ignore[index]
|
|
999
1041
|
|
|
1000
1042
|
else:
|
|
1001
1043
|
# Fallback: try direct comparison
|
|
1002
|
-
values_1 = list(col_1_native[col_name])
|
|
1003
|
-
values_2 = list(col_2_native[col_name])
|
|
1044
|
+
values_1 = list(col_1_native[col_name]) # type: ignore[index]
|
|
1045
|
+
values_2 = list(col_2_native[col_name]) # type: ignore[index]
|
|
1004
1046
|
|
|
1005
1047
|
# Compare the two lists element by element, handling NaN/None
|
|
1006
1048
|
if len(values_1) != len(values_2):
|
|
@@ -1062,7 +1104,9 @@ def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
|
|
|
1062
1104
|
return True
|
|
1063
1105
|
|
|
1064
1106
|
|
|
1065
|
-
def conjointly_validation(
|
|
1107
|
+
def conjointly_validation(
|
|
1108
|
+
data_tbl: IntoFrame, expressions: Any, threshold: int, tbl_type: str = "local"
|
|
1109
|
+
) -> Any:
|
|
1066
1110
|
"""
|
|
1067
1111
|
Perform conjoint validation using multiple expressions.
|
|
1068
1112
|
"""
|
|
@@ -1077,30 +1121,32 @@ def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_typ
|
|
|
1077
1121
|
return conjointly_instance.get_test_results()
|
|
1078
1122
|
|
|
1079
1123
|
|
|
1080
|
-
|
|
1124
|
+
# TODO: we can certainly simplify this
|
|
1125
|
+
def interrogate_gt(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1081
1126
|
"""Greater than interrogation."""
|
|
1082
1127
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "gt")
|
|
1083
1128
|
|
|
1084
1129
|
|
|
1085
|
-
def interrogate_lt(tbl:
|
|
1130
|
+
def interrogate_lt(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1086
1131
|
"""Less than interrogation."""
|
|
1087
1132
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "lt")
|
|
1088
1133
|
|
|
1089
1134
|
|
|
1090
|
-
def interrogate_ge(tbl:
|
|
1135
|
+
def interrogate_ge(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1091
1136
|
"""Greater than or equal interrogation."""
|
|
1092
1137
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "ge")
|
|
1093
1138
|
|
|
1094
1139
|
|
|
1095
|
-
def interrogate_le(tbl:
|
|
1140
|
+
def interrogate_le(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1096
1141
|
"""Less than or equal interrogation."""
|
|
1097
1142
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "le")
|
|
1098
1143
|
|
|
1099
1144
|
|
|
1100
|
-
def interrogate_eq(tbl:
|
|
1145
|
+
def interrogate_eq(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1101
1146
|
"""Equal interrogation."""
|
|
1102
1147
|
|
|
1103
1148
|
nw_tbl = nw.from_native(tbl)
|
|
1149
|
+
assert is_narwhals_dataframe(nw_tbl) or is_narwhals_lazyframe(nw_tbl)
|
|
1104
1150
|
|
|
1105
1151
|
if isinstance(compare, Column):
|
|
1106
1152
|
compare_expr = _get_compare_expr_nw(compare=compare)
|
|
@@ -1146,10 +1192,10 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1146
1192
|
)
|
|
1147
1193
|
result_tbl = result_tbl.rename({"pb_is_good_4_tmp": "pb_is_good_4"})
|
|
1148
1194
|
elif "cannot compare" in str(e).lower():
|
|
1149
|
-
# Handle genuine type incompatibility
|
|
1195
|
+
# Handle genuine type incompatibility - native_df type varies by backend
|
|
1150
1196
|
native_df = result_tbl.to_native()
|
|
1151
|
-
col_dtype = str(native_df[column].dtype)
|
|
1152
|
-
compare_dtype = str(native_df[compare.name].dtype)
|
|
1197
|
+
col_dtype = str(native_df[column].dtype) # type: ignore[index]
|
|
1198
|
+
compare_dtype = str(native_df[compare.name].dtype) # type: ignore[index]
|
|
1153
1199
|
|
|
1154
1200
|
raise TypeError(
|
|
1155
1201
|
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
@@ -1184,21 +1230,19 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1184
1230
|
or "conversion" in error_msg
|
|
1185
1231
|
and "failed" in error_msg
|
|
1186
1232
|
):
|
|
1187
|
-
# Get column types for a descriptive error message
|
|
1233
|
+
# Get column types for a descriptive error message - native type varies by backend
|
|
1234
|
+
col_dtype = "unknown"
|
|
1235
|
+
compare_dtype = "unknown"
|
|
1188
1236
|
try:
|
|
1189
1237
|
native_df = result_tbl.to_native()
|
|
1190
1238
|
if hasattr(native_df, "dtypes"):
|
|
1191
|
-
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1192
|
-
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
|
|
1239
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown")) # type: ignore[union-attr]
|
|
1240
|
+
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown")) # type: ignore[union-attr]
|
|
1193
1241
|
elif hasattr(native_df, "schema"):
|
|
1194
|
-
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1195
|
-
compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
|
|
1196
|
-
else:
|
|
1197
|
-
col_dtype = "unknown"
|
|
1198
|
-
compare_dtype = "unknown"
|
|
1242
|
+
col_dtype = str(native_df.schema.get(column, "unknown")) # type: ignore[union-attr]
|
|
1243
|
+
compare_dtype = str(native_df.schema.get(compare.name, "unknown")) # type: ignore[union-attr]
|
|
1199
1244
|
except Exception:
|
|
1200
|
-
|
|
1201
|
-
compare_dtype = "unknown"
|
|
1245
|
+
pass
|
|
1202
1246
|
|
|
1203
1247
|
raise TypeError(
|
|
1204
1248
|
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
@@ -1247,17 +1291,16 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1247
1291
|
or "conversion" in error_msg
|
|
1248
1292
|
and "failed" in error_msg
|
|
1249
1293
|
):
|
|
1250
|
-
# Get column type for a descriptive error message
|
|
1294
|
+
# Get column type for a descriptive error message - native type varies by backend
|
|
1295
|
+
col_dtype = "unknown"
|
|
1251
1296
|
try:
|
|
1252
1297
|
native_df = result_tbl.to_native()
|
|
1253
1298
|
if hasattr(native_df, "dtypes"):
|
|
1254
|
-
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1299
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown")) # type: ignore[union-attr]
|
|
1255
1300
|
elif hasattr(native_df, "schema"):
|
|
1256
|
-
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1257
|
-
else:
|
|
1258
|
-
col_dtype = "unknown"
|
|
1301
|
+
col_dtype = str(native_df.schema.get(column, "unknown")) # type: ignore[union-attr]
|
|
1259
1302
|
except Exception:
|
|
1260
|
-
|
|
1303
|
+
pass
|
|
1261
1304
|
|
|
1262
1305
|
compare_type = type(compare).__name__
|
|
1263
1306
|
compare_value = str(compare)
|
|
@@ -1287,10 +1330,11 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1287
1330
|
return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
|
|
1288
1331
|
|
|
1289
1332
|
|
|
1290
|
-
def interrogate_ne(tbl:
|
|
1333
|
+
def interrogate_ne(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1291
1334
|
"""Not equal interrogation."""
|
|
1292
1335
|
|
|
1293
1336
|
nw_tbl = nw.from_native(tbl)
|
|
1337
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1294
1338
|
|
|
1295
1339
|
# Determine if the reference and comparison columns have any null values
|
|
1296
1340
|
ref_col_has_null_vals = _column_has_null_values(table=nw_tbl, column=column)
|
|
@@ -1843,14 +1887,15 @@ def interrogate_ne(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1843
1887
|
|
|
1844
1888
|
|
|
1845
1889
|
def interrogate_between(
|
|
1846
|
-
tbl:
|
|
1847
|
-
) ->
|
|
1890
|
+
tbl: IntoFrame, column: str, low: Any, high: Any, inclusive: tuple[bool, bool], na_pass: bool
|
|
1891
|
+
) -> Any:
|
|
1848
1892
|
"""Between interrogation."""
|
|
1849
1893
|
|
|
1850
1894
|
low_val = _get_compare_expr_nw(compare=low)
|
|
1851
1895
|
high_val = _get_compare_expr_nw(compare=high)
|
|
1852
1896
|
|
|
1853
1897
|
nw_tbl = nw.from_native(tbl)
|
|
1898
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1854
1899
|
low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
|
|
1855
1900
|
high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)
|
|
1856
1901
|
|
|
@@ -1912,14 +1957,15 @@ def interrogate_between(
|
|
|
1912
1957
|
|
|
1913
1958
|
|
|
1914
1959
|
def interrogate_outside(
|
|
1915
|
-
tbl:
|
|
1916
|
-
) ->
|
|
1960
|
+
tbl: IntoFrame, column: str, low: Any, high: Any, inclusive: tuple[bool, bool], na_pass: bool
|
|
1961
|
+
) -> Any:
|
|
1917
1962
|
"""Outside range interrogation."""
|
|
1918
1963
|
|
|
1919
1964
|
low_val = _get_compare_expr_nw(compare=low)
|
|
1920
1965
|
high_val = _get_compare_expr_nw(compare=high)
|
|
1921
1966
|
|
|
1922
1967
|
nw_tbl = nw.from_native(tbl)
|
|
1968
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1923
1969
|
low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
|
|
1924
1970
|
high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)
|
|
1925
1971
|
|
|
@@ -1978,10 +2024,11 @@ def interrogate_outside(
|
|
|
1978
2024
|
return result_tbl.to_native()
|
|
1979
2025
|
|
|
1980
2026
|
|
|
1981
|
-
def interrogate_isin(tbl:
|
|
2027
|
+
def interrogate_isin(tbl: IntoFrame, column: str, set_values: Any) -> Any:
|
|
1982
2028
|
"""In set interrogation."""
|
|
1983
2029
|
|
|
1984
2030
|
nw_tbl = nw.from_native(tbl)
|
|
2031
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1985
2032
|
|
|
1986
2033
|
can_be_null: bool = None in set_values
|
|
1987
2034
|
base_expr: nw.Expr = nw.col(column).is_in(set_values)
|
|
@@ -1992,17 +2039,20 @@ def interrogate_isin(tbl: FrameT, column: str, set_values: any) -> FrameT:
|
|
|
1992
2039
|
return result_tbl.to_native()
|
|
1993
2040
|
|
|
1994
2041
|
|
|
1995
|
-
def interrogate_notin(tbl:
|
|
2042
|
+
def interrogate_notin(tbl: IntoFrame, column: str, set_values: Any) -> Any:
|
|
1996
2043
|
"""Not in set interrogation."""
|
|
1997
2044
|
|
|
1998
2045
|
nw_tbl = nw.from_native(tbl)
|
|
2046
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1999
2047
|
result_tbl = nw_tbl.with_columns(
|
|
2000
2048
|
pb_is_good_=nw.col(column).is_in(set_values),
|
|
2001
2049
|
).with_columns(pb_is_good_=~nw.col("pb_is_good_"))
|
|
2002
2050
|
return result_tbl.to_native()
|
|
2003
2051
|
|
|
2004
2052
|
|
|
2005
|
-
def interrogate_regex(
|
|
2053
|
+
def interrogate_regex(
|
|
2054
|
+
tbl: IntoFrame, column: str, values: dict[str, Any] | str, na_pass: bool
|
|
2055
|
+
) -> Any:
|
|
2006
2056
|
"""Regex interrogation."""
|
|
2007
2057
|
|
|
2008
2058
|
# Handle both old and new formats for backward compatibility
|
|
@@ -2014,6 +2064,7 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
|
|
|
2014
2064
|
inverse = values["inverse"]
|
|
2015
2065
|
|
|
2016
2066
|
nw_tbl = nw.from_native(tbl)
|
|
2067
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2017
2068
|
result_tbl = nw_tbl.with_columns(
|
|
2018
2069
|
pb_is_good_1=nw.col(column).is_null() & na_pass,
|
|
2019
2070
|
pb_is_good_2=nw.col(column).str.contains(pattern, literal=False).fill_null(False),
|
|
@@ -2033,7 +2084,9 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
|
|
|
2033
2084
|
return result_tbl.to_native()
|
|
2034
2085
|
|
|
2035
2086
|
|
|
2036
|
-
def interrogate_within_spec(
|
|
2087
|
+
def interrogate_within_spec(
|
|
2088
|
+
tbl: IntoFrame, column: str, values: dict[str, Any], na_pass: bool
|
|
2089
|
+
) -> Any:
|
|
2037
2090
|
"""Within specification interrogation."""
|
|
2038
2091
|
from pointblank._spec_utils import (
|
|
2039
2092
|
regex_email,
|
|
@@ -2058,6 +2111,7 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2058
2111
|
|
|
2059
2112
|
# Convert to Narwhals for cross-backend compatibility
|
|
2060
2113
|
nw_tbl = nw.from_native(tbl)
|
|
2114
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2061
2115
|
|
|
2062
2116
|
# Regex-based specifications can use Narwhals directly (no materialization needed)
|
|
2063
2117
|
regex_specs = {
|
|
@@ -2111,18 +2165,18 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2111
2165
|
|
|
2112
2166
|
# For non-Ibis tables or other specs, materialize data and use Python validation
|
|
2113
2167
|
# Get the column data as a list
|
|
2114
|
-
col_data = nw_tbl.select(column).to_native()
|
|
2168
|
+
col_data: Any = nw_tbl.select(column).to_native()
|
|
2115
2169
|
|
|
2116
|
-
# Convert to list based on backend
|
|
2170
|
+
# Convert to list based on backend - type varies so use duck typing
|
|
2117
2171
|
if hasattr(col_data, "to_list"): # Polars
|
|
2118
|
-
col_list = col_data[column].to_list()
|
|
2172
|
+
col_list = col_data[column].to_list() # type: ignore[index]
|
|
2119
2173
|
elif hasattr(col_data, "tolist"): # Pandas
|
|
2120
|
-
col_list = col_data[column].tolist()
|
|
2174
|
+
col_list = col_data[column].tolist() # type: ignore[index]
|
|
2121
2175
|
else: # For Ibis tables, we need to execute the query first
|
|
2122
2176
|
try:
|
|
2123
2177
|
# Try to execute if it's an Ibis table
|
|
2124
2178
|
if hasattr(col_data, "execute"):
|
|
2125
|
-
col_data_exec = col_data.execute()
|
|
2179
|
+
col_data_exec = col_data.execute() # type: ignore[operator]
|
|
2126
2180
|
if hasattr(col_data_exec, "to_list"): # Polars result
|
|
2127
2181
|
col_list = col_data_exec[column].to_list()
|
|
2128
2182
|
elif hasattr(col_data_exec, "tolist"): # Pandas result
|
|
@@ -2135,6 +2189,8 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2135
2189
|
# Fallback to direct list conversion
|
|
2136
2190
|
col_list = list(col_data[column])
|
|
2137
2191
|
|
|
2192
|
+
assert isinstance(col_list, list)
|
|
2193
|
+
|
|
2138
2194
|
# Validate based on spec type (checksum-based validations)
|
|
2139
2195
|
if spec_lower in ("isbn", "isbn-10", "isbn-13"):
|
|
2140
2196
|
is_valid_list = check_isbn(col_list)
|
|
@@ -2181,7 +2237,9 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2181
2237
|
return result_tbl.to_native()
|
|
2182
2238
|
|
|
2183
2239
|
|
|
2184
|
-
def interrogate_within_spec_db(
|
|
2240
|
+
def interrogate_within_spec_db(
|
|
2241
|
+
tbl: IntoFrame, column: str, values: dict[str, Any], na_pass: bool
|
|
2242
|
+
) -> Any:
|
|
2185
2243
|
"""
|
|
2186
2244
|
Database-native specification validation (proof of concept).
|
|
2187
2245
|
|
|
@@ -2202,7 +2260,7 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2202
2260
|
|
|
2203
2261
|
Returns
|
|
2204
2262
|
-------
|
|
2205
|
-
|
|
2263
|
+
Any
|
|
2206
2264
|
Result table with pb_is_good_ column indicating validation results.
|
|
2207
2265
|
|
|
2208
2266
|
Notes
|
|
@@ -2215,9 +2273,9 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2215
2273
|
spec_lower = spec.lower()
|
|
2216
2274
|
|
|
2217
2275
|
# Check if this is an Ibis table
|
|
2218
|
-
native_tbl = tbl
|
|
2219
|
-
if
|
|
2220
|
-
native_tbl = tbl.to_native()
|
|
2276
|
+
native_tbl: Any = tbl
|
|
2277
|
+
if is_narwhals_dataframe(tbl) or is_narwhals_lazyframe(tbl):
|
|
2278
|
+
native_tbl = tbl.to_native()
|
|
2221
2279
|
|
|
2222
2280
|
is_ibis = hasattr(native_tbl, "execute")
|
|
2223
2281
|
|
|
@@ -2284,7 +2342,7 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2284
2342
|
weights = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]
|
|
2285
2343
|
|
|
2286
2344
|
# Get the column as an Ibis expression
|
|
2287
|
-
col_expr = native_tbl[column]
|
|
2345
|
+
col_expr = native_tbl[column] # type: ignore[index]
|
|
2288
2346
|
|
|
2289
2347
|
# Basic checks: length must be 17, no invalid characters (I, O, Q)
|
|
2290
2348
|
valid_length = col_expr.length() == 17
|
|
@@ -2311,11 +2369,11 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2311
2369
|
value = ibis.cases(*conditions, else_=0) # Default: invalid char = 0 (will fail validation)
|
|
2312
2370
|
|
|
2313
2371
|
# Multiply by weight and add to checksum
|
|
2314
|
-
checksum = checksum + (value * weights[pos])
|
|
2372
|
+
checksum = checksum + (value * weights[pos]) # type: ignore[operator]
|
|
2315
2373
|
|
|
2316
2374
|
# Check digit calculation: checksum % 11
|
|
2317
2375
|
# If result is 10, check digit should be 'X', otherwise it's the digit itself
|
|
2318
|
-
expected_check = checksum % 11
|
|
2376
|
+
expected_check = checksum % 11 # type: ignore[operator]
|
|
2319
2377
|
actual_check_char = col_expr.upper().substr(8, 1) # Position 9 (0-indexed 8)
|
|
2320
2378
|
|
|
2321
2379
|
# Validate check digit using ibis.cases()
|
|
@@ -2338,14 +2396,14 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2338
2396
|
is_valid = is_valid.fill_null(False)
|
|
2339
2397
|
|
|
2340
2398
|
# Add validation column to table
|
|
2341
|
-
result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
|
|
2399
|
+
result_tbl = native_tbl.mutate(pb_is_good_=is_valid) # type: ignore[union-attr]
|
|
2342
2400
|
|
|
2343
2401
|
return result_tbl
|
|
2344
2402
|
|
|
2345
2403
|
|
|
2346
2404
|
def interrogate_credit_card_db(
|
|
2347
|
-
tbl:
|
|
2348
|
-
) ->
|
|
2405
|
+
tbl: IntoFrame, column: str, values: dict[str, str], na_pass: bool
|
|
2406
|
+
) -> Any:
|
|
2349
2407
|
"""
|
|
2350
2408
|
Database-native credit card validation using Luhn algorithm in SQL.
|
|
2351
2409
|
|
|
@@ -2367,7 +2425,7 @@ def interrogate_credit_card_db(
|
|
|
2367
2425
|
|
|
2368
2426
|
Returns
|
|
2369
2427
|
-------
|
|
2370
|
-
|
|
2428
|
+
Any
|
|
2371
2429
|
Result table with pb_is_good_ column indicating validation results.
|
|
2372
2430
|
|
|
2373
2431
|
Notes
|
|
@@ -2384,7 +2442,7 @@ def interrogate_credit_card_db(
|
|
|
2384
2442
|
# Check if this is an Ibis table
|
|
2385
2443
|
native_tbl = tbl
|
|
2386
2444
|
if hasattr(tbl, "to_native"):
|
|
2387
|
-
native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl
|
|
2445
|
+
native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl # type: ignore[operator]
|
|
2388
2446
|
|
|
2389
2447
|
is_ibis = hasattr(native_tbl, "execute")
|
|
2390
2448
|
|
|
@@ -2398,7 +2456,7 @@ def interrogate_credit_card_db(
|
|
|
2398
2456
|
raise ImportError("Ibis is required for database-native validation")
|
|
2399
2457
|
|
|
2400
2458
|
# Get the column as an Ibis expression
|
|
2401
|
-
col_expr = native_tbl[column]
|
|
2459
|
+
col_expr = native_tbl[column] # type: ignore[index]
|
|
2402
2460
|
|
|
2403
2461
|
# Step 1: Clean the input and remove spaces and hyphens
|
|
2404
2462
|
# First check format: only digits, spaces, and hyphens allowed
|
|
@@ -2451,7 +2509,7 @@ def interrogate_credit_card_db(
|
|
|
2451
2509
|
|
|
2452
2510
|
# Calculate contribution to checksum
|
|
2453
2511
|
# If should_double: double the digit, then if > 9 subtract 9
|
|
2454
|
-
doubled = digit_val * 2
|
|
2512
|
+
doubled = digit_val * 2 # type: ignore[operator]
|
|
2455
2513
|
adjusted = ibis.cases(
|
|
2456
2514
|
(should_double & (doubled > 9), doubled - 9),
|
|
2457
2515
|
(should_double, doubled),
|
|
@@ -2464,10 +2522,10 @@ def interrogate_credit_card_db(
|
|
|
2464
2522
|
else_=0,
|
|
2465
2523
|
)
|
|
2466
2524
|
|
|
2467
|
-
checksum = checksum + contribution
|
|
2525
|
+
checksum = checksum + contribution # type: ignore[operator]
|
|
2468
2526
|
|
|
2469
2527
|
# Step 4: Valid if checksum % 10 == 0
|
|
2470
|
-
luhn_valid = (checksum % 10) == 0
|
|
2528
|
+
luhn_valid = (checksum % 10) == 0 # type: ignore[operator]
|
|
2471
2529
|
|
|
2472
2530
|
# Combine all validation checks
|
|
2473
2531
|
is_valid = valid_chars & valid_length & luhn_valid
|
|
@@ -2481,30 +2539,32 @@ def interrogate_credit_card_db(
|
|
|
2481
2539
|
is_valid = is_valid.fill_null(False)
|
|
2482
2540
|
|
|
2483
2541
|
# Add validation column to table
|
|
2484
|
-
result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
|
|
2542
|
+
result_tbl = native_tbl.mutate(pb_is_good_=is_valid) # type: ignore[union-attr]
|
|
2485
2543
|
|
|
2486
2544
|
return result_tbl
|
|
2487
2545
|
|
|
2488
2546
|
|
|
2489
|
-
def interrogate_null(tbl:
|
|
2547
|
+
def interrogate_null(tbl: IntoFrame, column: str) -> Any:
|
|
2490
2548
|
"""Null interrogation."""
|
|
2491
2549
|
|
|
2492
2550
|
nw_tbl = nw.from_native(tbl)
|
|
2551
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2493
2552
|
result_tbl = nw_tbl.with_columns(pb_is_good_=nw.col(column).is_null())
|
|
2494
2553
|
return result_tbl.to_native()
|
|
2495
2554
|
|
|
2496
2555
|
|
|
2497
|
-
def interrogate_not_null(tbl:
|
|
2556
|
+
def interrogate_not_null(tbl: IntoFrame, column: str) -> Any:
|
|
2498
2557
|
"""Not null interrogation."""
|
|
2499
2558
|
|
|
2500
2559
|
nw_tbl = nw.from_native(tbl)
|
|
2560
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2501
2561
|
result_tbl = nw_tbl.with_columns(pb_is_good_=~nw.col(column).is_null())
|
|
2502
2562
|
return result_tbl.to_native()
|
|
2503
2563
|
|
|
2504
2564
|
|
|
2505
2565
|
def interrogate_increasing(
|
|
2506
|
-
tbl:
|
|
2507
|
-
) ->
|
|
2566
|
+
tbl: IntoFrame, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool
|
|
2567
|
+
) -> Any:
|
|
2508
2568
|
"""
|
|
2509
2569
|
Increasing interrogation.
|
|
2510
2570
|
|
|
@@ -2525,10 +2585,11 @@ def interrogate_increasing(
|
|
|
2525
2585
|
|
|
2526
2586
|
Returns
|
|
2527
2587
|
-------
|
|
2528
|
-
|
|
2588
|
+
Any
|
|
2529
2589
|
The table with a `pb_is_good_` column indicating pass/fail for each row.
|
|
2530
2590
|
"""
|
|
2531
2591
|
nw_tbl = nw.from_native(tbl)
|
|
2592
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2532
2593
|
|
|
2533
2594
|
# Create a lagged difference column
|
|
2534
2595
|
result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
|
|
@@ -2561,8 +2622,8 @@ def interrogate_increasing(
|
|
|
2561
2622
|
|
|
2562
2623
|
|
|
2563
2624
|
def interrogate_decreasing(
|
|
2564
|
-
tbl:
|
|
2565
|
-
) ->
|
|
2625
|
+
tbl: IntoFrame, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool
|
|
2626
|
+
) -> Any:
|
|
2566
2627
|
"""
|
|
2567
2628
|
Decreasing interrogation.
|
|
2568
2629
|
|
|
@@ -2583,10 +2644,11 @@ def interrogate_decreasing(
|
|
|
2583
2644
|
|
|
2584
2645
|
Returns
|
|
2585
2646
|
-------
|
|
2586
|
-
|
|
2647
|
+
Any
|
|
2587
2648
|
The table with a `pb_is_good_` column indicating pass/fail for each row.
|
|
2588
2649
|
"""
|
|
2589
2650
|
nw_tbl = nw.from_native(tbl)
|
|
2651
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2590
2652
|
|
|
2591
2653
|
# Create a lagged difference column
|
|
2592
2654
|
result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
|
|
@@ -2619,8 +2681,8 @@ def interrogate_decreasing(
|
|
|
2619
2681
|
|
|
2620
2682
|
|
|
2621
2683
|
def _interrogate_comparison_base(
|
|
2622
|
-
tbl:
|
|
2623
|
-
) ->
|
|
2684
|
+
tbl: IntoFrame, column: str, compare: Any, na_pass: bool, operator: str
|
|
2685
|
+
) -> Any:
|
|
2624
2686
|
"""
|
|
2625
2687
|
Unified base function for comparison operations (gt, ge, lt, le, eq, ne).
|
|
2626
2688
|
|
|
@@ -2639,13 +2701,14 @@ def _interrogate_comparison_base(
|
|
|
2639
2701
|
|
|
2640
2702
|
Returns
|
|
2641
2703
|
-------
|
|
2642
|
-
|
|
2704
|
+
Any
|
|
2643
2705
|
The result table with `pb_is_good_` column indicating the passing test units.
|
|
2644
2706
|
"""
|
|
2645
2707
|
|
|
2646
2708
|
compare_expr = _get_compare_expr_nw(compare=compare)
|
|
2647
2709
|
|
|
2648
2710
|
nw_tbl = nw.from_native(tbl)
|
|
2711
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2649
2712
|
compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare_expr)
|
|
2650
2713
|
|
|
2651
2714
|
# Create the comparison expression based on the operator
|
|
@@ -2692,7 +2755,7 @@ def _interrogate_comparison_base(
|
|
|
2692
2755
|
return result_tbl.to_native()
|
|
2693
2756
|
|
|
2694
2757
|
|
|
2695
|
-
def interrogate_rows_distinct(data_tbl:
|
|
2758
|
+
def interrogate_rows_distinct(data_tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
2696
2759
|
"""
|
|
2697
2760
|
Check if rows in a DataFrame are distinct.
|
|
2698
2761
|
|
|
@@ -2709,10 +2772,11 @@ def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None
|
|
|
2709
2772
|
|
|
2710
2773
|
Returns
|
|
2711
2774
|
-------
|
|
2712
|
-
|
|
2775
|
+
Any
|
|
2713
2776
|
A DataFrame with a `pb_is_good_` column indicating which rows pass the test.
|
|
2714
2777
|
"""
|
|
2715
2778
|
tbl = nw.from_native(data_tbl)
|
|
2779
|
+
assert is_narwhals_dataframe(tbl) or is_narwhals_lazyframe(tbl)
|
|
2716
2780
|
|
|
2717
2781
|
# Get the column subset to use for the test
|
|
2718
2782
|
if columns_subset is None:
|
|
@@ -2720,18 +2784,23 @@ def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None
|
|
|
2720
2784
|
|
|
2721
2785
|
# Create a count of duplicates using group_by approach
|
|
2722
2786
|
# Group by the columns of interest and count occurrences
|
|
2723
|
-
|
|
2724
|
-
|
|
2725
|
-
|
|
2726
|
-
|
|
2727
|
-
|
|
2728
|
-
|
|
2729
|
-
|
|
2730
|
-
|
|
2731
|
-
|
|
2787
|
+
# Handle DataFrame and LazyFrame separately for proper type narrowing
|
|
2788
|
+
if is_narwhals_dataframe(tbl):
|
|
2789
|
+
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
2790
|
+
result = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
2791
|
+
result = result.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
2792
|
+
return result.to_native()
|
|
2793
|
+
elif is_narwhals_lazyframe(tbl):
|
|
2794
|
+
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
2795
|
+
result = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
2796
|
+
result = result.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
2797
|
+
return result.to_native()
|
|
2798
|
+
else:
|
|
2799
|
+
msg = f"Expected DataFrame or LazyFrame, got {type(tbl)}"
|
|
2800
|
+
raise TypeError(msg)
|
|
2732
2801
|
|
|
2733
2802
|
|
|
2734
|
-
def interrogate_rows_complete(tbl:
|
|
2803
|
+
def interrogate_rows_complete(tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
2735
2804
|
"""Rows complete interrogation."""
|
|
2736
2805
|
nw_tbl = nw.from_native(tbl)
|
|
2737
2806
|
|
|
@@ -2747,12 +2816,25 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
|
|
|
2747
2816
|
return result_tbl.to_native()
|
|
2748
2817
|
|
|
2749
2818
|
|
|
2750
|
-
def interrogate_prompt(
|
|
2819
|
+
def interrogate_prompt(
|
|
2820
|
+
tbl: IntoFrame, columns_subset: list[str] | None, ai_config: dict[str, Any]
|
|
2821
|
+
) -> Any:
|
|
2751
2822
|
"""AI-powered interrogation of rows."""
|
|
2752
2823
|
import logging
|
|
2753
2824
|
|
|
2754
2825
|
logger = logging.getLogger(__name__)
|
|
2755
2826
|
|
|
2827
|
+
# Convert to narwhals early for consistent row counting
|
|
2828
|
+
nw_tbl = nw.from_native(tbl)
|
|
2829
|
+
# Get row count - for LazyFrame we need to use select/collect
|
|
2830
|
+
if is_narwhals_lazyframe(nw_tbl):
|
|
2831
|
+
row_count = nw_tbl.select(nw.len()).collect().item()
|
|
2832
|
+
assert isinstance(row_count, int)
|
|
2833
|
+
total_rows = row_count
|
|
2834
|
+
else:
|
|
2835
|
+
assert is_narwhals_dataframe(nw_tbl)
|
|
2836
|
+
total_rows = len(nw_tbl)
|
|
2837
|
+
|
|
2756
2838
|
try:
|
|
2757
2839
|
# Import AI validation modules
|
|
2758
2840
|
from pointblank._utils_ai import (
|
|
@@ -2809,28 +2891,25 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2809
2891
|
)
|
|
2810
2892
|
|
|
2811
2893
|
# Parse and combine results with signature mapping optimization
|
|
2812
|
-
parser = _ValidationResponseParser(total_rows=
|
|
2894
|
+
parser = _ValidationResponseParser(total_rows=total_rows)
|
|
2813
2895
|
combined_results = parser.combine_batch_results(batch_results, signature_mapping)
|
|
2814
2896
|
|
|
2815
2897
|
# Debug: Log table info and combined results
|
|
2816
2898
|
logger.debug("🏁 Final result conversion:")
|
|
2817
|
-
logger.debug(f" - Table length: {
|
|
2899
|
+
logger.debug(f" - Table length: {total_rows}")
|
|
2818
2900
|
logger.debug(
|
|
2819
2901
|
f" - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
|
|
2820
2902
|
)
|
|
2821
2903
|
|
|
2822
|
-
# Convert results to narwhals format
|
|
2823
|
-
nw_tbl = nw.from_native(tbl)
|
|
2824
|
-
|
|
2825
2904
|
# Create a boolean column for validation results
|
|
2826
2905
|
validation_results = []
|
|
2827
|
-
for i in range(
|
|
2906
|
+
for i in range(total_rows):
|
|
2828
2907
|
# Default to False if row wasn't processed
|
|
2829
2908
|
result = combined_results.get(i, False)
|
|
2830
2909
|
validation_results.append(result)
|
|
2831
2910
|
|
|
2832
2911
|
# Debug: Log first few conversions
|
|
2833
|
-
if i < 5 or
|
|
2912
|
+
if i < 5 or total_rows - i <= 2:
|
|
2834
2913
|
logger.debug(f" Row {i}: {result} (from combined_results.get({i}, False))")
|
|
2835
2914
|
|
|
2836
2915
|
logger.debug(f" - Final validation_results length: {len(validation_results)}")
|
|
@@ -2869,10 +2948,9 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2869
2948
|
logger.error(f"Missing dependencies for AI validation: {e}")
|
|
2870
2949
|
logger.error("Install required packages: pip install openai anthropic aiohttp")
|
|
2871
2950
|
|
|
2872
|
-
# Return all False results as fallback
|
|
2873
|
-
nw_tbl = nw.from_native(tbl)
|
|
2951
|
+
# Return all False results as fallback (nw_tbl and total_rows defined at function start)
|
|
2874
2952
|
native_tbl = nw_tbl.to_native()
|
|
2875
|
-
validation_results = [False] *
|
|
2953
|
+
validation_results = [False] * total_rows
|
|
2876
2954
|
|
|
2877
2955
|
if hasattr(native_tbl, "with_columns"): # Polars
|
|
2878
2956
|
import polars as pl
|
|
@@ -2894,10 +2972,9 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2894
2972
|
except Exception as e:
|
|
2895
2973
|
logger.error(f"AI validation failed: {e}")
|
|
2896
2974
|
|
|
2897
|
-
# Return all False results as fallback
|
|
2898
|
-
nw_tbl = nw.from_native(tbl)
|
|
2975
|
+
# Return all False results as fallback (nw_tbl and total_rows defined at function start)
|
|
2899
2976
|
native_tbl = nw_tbl.to_native()
|
|
2900
|
-
validation_results = [False] *
|
|
2977
|
+
validation_results = [False] * total_rows
|
|
2901
2978
|
|
|
2902
2979
|
if hasattr(native_tbl, "with_columns"): # Polars
|
|
2903
2980
|
import polars as pl
|