pointblank 0.17.0__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +334 -55
- pointblank/_constants_translations.py +378 -0
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +406 -149
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +40 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2695 -49
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +2034 -233
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +10 -6
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/METADATA +2 -2
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/RECORD +30 -28
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/WHEEL +1 -1
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
|
@@ -3,11 +3,16 @@ from __future__ import annotations
|
|
|
3
3
|
import functools
|
|
4
4
|
from collections.abc import Callable
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
from zoneinfo import ZoneInfo
|
|
7
8
|
|
|
8
9
|
import narwhals as nw
|
|
9
|
-
from narwhals.dependencies import
|
|
10
|
-
|
|
10
|
+
from narwhals.dependencies import (
|
|
11
|
+
is_narwhals_dataframe,
|
|
12
|
+
is_narwhals_lazyframe,
|
|
13
|
+
is_pandas_dataframe,
|
|
14
|
+
is_polars_dataframe,
|
|
15
|
+
)
|
|
11
16
|
|
|
12
17
|
from pointblank._constants import IBIS_BACKENDS
|
|
13
18
|
from pointblank._spec_utils import (
|
|
@@ -25,6 +30,9 @@ from pointblank._utils import (
|
|
|
25
30
|
)
|
|
26
31
|
from pointblank.column import Column
|
|
27
32
|
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from narwhals.typing import IntoFrame
|
|
35
|
+
|
|
28
36
|
|
|
29
37
|
def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
|
|
30
38
|
"""
|
|
@@ -94,7 +102,9 @@ def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val:
|
|
|
94
102
|
return compare_val
|
|
95
103
|
|
|
96
104
|
|
|
97
|
-
def _safe_is_nan_or_null_expr(
|
|
105
|
+
def _safe_is_nan_or_null_expr(
|
|
106
|
+
data_frame: Any, column_expr: Any, column_name: str | None = None
|
|
107
|
+
) -> Any:
|
|
98
108
|
"""
|
|
99
109
|
Create an expression that safely checks for both Null and NaN values.
|
|
100
110
|
|
|
@@ -425,7 +435,7 @@ class SpeciallyValidation:
|
|
|
425
435
|
else:
|
|
426
436
|
self.tbl_type = tbl_type
|
|
427
437
|
|
|
428
|
-
def get_test_results(self) ->
|
|
438
|
+
def get_test_results(self) -> Any | list[bool]:
|
|
429
439
|
"""Evaluate the expression get either a list of booleans or a results table."""
|
|
430
440
|
|
|
431
441
|
# Get the expression and inspect whether there is a `data` argument
|
|
@@ -519,7 +529,7 @@ class NumberOfTestUnits:
|
|
|
519
529
|
Count the number of test units in a column.
|
|
520
530
|
"""
|
|
521
531
|
|
|
522
|
-
df:
|
|
532
|
+
df: Any # Can be IntoFrame or Ibis table
|
|
523
533
|
column: str
|
|
524
534
|
|
|
525
535
|
def get_test_units(self, tbl_type: str) -> int:
|
|
@@ -536,15 +546,18 @@ class NumberOfTestUnits:
|
|
|
536
546
|
)
|
|
537
547
|
|
|
538
548
|
# Handle LazyFrames which don't have len()
|
|
539
|
-
if
|
|
549
|
+
if is_narwhals_lazyframe(dfn):
|
|
540
550
|
dfn = dfn.collect()
|
|
541
551
|
|
|
552
|
+
assert is_narwhals_dataframe(dfn)
|
|
542
553
|
return len(dfn)
|
|
543
554
|
|
|
544
555
|
if tbl_type in IBIS_BACKENDS:
|
|
545
556
|
# Get the count of test units and convert to a native format
|
|
546
557
|
# TODO: check whether pandas or polars is available
|
|
547
|
-
return self.df.count().to_polars()
|
|
558
|
+
return self.df.count().to_polars() # type: ignore[union-attr]
|
|
559
|
+
|
|
560
|
+
raise ValueError(f"Unsupported table type: {tbl_type}")
|
|
548
561
|
|
|
549
562
|
|
|
550
563
|
def _get_compare_expr_nw(compare: Any) -> Any:
|
|
@@ -555,28 +568,25 @@ def _get_compare_expr_nw(compare: Any) -> Any:
|
|
|
555
568
|
return compare
|
|
556
569
|
|
|
557
570
|
|
|
558
|
-
def _column_has_null_values(table:
|
|
571
|
+
def _column_has_null_values(table: nw.DataFrame[Any] | nw.LazyFrame[Any], column: str) -> bool:
|
|
559
572
|
try:
|
|
560
|
-
# Try the standard null_count() method
|
|
561
|
-
null_count = (table.select(column).null_count())[column][0]
|
|
573
|
+
# Try the standard null_count() method (DataFrame)
|
|
574
|
+
null_count = (table.select(column).null_count())[column][0] # type: ignore[union-attr]
|
|
562
575
|
except AttributeError:
|
|
563
576
|
# For LazyFrames, collect first then get null count
|
|
564
577
|
try:
|
|
565
|
-
collected = table.select(column).collect()
|
|
578
|
+
collected = table.select(column).collect() # type: ignore[union-attr]
|
|
566
579
|
null_count = (collected.null_count())[column][0]
|
|
567
580
|
except Exception:
|
|
568
581
|
# Fallback: check if any values are null
|
|
569
582
|
try:
|
|
570
|
-
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
|
|
583
|
+
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect() # type: ignore[union-attr]
|
|
571
584
|
null_count = result["null_count"][0]
|
|
572
585
|
except Exception:
|
|
573
586
|
# Last resort: return False (assume no nulls)
|
|
574
587
|
return False
|
|
575
588
|
|
|
576
|
-
|
|
577
|
-
return False
|
|
578
|
-
|
|
579
|
-
return True
|
|
589
|
+
return null_count is not None and null_count > 0
|
|
580
590
|
|
|
581
591
|
|
|
582
592
|
def _check_nulls_across_columns_nw(table, columns_subset):
|
|
@@ -596,7 +606,7 @@ def _check_nulls_across_columns_nw(table, columns_subset):
|
|
|
596
606
|
return result
|
|
597
607
|
|
|
598
608
|
|
|
599
|
-
def _modify_datetime_compare_val(tgt_column:
|
|
609
|
+
def _modify_datetime_compare_val(tgt_column: Any, compare_val: Any) -> Any:
|
|
600
610
|
tgt_col_dtype_str = str(tgt_column.dtype).lower()
|
|
601
611
|
|
|
602
612
|
if compare_val is isinstance(compare_val, Column): # pragma: no cover
|
|
@@ -640,7 +650,7 @@ def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any:
|
|
|
640
650
|
return compare_expr
|
|
641
651
|
|
|
642
652
|
|
|
643
|
-
def col_vals_expr(data_tbl:
|
|
653
|
+
def col_vals_expr(data_tbl: Any, expr: Any, tbl_type: str = "local") -> Any:
|
|
644
654
|
"""Check if values in a column evaluate to True for a given predicate expression."""
|
|
645
655
|
if tbl_type == "local":
|
|
646
656
|
# Check the type of expression provided
|
|
@@ -670,21 +680,19 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
|
|
|
670
680
|
return data_tbl # pragma: no cover
|
|
671
681
|
|
|
672
682
|
|
|
673
|
-
def rows_complete(data_tbl:
|
|
683
|
+
def rows_complete(data_tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
674
684
|
"""
|
|
675
685
|
Check if rows in a DataFrame are complete (no null values).
|
|
676
686
|
|
|
677
687
|
This function replaces the RowsComplete dataclass for direct usage.
|
|
678
688
|
"""
|
|
679
|
-
tbl = _convert_to_narwhals(df=data_tbl)
|
|
680
|
-
|
|
681
689
|
return interrogate_rows_complete(
|
|
682
|
-
tbl=
|
|
690
|
+
tbl=data_tbl,
|
|
683
691
|
columns_subset=columns_subset,
|
|
684
692
|
)
|
|
685
693
|
|
|
686
694
|
|
|
687
|
-
def col_exists(data_tbl:
|
|
695
|
+
def col_exists(data_tbl: IntoFrame, column: str) -> bool:
|
|
688
696
|
"""
|
|
689
697
|
Check if a column exists in a DataFrame.
|
|
690
698
|
|
|
@@ -705,8 +713,8 @@ def col_exists(data_tbl: FrameT, column: str) -> bool:
|
|
|
705
713
|
|
|
706
714
|
|
|
707
715
|
def col_schema_match(
|
|
708
|
-
data_tbl:
|
|
709
|
-
schema,
|
|
716
|
+
data_tbl: IntoFrame,
|
|
717
|
+
schema: Any,
|
|
710
718
|
complete: bool,
|
|
711
719
|
in_order: bool,
|
|
712
720
|
case_sensitive_colnames: bool,
|
|
@@ -730,7 +738,9 @@ def col_schema_match(
|
|
|
730
738
|
)
|
|
731
739
|
|
|
732
740
|
|
|
733
|
-
def row_count_match(
|
|
741
|
+
def row_count_match(
|
|
742
|
+
data_tbl: IntoFrame, count: Any, inverse: bool, abs_tol_bounds: AbsoluteBounds
|
|
743
|
+
) -> bool:
|
|
734
744
|
"""
|
|
735
745
|
Check if DataFrame row count matches expected count.
|
|
736
746
|
"""
|
|
@@ -748,28 +758,33 @@ def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> b
|
|
|
748
758
|
|
|
749
759
|
|
|
750
760
|
def col_pct_null(
|
|
751
|
-
data_tbl:
|
|
761
|
+
data_tbl: IntoFrame, column: str, p: float, bound_finder: Callable[[int], AbsoluteBounds]
|
|
752
762
|
) -> bool:
|
|
753
763
|
"""Check if the percentage of null vales are within p given the absolute bounds."""
|
|
754
|
-
|
|
755
|
-
nw_tbl = nw.from_native(data_tbl)
|
|
756
|
-
|
|
764
|
+
nw_frame = nw.from_native(data_tbl)
|
|
757
765
|
# Handle LazyFrames by collecting them first
|
|
758
|
-
if
|
|
759
|
-
|
|
766
|
+
if is_narwhals_lazyframe(nw_frame):
|
|
767
|
+
nw_frame = nw_frame.collect()
|
|
768
|
+
|
|
769
|
+
assert is_narwhals_dataframe(nw_frame)
|
|
770
|
+
|
|
771
|
+
# We cast as int because it could come back as an arbitary type. For example if the backend
|
|
772
|
+
# is numpy-like, we might get a scalar from `item()`. `int()` expects a certain signature though
|
|
773
|
+
# and `object` does not satisfy so we have to go with the type ignore.
|
|
774
|
+
total_rows: object = nw_frame.select(nw.len()).item()
|
|
775
|
+
total_rows: int = int(total_rows) # type: ignore
|
|
760
776
|
|
|
761
|
-
# Get total rows using narwhals
|
|
762
|
-
total_rows: int = nw_tbl.select(nw.len()).item()
|
|
763
777
|
abs_target: float = round(total_rows * p)
|
|
764
778
|
lower_bound, upper_bound = bound_finder(abs_target)
|
|
765
779
|
|
|
766
|
-
# Count null values
|
|
767
|
-
n_null:
|
|
780
|
+
# Count null values (see above comment on typing shenanigans)
|
|
781
|
+
n_null: object = nw_frame.select(nw.col(column).is_null().sum()).item()
|
|
782
|
+
n_null: int = int(n_null) # type: ignore
|
|
768
783
|
|
|
769
784
|
return n_null >= (abs_target - lower_bound) and n_null <= (abs_target + upper_bound)
|
|
770
785
|
|
|
771
786
|
|
|
772
|
-
def col_count_match(data_tbl:
|
|
787
|
+
def col_count_match(data_tbl: IntoFrame, count: Any, inverse: bool) -> bool:
|
|
773
788
|
"""
|
|
774
789
|
Check if DataFrame column count matches expected count.
|
|
775
790
|
"""
|
|
@@ -781,7 +796,7 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
|
|
|
781
796
|
return get_column_count(data=data_tbl) != count
|
|
782
797
|
|
|
783
798
|
|
|
784
|
-
def _coerce_to_common_backend(data_tbl:
|
|
799
|
+
def _coerce_to_common_backend(data_tbl: Any, tbl_compare: Any) -> tuple[Any, Any]:
|
|
785
800
|
"""
|
|
786
801
|
Coerce two tables to the same backend if they differ.
|
|
787
802
|
|
|
@@ -798,7 +813,7 @@ def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[Fr
|
|
|
798
813
|
|
|
799
814
|
Returns
|
|
800
815
|
-------
|
|
801
|
-
tuple[
|
|
816
|
+
tuple[Any, Any]
|
|
802
817
|
Both tables, with tbl_compare potentially converted to data_tbl's backend.
|
|
803
818
|
"""
|
|
804
819
|
# Get backend types for both tables
|
|
@@ -884,7 +899,7 @@ def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[Fr
|
|
|
884
899
|
return data_tbl, tbl_compare
|
|
885
900
|
|
|
886
901
|
|
|
887
|
-
def tbl_match(data_tbl:
|
|
902
|
+
def tbl_match(data_tbl: IntoFrame, tbl_compare: IntoFrame) -> bool:
|
|
888
903
|
"""
|
|
889
904
|
Check if two tables match exactly in schema, row count, and data.
|
|
890
905
|
|
|
@@ -998,33 +1013,37 @@ def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
|
|
|
998
1013
|
|
|
999
1014
|
# Convert to native format for comparison
|
|
1000
1015
|
# We need to collect if lazy frames
|
|
1001
|
-
if
|
|
1016
|
+
if is_narwhals_lazyframe(col_data_1):
|
|
1002
1017
|
col_data_1 = col_data_1.collect()
|
|
1003
1018
|
|
|
1004
|
-
if
|
|
1019
|
+
if is_narwhals_lazyframe(col_data_2):
|
|
1005
1020
|
col_data_2 = col_data_2.collect()
|
|
1006
1021
|
|
|
1007
1022
|
# Convert to native and then to lists for comparison
|
|
1008
|
-
|
|
1009
|
-
|
|
1023
|
+
# Native frames could be Polars, Pandas, or Ibis - use Any for dynamic access
|
|
1024
|
+
col_1_native: Any = col_data_1.to_native()
|
|
1025
|
+
col_2_native: Any = col_data_2.to_native()
|
|
1010
1026
|
|
|
1011
1027
|
# Extract values as lists for comparison
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1028
|
+
# Note: We use hasattr for runtime detection but maintain Any typing
|
|
1029
|
+
values_1: list[Any]
|
|
1030
|
+
values_2: list[Any]
|
|
1031
|
+
if hasattr(col_1_native, "to_list"): # Polars DataFrame
|
|
1032
|
+
values_1 = col_1_native[col_name].to_list() # type: ignore[index]
|
|
1033
|
+
values_2 = col_2_native[col_name].to_list() # type: ignore[index]
|
|
1015
1034
|
|
|
1016
|
-
elif hasattr(col_1_native, "tolist"): # Pandas
|
|
1017
|
-
values_1 = col_1_native[col_name].tolist()
|
|
1018
|
-
values_2 = col_2_native[col_name].tolist()
|
|
1035
|
+
elif hasattr(col_1_native, "tolist"): # Pandas DataFrame
|
|
1036
|
+
values_1 = col_1_native[col_name].tolist() # type: ignore[index]
|
|
1037
|
+
values_2 = col_2_native[col_name].tolist() # type: ignore[index]
|
|
1019
1038
|
|
|
1020
1039
|
elif hasattr(col_1_native, "collect"): # Ibis
|
|
1021
|
-
values_1 = col_1_native[col_name].to_pandas().tolist()
|
|
1022
|
-
values_2 = col_2_native[col_name].to_pandas().tolist()
|
|
1040
|
+
values_1 = col_1_native[col_name].to_pandas().tolist() # type: ignore[index]
|
|
1041
|
+
values_2 = col_2_native[col_name].to_pandas().tolist() # type: ignore[index]
|
|
1023
1042
|
|
|
1024
1043
|
else:
|
|
1025
1044
|
# Fallback: try direct comparison
|
|
1026
|
-
values_1 = list(col_1_native[col_name])
|
|
1027
|
-
values_2 = list(col_2_native[col_name])
|
|
1045
|
+
values_1 = list(col_1_native[col_name]) # type: ignore[index]
|
|
1046
|
+
values_2 = list(col_2_native[col_name]) # type: ignore[index]
|
|
1028
1047
|
|
|
1029
1048
|
# Compare the two lists element by element, handling NaN/None
|
|
1030
1049
|
if len(values_1) != len(values_2):
|
|
@@ -1086,7 +1105,9 @@ def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
|
|
|
1086
1105
|
return True
|
|
1087
1106
|
|
|
1088
1107
|
|
|
1089
|
-
def conjointly_validation(
|
|
1108
|
+
def conjointly_validation(
|
|
1109
|
+
data_tbl: IntoFrame, expressions: Any, threshold: int, tbl_type: str = "local"
|
|
1110
|
+
) -> Any:
|
|
1090
1111
|
"""
|
|
1091
1112
|
Perform conjoint validation using multiple expressions.
|
|
1092
1113
|
"""
|
|
@@ -1101,30 +1122,32 @@ def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_typ
|
|
|
1101
1122
|
return conjointly_instance.get_test_results()
|
|
1102
1123
|
|
|
1103
1124
|
|
|
1104
|
-
|
|
1125
|
+
# TODO: we can certainly simplify this
|
|
1126
|
+
def interrogate_gt(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1105
1127
|
"""Greater than interrogation."""
|
|
1106
1128
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "gt")
|
|
1107
1129
|
|
|
1108
1130
|
|
|
1109
|
-
def interrogate_lt(tbl:
|
|
1131
|
+
def interrogate_lt(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1110
1132
|
"""Less than interrogation."""
|
|
1111
1133
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "lt")
|
|
1112
1134
|
|
|
1113
1135
|
|
|
1114
|
-
def interrogate_ge(tbl:
|
|
1136
|
+
def interrogate_ge(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1115
1137
|
"""Greater than or equal interrogation."""
|
|
1116
1138
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "ge")
|
|
1117
1139
|
|
|
1118
1140
|
|
|
1119
|
-
def interrogate_le(tbl:
|
|
1141
|
+
def interrogate_le(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1120
1142
|
"""Less than or equal interrogation."""
|
|
1121
1143
|
return _interrogate_comparison_base(tbl, column, compare, na_pass, "le")
|
|
1122
1144
|
|
|
1123
1145
|
|
|
1124
|
-
def interrogate_eq(tbl:
|
|
1146
|
+
def interrogate_eq(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1125
1147
|
"""Equal interrogation."""
|
|
1126
1148
|
|
|
1127
1149
|
nw_tbl = nw.from_native(tbl)
|
|
1150
|
+
assert is_narwhals_dataframe(nw_tbl) or is_narwhals_lazyframe(nw_tbl)
|
|
1128
1151
|
|
|
1129
1152
|
if isinstance(compare, Column):
|
|
1130
1153
|
compare_expr = _get_compare_expr_nw(compare=compare)
|
|
@@ -1170,10 +1193,10 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1170
1193
|
)
|
|
1171
1194
|
result_tbl = result_tbl.rename({"pb_is_good_4_tmp": "pb_is_good_4"})
|
|
1172
1195
|
elif "cannot compare" in str(e).lower():
|
|
1173
|
-
# Handle genuine type incompatibility
|
|
1196
|
+
# Handle genuine type incompatibility - native_df type varies by backend
|
|
1174
1197
|
native_df = result_tbl.to_native()
|
|
1175
|
-
col_dtype = str(native_df[column].dtype)
|
|
1176
|
-
compare_dtype = str(native_df[compare.name].dtype)
|
|
1198
|
+
col_dtype = str(native_df[column].dtype) # type: ignore[index]
|
|
1199
|
+
compare_dtype = str(native_df[compare.name].dtype) # type: ignore[index]
|
|
1177
1200
|
|
|
1178
1201
|
raise TypeError(
|
|
1179
1202
|
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
@@ -1208,21 +1231,19 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1208
1231
|
or "conversion" in error_msg
|
|
1209
1232
|
and "failed" in error_msg
|
|
1210
1233
|
):
|
|
1211
|
-
# Get column types for a descriptive error message
|
|
1234
|
+
# Get column types for a descriptive error message - native type varies by backend
|
|
1235
|
+
col_dtype = "unknown"
|
|
1236
|
+
compare_dtype = "unknown"
|
|
1212
1237
|
try:
|
|
1213
1238
|
native_df = result_tbl.to_native()
|
|
1214
1239
|
if hasattr(native_df, "dtypes"):
|
|
1215
|
-
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1216
|
-
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
|
|
1240
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown")) # type: ignore[union-attr]
|
|
1241
|
+
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown")) # type: ignore[union-attr]
|
|
1217
1242
|
elif hasattr(native_df, "schema"):
|
|
1218
|
-
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1219
|
-
compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
|
|
1220
|
-
else:
|
|
1221
|
-
col_dtype = "unknown"
|
|
1222
|
-
compare_dtype = "unknown"
|
|
1243
|
+
col_dtype = str(native_df.schema.get(column, "unknown")) # type: ignore[union-attr]
|
|
1244
|
+
compare_dtype = str(native_df.schema.get(compare.name, "unknown")) # type: ignore[union-attr]
|
|
1223
1245
|
except Exception:
|
|
1224
|
-
|
|
1225
|
-
compare_dtype = "unknown"
|
|
1246
|
+
pass
|
|
1226
1247
|
|
|
1227
1248
|
raise TypeError(
|
|
1228
1249
|
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
@@ -1271,17 +1292,16 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1271
1292
|
or "conversion" in error_msg
|
|
1272
1293
|
and "failed" in error_msg
|
|
1273
1294
|
):
|
|
1274
|
-
# Get column type for a descriptive error message
|
|
1295
|
+
# Get column type for a descriptive error message - native type varies by backend
|
|
1296
|
+
col_dtype = "unknown"
|
|
1275
1297
|
try:
|
|
1276
1298
|
native_df = result_tbl.to_native()
|
|
1277
1299
|
if hasattr(native_df, "dtypes"):
|
|
1278
|
-
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1300
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown")) # type: ignore[union-attr]
|
|
1279
1301
|
elif hasattr(native_df, "schema"):
|
|
1280
|
-
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1281
|
-
else:
|
|
1282
|
-
col_dtype = "unknown"
|
|
1302
|
+
col_dtype = str(native_df.schema.get(column, "unknown")) # type: ignore[union-attr]
|
|
1283
1303
|
except Exception:
|
|
1284
|
-
|
|
1304
|
+
pass
|
|
1285
1305
|
|
|
1286
1306
|
compare_type = type(compare).__name__
|
|
1287
1307
|
compare_value = str(compare)
|
|
@@ -1311,10 +1331,11 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1311
1331
|
return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
|
|
1312
1332
|
|
|
1313
1333
|
|
|
1314
|
-
def interrogate_ne(tbl:
|
|
1334
|
+
def interrogate_ne(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
|
|
1315
1335
|
"""Not equal interrogation."""
|
|
1316
1336
|
|
|
1317
1337
|
nw_tbl = nw.from_native(tbl)
|
|
1338
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1318
1339
|
|
|
1319
1340
|
# Determine if the reference and comparison columns have any null values
|
|
1320
1341
|
ref_col_has_null_vals = _column_has_null_values(table=nw_tbl, column=column)
|
|
@@ -1867,14 +1888,15 @@ def interrogate_ne(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
|
|
|
1867
1888
|
|
|
1868
1889
|
|
|
1869
1890
|
def interrogate_between(
|
|
1870
|
-
tbl:
|
|
1871
|
-
) ->
|
|
1891
|
+
tbl: IntoFrame, column: str, low: Any, high: Any, inclusive: tuple[bool, bool], na_pass: bool
|
|
1892
|
+
) -> Any:
|
|
1872
1893
|
"""Between interrogation."""
|
|
1873
1894
|
|
|
1874
1895
|
low_val = _get_compare_expr_nw(compare=low)
|
|
1875
1896
|
high_val = _get_compare_expr_nw(compare=high)
|
|
1876
1897
|
|
|
1877
1898
|
nw_tbl = nw.from_native(tbl)
|
|
1899
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1878
1900
|
low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
|
|
1879
1901
|
high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)
|
|
1880
1902
|
|
|
@@ -1936,14 +1958,15 @@ def interrogate_between(
|
|
|
1936
1958
|
|
|
1937
1959
|
|
|
1938
1960
|
def interrogate_outside(
|
|
1939
|
-
tbl:
|
|
1940
|
-
) ->
|
|
1961
|
+
tbl: IntoFrame, column: str, low: Any, high: Any, inclusive: tuple[bool, bool], na_pass: bool
|
|
1962
|
+
) -> Any:
|
|
1941
1963
|
"""Outside range interrogation."""
|
|
1942
1964
|
|
|
1943
1965
|
low_val = _get_compare_expr_nw(compare=low)
|
|
1944
1966
|
high_val = _get_compare_expr_nw(compare=high)
|
|
1945
1967
|
|
|
1946
1968
|
nw_tbl = nw.from_native(tbl)
|
|
1969
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
1947
1970
|
low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
|
|
1948
1971
|
high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)
|
|
1949
1972
|
|
|
@@ -2002,10 +2025,11 @@ def interrogate_outside(
|
|
|
2002
2025
|
return result_tbl.to_native()
|
|
2003
2026
|
|
|
2004
2027
|
|
|
2005
|
-
def interrogate_isin(tbl:
|
|
2028
|
+
def interrogate_isin(tbl: IntoFrame, column: str, set_values: Any) -> Any:
|
|
2006
2029
|
"""In set interrogation."""
|
|
2007
2030
|
|
|
2008
2031
|
nw_tbl = nw.from_native(tbl)
|
|
2032
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2009
2033
|
|
|
2010
2034
|
can_be_null: bool = None in set_values
|
|
2011
2035
|
base_expr: nw.Expr = nw.col(column).is_in(set_values)
|
|
@@ -2016,17 +2040,20 @@ def interrogate_isin(tbl: FrameT, column: str, set_values: any) -> FrameT:
|
|
|
2016
2040
|
return result_tbl.to_native()
|
|
2017
2041
|
|
|
2018
2042
|
|
|
2019
|
-
def interrogate_notin(tbl:
|
|
2043
|
+
def interrogate_notin(tbl: IntoFrame, column: str, set_values: Any) -> Any:
|
|
2020
2044
|
"""Not in set interrogation."""
|
|
2021
2045
|
|
|
2022
2046
|
nw_tbl = nw.from_native(tbl)
|
|
2047
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2023
2048
|
result_tbl = nw_tbl.with_columns(
|
|
2024
2049
|
pb_is_good_=nw.col(column).is_in(set_values),
|
|
2025
2050
|
).with_columns(pb_is_good_=~nw.col("pb_is_good_"))
|
|
2026
2051
|
return result_tbl.to_native()
|
|
2027
2052
|
|
|
2028
2053
|
|
|
2029
|
-
def interrogate_regex(
|
|
2054
|
+
def interrogate_regex(
|
|
2055
|
+
tbl: IntoFrame, column: str, values: dict[str, Any] | str, na_pass: bool
|
|
2056
|
+
) -> Any:
|
|
2030
2057
|
"""Regex interrogation."""
|
|
2031
2058
|
|
|
2032
2059
|
# Handle both old and new formats for backward compatibility
|
|
@@ -2038,6 +2065,7 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
|
|
|
2038
2065
|
inverse = values["inverse"]
|
|
2039
2066
|
|
|
2040
2067
|
nw_tbl = nw.from_native(tbl)
|
|
2068
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2041
2069
|
result_tbl = nw_tbl.with_columns(
|
|
2042
2070
|
pb_is_good_1=nw.col(column).is_null() & na_pass,
|
|
2043
2071
|
pb_is_good_2=nw.col(column).str.contains(pattern, literal=False).fill_null(False),
|
|
@@ -2057,7 +2085,9 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
|
|
|
2057
2085
|
return result_tbl.to_native()
|
|
2058
2086
|
|
|
2059
2087
|
|
|
2060
|
-
def interrogate_within_spec(
|
|
2088
|
+
def interrogate_within_spec(
|
|
2089
|
+
tbl: IntoFrame, column: str, values: dict[str, Any], na_pass: bool
|
|
2090
|
+
) -> Any:
|
|
2061
2091
|
"""Within specification interrogation."""
|
|
2062
2092
|
from pointblank._spec_utils import (
|
|
2063
2093
|
regex_email,
|
|
@@ -2082,6 +2112,7 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2082
2112
|
|
|
2083
2113
|
# Convert to Narwhals for cross-backend compatibility
|
|
2084
2114
|
nw_tbl = nw.from_native(tbl)
|
|
2115
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2085
2116
|
|
|
2086
2117
|
# Regex-based specifications can use Narwhals directly (no materialization needed)
|
|
2087
2118
|
regex_specs = {
|
|
@@ -2135,18 +2166,18 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2135
2166
|
|
|
2136
2167
|
# For non-Ibis tables or other specs, materialize data and use Python validation
|
|
2137
2168
|
# Get the column data as a list
|
|
2138
|
-
col_data = nw_tbl.select(column).to_native()
|
|
2169
|
+
col_data: Any = nw_tbl.select(column).to_native()
|
|
2139
2170
|
|
|
2140
|
-
# Convert to list based on backend
|
|
2171
|
+
# Convert to list based on backend - type varies so use duck typing
|
|
2141
2172
|
if hasattr(col_data, "to_list"): # Polars
|
|
2142
|
-
col_list = col_data[column].to_list()
|
|
2173
|
+
col_list = col_data[column].to_list() # type: ignore[index]
|
|
2143
2174
|
elif hasattr(col_data, "tolist"): # Pandas
|
|
2144
|
-
col_list = col_data[column].tolist()
|
|
2175
|
+
col_list = col_data[column].tolist() # type: ignore[index]
|
|
2145
2176
|
else: # For Ibis tables, we need to execute the query first
|
|
2146
2177
|
try:
|
|
2147
2178
|
# Try to execute if it's an Ibis table
|
|
2148
2179
|
if hasattr(col_data, "execute"):
|
|
2149
|
-
col_data_exec = col_data.execute()
|
|
2180
|
+
col_data_exec = col_data.execute() # type: ignore[operator]
|
|
2150
2181
|
if hasattr(col_data_exec, "to_list"): # Polars result
|
|
2151
2182
|
col_list = col_data_exec[column].to_list()
|
|
2152
2183
|
elif hasattr(col_data_exec, "tolist"): # Pandas result
|
|
@@ -2159,6 +2190,8 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2159
2190
|
# Fallback to direct list conversion
|
|
2160
2191
|
col_list = list(col_data[column])
|
|
2161
2192
|
|
|
2193
|
+
assert isinstance(col_list, list)
|
|
2194
|
+
|
|
2162
2195
|
# Validate based on spec type (checksum-based validations)
|
|
2163
2196
|
if spec_lower in ("isbn", "isbn-10", "isbn-13"):
|
|
2164
2197
|
is_valid_list = check_isbn(col_list)
|
|
@@ -2205,7 +2238,9 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
|
|
|
2205
2238
|
return result_tbl.to_native()
|
|
2206
2239
|
|
|
2207
2240
|
|
|
2208
|
-
def interrogate_within_spec_db(
|
|
2241
|
+
def interrogate_within_spec_db(
|
|
2242
|
+
tbl: IntoFrame, column: str, values: dict[str, Any], na_pass: bool
|
|
2243
|
+
) -> Any:
|
|
2209
2244
|
"""
|
|
2210
2245
|
Database-native specification validation (proof of concept).
|
|
2211
2246
|
|
|
@@ -2226,7 +2261,7 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2226
2261
|
|
|
2227
2262
|
Returns
|
|
2228
2263
|
-------
|
|
2229
|
-
|
|
2264
|
+
Any
|
|
2230
2265
|
Result table with pb_is_good_ column indicating validation results.
|
|
2231
2266
|
|
|
2232
2267
|
Notes
|
|
@@ -2239,9 +2274,9 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2239
2274
|
spec_lower = spec.lower()
|
|
2240
2275
|
|
|
2241
2276
|
# Check if this is an Ibis table
|
|
2242
|
-
native_tbl = tbl
|
|
2243
|
-
if
|
|
2244
|
-
native_tbl = tbl.to_native()
|
|
2277
|
+
native_tbl: Any = tbl
|
|
2278
|
+
if is_narwhals_dataframe(tbl) or is_narwhals_lazyframe(tbl):
|
|
2279
|
+
native_tbl = tbl.to_native()
|
|
2245
2280
|
|
|
2246
2281
|
is_ibis = hasattr(native_tbl, "execute")
|
|
2247
2282
|
|
|
@@ -2308,7 +2343,7 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2308
2343
|
weights = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]
|
|
2309
2344
|
|
|
2310
2345
|
# Get the column as an Ibis expression
|
|
2311
|
-
col_expr = native_tbl[column]
|
|
2346
|
+
col_expr = native_tbl[column] # type: ignore[index]
|
|
2312
2347
|
|
|
2313
2348
|
# Basic checks: length must be 17, no invalid characters (I, O, Q)
|
|
2314
2349
|
valid_length = col_expr.length() == 17
|
|
@@ -2335,11 +2370,11 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2335
2370
|
value = ibis.cases(*conditions, else_=0) # Default: invalid char = 0 (will fail validation)
|
|
2336
2371
|
|
|
2337
2372
|
# Multiply by weight and add to checksum
|
|
2338
|
-
checksum = checksum + (value * weights[pos])
|
|
2373
|
+
checksum = checksum + (value * weights[pos]) # type: ignore[operator]
|
|
2339
2374
|
|
|
2340
2375
|
# Check digit calculation: checksum % 11
|
|
2341
2376
|
# If result is 10, check digit should be 'X', otherwise it's the digit itself
|
|
2342
|
-
expected_check = checksum % 11
|
|
2377
|
+
expected_check = checksum % 11 # type: ignore[operator]
|
|
2343
2378
|
actual_check_char = col_expr.upper().substr(8, 1) # Position 9 (0-indexed 8)
|
|
2344
2379
|
|
|
2345
2380
|
# Validate check digit using ibis.cases()
|
|
@@ -2362,14 +2397,14 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
|
|
|
2362
2397
|
is_valid = is_valid.fill_null(False)
|
|
2363
2398
|
|
|
2364
2399
|
# Add validation column to table
|
|
2365
|
-
result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
|
|
2400
|
+
result_tbl = native_tbl.mutate(pb_is_good_=is_valid) # type: ignore[union-attr]
|
|
2366
2401
|
|
|
2367
2402
|
return result_tbl
|
|
2368
2403
|
|
|
2369
2404
|
|
|
2370
2405
|
def interrogate_credit_card_db(
|
|
2371
|
-
tbl:
|
|
2372
|
-
) ->
|
|
2406
|
+
tbl: IntoFrame, column: str, values: dict[str, str], na_pass: bool
|
|
2407
|
+
) -> Any:
|
|
2373
2408
|
"""
|
|
2374
2409
|
Database-native credit card validation using Luhn algorithm in SQL.
|
|
2375
2410
|
|
|
@@ -2391,7 +2426,7 @@ def interrogate_credit_card_db(
|
|
|
2391
2426
|
|
|
2392
2427
|
Returns
|
|
2393
2428
|
-------
|
|
2394
|
-
|
|
2429
|
+
Any
|
|
2395
2430
|
Result table with pb_is_good_ column indicating validation results.
|
|
2396
2431
|
|
|
2397
2432
|
Notes
|
|
@@ -2408,7 +2443,7 @@ def interrogate_credit_card_db(
|
|
|
2408
2443
|
# Check if this is an Ibis table
|
|
2409
2444
|
native_tbl = tbl
|
|
2410
2445
|
if hasattr(tbl, "to_native"):
|
|
2411
|
-
native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl
|
|
2446
|
+
native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl # type: ignore[operator]
|
|
2412
2447
|
|
|
2413
2448
|
is_ibis = hasattr(native_tbl, "execute")
|
|
2414
2449
|
|
|
@@ -2422,7 +2457,7 @@ def interrogate_credit_card_db(
|
|
|
2422
2457
|
raise ImportError("Ibis is required for database-native validation")
|
|
2423
2458
|
|
|
2424
2459
|
# Get the column as an Ibis expression
|
|
2425
|
-
col_expr = native_tbl[column]
|
|
2460
|
+
col_expr = native_tbl[column] # type: ignore[index]
|
|
2426
2461
|
|
|
2427
2462
|
# Step 1: Clean the input and remove spaces and hyphens
|
|
2428
2463
|
# First check format: only digits, spaces, and hyphens allowed
|
|
@@ -2475,7 +2510,7 @@ def interrogate_credit_card_db(
|
|
|
2475
2510
|
|
|
2476
2511
|
# Calculate contribution to checksum
|
|
2477
2512
|
# If should_double: double the digit, then if > 9 subtract 9
|
|
2478
|
-
doubled = digit_val * 2
|
|
2513
|
+
doubled = digit_val * 2 # type: ignore[operator]
|
|
2479
2514
|
adjusted = ibis.cases(
|
|
2480
2515
|
(should_double & (doubled > 9), doubled - 9),
|
|
2481
2516
|
(should_double, doubled),
|
|
@@ -2488,10 +2523,10 @@ def interrogate_credit_card_db(
|
|
|
2488
2523
|
else_=0,
|
|
2489
2524
|
)
|
|
2490
2525
|
|
|
2491
|
-
checksum = checksum + contribution
|
|
2526
|
+
checksum = checksum + contribution # type: ignore[operator]
|
|
2492
2527
|
|
|
2493
2528
|
# Step 4: Valid if checksum % 10 == 0
|
|
2494
|
-
luhn_valid = (checksum % 10) == 0
|
|
2529
|
+
luhn_valid = (checksum % 10) == 0 # type: ignore[operator]
|
|
2495
2530
|
|
|
2496
2531
|
# Combine all validation checks
|
|
2497
2532
|
is_valid = valid_chars & valid_length & luhn_valid
|
|
@@ -2505,30 +2540,32 @@ def interrogate_credit_card_db(
|
|
|
2505
2540
|
is_valid = is_valid.fill_null(False)
|
|
2506
2541
|
|
|
2507
2542
|
# Add validation column to table
|
|
2508
|
-
result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
|
|
2543
|
+
result_tbl = native_tbl.mutate(pb_is_good_=is_valid) # type: ignore[union-attr]
|
|
2509
2544
|
|
|
2510
2545
|
return result_tbl
|
|
2511
2546
|
|
|
2512
2547
|
|
|
2513
|
-
def interrogate_null(tbl:
|
|
2548
|
+
def interrogate_null(tbl: IntoFrame, column: str) -> Any:
    """
    Null interrogation.

    Parameters
    ----------
    tbl
        The table to interrogate; it is wrapped with `nw.from_native()`.
    column
        The name of the column whose values are tested for null.

    Returns
    -------
    Any
        The native table with an added `pb_is_good_` column that is true where the value in
        `column` is null.
    """
    frame = nw.from_native(tbl)
    # Narrow the type: only eager or lazy frames are expected here
    assert isinstance(frame, (nw.DataFrame, nw.LazyFrame))

    is_null_expr = nw.col(column).is_null()
    return frame.with_columns(pb_is_good_=is_null_expr).to_native()
|
|
2519
2555
|
|
|
2520
2556
|
|
|
2521
|
-
def interrogate_not_null(tbl:
|
|
2557
|
+
def interrogate_not_null(tbl: IntoFrame, column: str) -> Any:
    """
    Not null interrogation.

    Parameters
    ----------
    tbl
        The table to interrogate; it is wrapped with `nw.from_native()`.
    column
        The name of the column whose values are tested for being non-null.

    Returns
    -------
    Any
        The native table with an added `pb_is_good_` column that is true where the value in
        `column` is not null.
    """
    frame = nw.from_native(tbl)
    # Narrow the type: only eager or lazy frames are expected here
    assert isinstance(frame, (nw.DataFrame, nw.LazyFrame))

    not_null_expr = ~nw.col(column).is_null()
    return frame.with_columns(pb_is_good_=not_null_expr).to_native()
|
|
2527
2564
|
|
|
2528
2565
|
|
|
2529
2566
|
def interrogate_increasing(
|
|
2530
|
-
tbl:
|
|
2531
|
-
) ->
|
|
2567
|
+
tbl: IntoFrame, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool
|
|
2568
|
+
) -> Any:
|
|
2532
2569
|
"""
|
|
2533
2570
|
Increasing interrogation.
|
|
2534
2571
|
|
|
@@ -2549,10 +2586,11 @@ def interrogate_increasing(
|
|
|
2549
2586
|
|
|
2550
2587
|
Returns
|
|
2551
2588
|
-------
|
|
2552
|
-
|
|
2589
|
+
Any
|
|
2553
2590
|
The table with a `pb_is_good_` column indicating pass/fail for each row.
|
|
2554
2591
|
"""
|
|
2555
2592
|
nw_tbl = nw.from_native(tbl)
|
|
2593
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2556
2594
|
|
|
2557
2595
|
# Create a lagged difference column
|
|
2558
2596
|
result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
|
|
@@ -2585,8 +2623,8 @@ def interrogate_increasing(
|
|
|
2585
2623
|
|
|
2586
2624
|
|
|
2587
2625
|
def interrogate_decreasing(
|
|
2588
|
-
tbl:
|
|
2589
|
-
) ->
|
|
2626
|
+
tbl: IntoFrame, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool
|
|
2627
|
+
) -> Any:
|
|
2590
2628
|
"""
|
|
2591
2629
|
Decreasing interrogation.
|
|
2592
2630
|
|
|
@@ -2607,10 +2645,11 @@ def interrogate_decreasing(
|
|
|
2607
2645
|
|
|
2608
2646
|
Returns
|
|
2609
2647
|
-------
|
|
2610
|
-
|
|
2648
|
+
Any
|
|
2611
2649
|
The table with a `pb_is_good_` column indicating pass/fail for each row.
|
|
2612
2650
|
"""
|
|
2613
2651
|
nw_tbl = nw.from_native(tbl)
|
|
2652
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2614
2653
|
|
|
2615
2654
|
# Create a lagged difference column
|
|
2616
2655
|
result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
|
|
@@ -2643,8 +2682,8 @@ def interrogate_decreasing(
|
|
|
2643
2682
|
|
|
2644
2683
|
|
|
2645
2684
|
def _interrogate_comparison_base(
|
|
2646
|
-
tbl:
|
|
2647
|
-
) ->
|
|
2685
|
+
tbl: IntoFrame, column: str, compare: Any, na_pass: bool, operator: str
|
|
2686
|
+
) -> Any:
|
|
2648
2687
|
"""
|
|
2649
2688
|
Unified base function for comparison operations (gt, ge, lt, le, eq, ne).
|
|
2650
2689
|
|
|
@@ -2663,13 +2702,14 @@ def _interrogate_comparison_base(
|
|
|
2663
2702
|
|
|
2664
2703
|
Returns
|
|
2665
2704
|
-------
|
|
2666
|
-
|
|
2705
|
+
Any
|
|
2667
2706
|
The result table with `pb_is_good_` column indicating the passing test units.
|
|
2668
2707
|
"""
|
|
2669
2708
|
|
|
2670
2709
|
compare_expr = _get_compare_expr_nw(compare=compare)
|
|
2671
2710
|
|
|
2672
2711
|
nw_tbl = nw.from_native(tbl)
|
|
2712
|
+
assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
|
|
2673
2713
|
compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare_expr)
|
|
2674
2714
|
|
|
2675
2715
|
# Create the comparison expression based on the operator
|
|
@@ -2716,7 +2756,7 @@ def _interrogate_comparison_base(
|
|
|
2716
2756
|
return result_tbl.to_native()
|
|
2717
2757
|
|
|
2718
2758
|
|
|
2719
|
-
def interrogate_rows_distinct(data_tbl:
|
|
2759
|
+
def interrogate_rows_distinct(data_tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
2720
2760
|
"""
|
|
2721
2761
|
Check if rows in a DataFrame are distinct.
|
|
2722
2762
|
|
|
@@ -2733,10 +2773,11 @@ def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None
|
|
|
2733
2773
|
|
|
2734
2774
|
Returns
|
|
2735
2775
|
-------
|
|
2736
|
-
|
|
2776
|
+
Any
|
|
2737
2777
|
A DataFrame with a `pb_is_good_` column indicating which rows pass the test.
|
|
2738
2778
|
"""
|
|
2739
2779
|
tbl = nw.from_native(data_tbl)
|
|
2780
|
+
assert is_narwhals_dataframe(tbl) or is_narwhals_lazyframe(tbl)
|
|
2740
2781
|
|
|
2741
2782
|
# Get the column subset to use for the test
|
|
2742
2783
|
if columns_subset is None:
|
|
@@ -2744,18 +2785,23 @@ def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None
|
|
|
2744
2785
|
|
|
2745
2786
|
# Create a count of duplicates using group_by approach
|
|
2746
2787
|
# Group by the columns of interest and count occurrences
|
|
2747
|
-
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
|
|
2752
|
-
|
|
2753
|
-
|
|
2754
|
-
|
|
2755
|
-
|
|
2788
|
+
# Handle DataFrame and LazyFrame separately for proper type narrowing
|
|
2789
|
+
if is_narwhals_dataframe(tbl):
|
|
2790
|
+
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
2791
|
+
result = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
2792
|
+
result = result.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
2793
|
+
return result.to_native()
|
|
2794
|
+
elif is_narwhals_lazyframe(tbl):
|
|
2795
|
+
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
2796
|
+
result = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
2797
|
+
result = result.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
2798
|
+
return result.to_native()
|
|
2799
|
+
else:
|
|
2800
|
+
msg = f"Expected DataFrame or LazyFrame, got {type(tbl)}"
|
|
2801
|
+
raise TypeError(msg)
|
|
2756
2802
|
|
|
2757
2803
|
|
|
2758
|
-
def interrogate_rows_complete(tbl:
|
|
2804
|
+
def interrogate_rows_complete(tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
|
|
2759
2805
|
"""Rows complete interrogation."""
|
|
2760
2806
|
nw_tbl = nw.from_native(tbl)
|
|
2761
2807
|
|
|
@@ -2771,12 +2817,25 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
|
|
|
2771
2817
|
return result_tbl.to_native()
|
|
2772
2818
|
|
|
2773
2819
|
|
|
2774
|
-
def interrogate_prompt(
|
|
2820
|
+
def interrogate_prompt(
|
|
2821
|
+
tbl: IntoFrame, columns_subset: list[str] | None, ai_config: dict[str, Any]
|
|
2822
|
+
) -> Any:
|
|
2775
2823
|
"""AI-powered interrogation of rows."""
|
|
2776
2824
|
import logging
|
|
2777
2825
|
|
|
2778
2826
|
logger = logging.getLogger(__name__)
|
|
2779
2827
|
|
|
2828
|
+
# Convert to narwhals early for consistent row counting
|
|
2829
|
+
nw_tbl = nw.from_native(tbl)
|
|
2830
|
+
# Get row count - for LazyFrame we need to use select/collect
|
|
2831
|
+
if is_narwhals_lazyframe(nw_tbl):
|
|
2832
|
+
row_count = nw_tbl.select(nw.len()).collect().item()
|
|
2833
|
+
assert isinstance(row_count, int)
|
|
2834
|
+
total_rows = row_count
|
|
2835
|
+
else:
|
|
2836
|
+
assert is_narwhals_dataframe(nw_tbl)
|
|
2837
|
+
total_rows = len(nw_tbl)
|
|
2838
|
+
|
|
2780
2839
|
try:
|
|
2781
2840
|
# Import AI validation modules
|
|
2782
2841
|
from pointblank._utils_ai import (
|
|
@@ -2833,28 +2892,25 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2833
2892
|
)
|
|
2834
2893
|
|
|
2835
2894
|
# Parse and combine results with signature mapping optimization
|
|
2836
|
-
parser = _ValidationResponseParser(total_rows=
|
|
2895
|
+
parser = _ValidationResponseParser(total_rows=total_rows)
|
|
2837
2896
|
combined_results = parser.combine_batch_results(batch_results, signature_mapping)
|
|
2838
2897
|
|
|
2839
2898
|
# Debug: Log table info and combined results
|
|
2840
2899
|
logger.debug("🏁 Final result conversion:")
|
|
2841
|
-
logger.debug(f" - Table length: {
|
|
2900
|
+
logger.debug(f" - Table length: {total_rows}")
|
|
2842
2901
|
logger.debug(
|
|
2843
2902
|
f" - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
|
|
2844
2903
|
)
|
|
2845
2904
|
|
|
2846
|
-
# Convert results to narwhals format
|
|
2847
|
-
nw_tbl = nw.from_native(tbl)
|
|
2848
|
-
|
|
2849
2905
|
# Create a boolean column for validation results
|
|
2850
2906
|
validation_results = []
|
|
2851
|
-
for i in range(
|
|
2907
|
+
for i in range(total_rows):
|
|
2852
2908
|
# Default to False if row wasn't processed
|
|
2853
2909
|
result = combined_results.get(i, False)
|
|
2854
2910
|
validation_results.append(result)
|
|
2855
2911
|
|
|
2856
2912
|
# Debug: Log first few conversions
|
|
2857
|
-
if i < 5 or
|
|
2913
|
+
if i < 5 or total_rows - i <= 2:
|
|
2858
2914
|
logger.debug(f" Row {i}: {result} (from combined_results.get({i}, False))")
|
|
2859
2915
|
|
|
2860
2916
|
logger.debug(f" - Final validation_results length: {len(validation_results)}")
|
|
@@ -2893,10 +2949,9 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2893
2949
|
logger.error(f"Missing dependencies for AI validation: {e}")
|
|
2894
2950
|
logger.error("Install required packages: pip install openai anthropic aiohttp")
|
|
2895
2951
|
|
|
2896
|
-
# Return all False results as fallback
|
|
2897
|
-
nw_tbl = nw.from_native(tbl)
|
|
2952
|
+
# Return all False results as fallback (nw_tbl and total_rows defined at function start)
|
|
2898
2953
|
native_tbl = nw_tbl.to_native()
|
|
2899
|
-
validation_results = [False] *
|
|
2954
|
+
validation_results = [False] * total_rows
|
|
2900
2955
|
|
|
2901
2956
|
if hasattr(native_tbl, "with_columns"): # Polars
|
|
2902
2957
|
import polars as pl
|
|
@@ -2918,10 +2973,9 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2918
2973
|
except Exception as e:
|
|
2919
2974
|
logger.error(f"AI validation failed: {e}")
|
|
2920
2975
|
|
|
2921
|
-
# Return all False results as fallback
|
|
2922
|
-
nw_tbl = nw.from_native(tbl)
|
|
2976
|
+
# Return all False results as fallback (nw_tbl and total_rows defined at function start)
|
|
2923
2977
|
native_tbl = nw_tbl.to_native()
|
|
2924
|
-
validation_results = [False] *
|
|
2978
|
+
validation_results = [False] * total_rows
|
|
2925
2979
|
|
|
2926
2980
|
if hasattr(native_tbl, "with_columns"): # Polars
|
|
2927
2981
|
import polars as pl
|
|
@@ -2939,3 +2993,206 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
2939
2993
|
result_tbl["pb_is_good_"] = validation_results
|
|
2940
2994
|
|
|
2941
2995
|
return result_tbl
|
|
2996
|
+
|
|
2997
|
+
|
|
2998
|
+
def data_freshness(
    data_tbl: IntoFrame,
    column: str,
    max_age: Any,  # datetime.timedelta
    reference_time: Any | None,  # datetime.datetime | None
    timezone: str | None,
    allow_tz_mismatch: bool,
) -> dict:
    """
    Check if the most recent datetime value in a column is within the allowed max_age.

    Parameters
    ----------
    data_tbl
        The data table to check. LazyFrames are collected before the check.
    column
        The datetime column to check.
    max_age
        The maximum allowed age as a timedelta.
    reference_time
        The reference time to compare against (None = use current time).
    timezone
        The timezone to use for interpretation. Either an IANA name (e.g., "America/New_York")
        or a UTC offset such as "-7", "+5", "-07:00", or "+05:30". A value that cannot be
        resolved is treated like an unknown timezone name (UTC is used when a timezone has to be
        attached to the reference time; otherwise the comparison is done on naive values).
    allow_tz_mismatch
        Whether to suppress timezone mismatch warnings.

    Returns
    -------
    dict
        A dictionary containing:
        - 'passed': bool, whether the validation passed
        - 'max_datetime': the maximum datetime found in the column
        - 'reference_time': the reference time used
        - 'age': the calculated age (timedelta)
        - 'max_age': the maximum allowed age
        - 'tz_warning': always `None` in the dictionary returned by this function
        - 'column_empty': bool, `True` when the column maximum is `None`
        - 'tz_warning_key': only present when the data and the reference time differ in
          timezone awareness and `allow_tz_mismatch` is `False`; a translation lookup key
        - 'error': only present when an exception was caught; the exception message
    """
    import datetime
    import re

    nw_frame = nw.from_native(data_tbl)

    # Handle LazyFrames by collecting them first
    if is_narwhals_lazyframe(nw_frame):
        nw_frame = nw_frame.collect()

    assert is_narwhals_dataframe(nw_frame)

    result = {
        "passed": False,
        "max_datetime": None,
        "reference_time": None,
        "age": None,
        "max_age": max_age,
        "tz_warning": None,
        "column_empty": False,
    }

    # Get the maximum datetime value from the column
    try:
        max_datetime_raw = nw_frame.select(nw.col(column).max()).item()

        if max_datetime_raw is None:
            result["column_empty"] = True
            result["passed"] = False
            return result

        # Convert to Python datetime if needed
        if hasattr(max_datetime_raw, "to_pydatetime"):
            # Pandas Timestamp
            max_datetime = max_datetime_raw.to_pydatetime()
        elif hasattr(max_datetime_raw, "isoformat"):
            # Already a datetime-like object
            max_datetime = max_datetime_raw
        else:
            # Try to parse as string or handle other types
            max_datetime = datetime.datetime.fromisoformat(str(max_datetime_raw))

        result["max_datetime"] = max_datetime

    except Exception as e:
        # Best-effort: report the failure through the result rather than raising
        result["error"] = str(e)
        result["passed"] = False
        return result

    def _parse_tz(tz_str: str) -> datetime.tzinfo | None:
        """Resolve an IANA name or a UTC offset string; return None when it cannot be resolved."""
        tz_clean = tz_str.strip()

        # Offset formats: "-7", "+5", "-07:00", "+05:30", etc.
        match = re.match(r"^([+-]?)(\d{1,2})(?::(\d{2}))?$", tz_clean)
        if match:
            sign_str, hours_str, minutes_str = match.groups()
            total_minutes = int(hours_str) * 60 + (int(minutes_str) if minutes_str else 0)
            if sign_str == "-":
                total_minutes = -total_minutes
            try:
                return datetime.timezone(datetime.timedelta(minutes=total_minutes))
            except ValueError:
                # Offset outside the range accepted by `datetime.timezone` (e.g., "-99")
                return None

        # IANA timezone names; a missing key raises a KeyError subclass, a malformed key can
        # raise ValueError, and some keys can surface as OSError from the file lookup
        try:
            return ZoneInfo(tz_clean)
        except (KeyError, ValueError, OSError):
            return None

    # Resolve the user-supplied timezone once so every branch below interprets it identically
    user_tz = _parse_tz(timezone) if timezone else None

    # Handle timezone awareness/naivete
    max_dt_aware = _is_datetime_aware(max_datetime)

    # Determine the reference time; when none was supplied it depends on the data's awareness
    ref_time = reference_time
    if ref_time is None:
        if timezone:
            # A timezone was requested: use it, falling back to UTC when it cannot be resolved
            ref_time = datetime.datetime.now(
                user_tz if user_tz is not None else datetime.timezone.utc
            )
        elif max_dt_aware:
            # Default to UTC when data is aware but no timezone specified
            ref_time = datetime.datetime.now(datetime.timezone.utc)
        else:
            # No timezone specified and data is naive -> use naive local time
            ref_time = datetime.datetime.now()

    result["reference_time"] = ref_time
    ref_dt_aware = _is_datetime_aware(ref_time)

    # Track timezone warnings - use keys for translation lookup
    if max_dt_aware != ref_dt_aware and not allow_tz_mismatch:
        if max_dt_aware and not ref_dt_aware:
            result["tz_warning_key"] = "data_freshness_tz_warning_aware_naive"
        else:
            result["tz_warning_key"] = "data_freshness_tz_warning_naive_aware"

    # Make both comparable
    try:
        if max_dt_aware and not ref_dt_aware:
            # Add timezone to reference time (assume UTC when none is usable)
            ref_time = ref_time.replace(
                tzinfo=user_tz if user_tz is not None else datetime.timezone.utc
            )

        elif not max_dt_aware and ref_dt_aware:
            if user_tz is not None:
                # Localize the max_datetime to the requested timezone
                max_datetime = max_datetime.replace(tzinfo=user_tz)
            else:
                # Remove timezone from reference for comparison
                ref_time = ref_time.replace(tzinfo=None)

        # Calculate the age
        age = ref_time - max_datetime
        result["age"] = age
        result["reference_time"] = ref_time

        # Check if within max_age
        result["passed"] = age <= max_age

    except Exception as e:
        # Best-effort: report the failure through the result rather than raising
        result["error"] = str(e)
        result["passed"] = False

    return result
|
|
3190
|
+
|
|
3191
|
+
|
|
3192
|
+
def _is_datetime_aware(dt: Any) -> bool:
|
|
3193
|
+
"""Check if a datetime object is timezone-aware."""
|
|
3194
|
+
if dt is None:
|
|
3195
|
+
return False
|
|
3196
|
+
if hasattr(dt, "tzinfo"):
|
|
3197
|
+
return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
|
|
3198
|
+
return False
|