pointblank 0.17.0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,11 +3,16 @@ from __future__ import annotations
  import functools
  from collections.abc import Callable
  from dataclasses import dataclass
- from typing import Any
+ from typing import TYPE_CHECKING, Any
+ from zoneinfo import ZoneInfo

  import narwhals as nw
- from narwhals.dependencies import is_pandas_dataframe, is_polars_dataframe
- from narwhals.typing import FrameT
+ from narwhals.dependencies import (
+ is_narwhals_dataframe,
+ is_narwhals_lazyframe,
+ is_pandas_dataframe,
+ is_polars_dataframe,
+ )

  from pointblank._constants import IBIS_BACKENDS
  from pointblank._spec_utils import (
@@ -25,6 +30,9 @@ from pointblank._utils import (
  )
  from pointblank.column import Column

+ if TYPE_CHECKING:
+ from narwhals.typing import IntoFrame
+

  def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
  """
@@ -94,7 +102,9 @@ def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val:
  return compare_val


- def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: str = None) -> Any:
+ def _safe_is_nan_or_null_expr(
+ data_frame: Any, column_expr: Any, column_name: str | None = None
+ ) -> Any:
  """
  Create an expression that safely checks for both Null and NaN values.

@@ -425,7 +435,7 @@ class SpeciallyValidation:
  else:
  self.tbl_type = tbl_type

- def get_test_results(self) -> any | list[bool]:
+ def get_test_results(self) -> Any | list[bool]:
  """Evaluate the expression get either a list of booleans or a results table."""

  # Get the expression and inspect whether there is a `data` argument
@@ -519,7 +529,7 @@ class NumberOfTestUnits:
  Count the number of test units in a column.
  """

- df: FrameT
+ df: Any # Can be IntoFrame or Ibis table
  column: str

  def get_test_units(self, tbl_type: str) -> int:
@@ -536,15 +546,18 @@ class NumberOfTestUnits:
  )

  # Handle LazyFrames which don't have len()
- if hasattr(dfn, "collect"):
+ if is_narwhals_lazyframe(dfn):
  dfn = dfn.collect()

+ assert is_narwhals_dataframe(dfn)
  return len(dfn)

  if tbl_type in IBIS_BACKENDS:
  # Get the count of test units and convert to a native format
  # TODO: check whether pandas or polars is available
- return self.df.count().to_polars()
+ return self.df.count().to_polars() # type: ignore[union-attr]
+
+ raise ValueError(f"Unsupported table type: {tbl_type}")


  def _get_compare_expr_nw(compare: Any) -> Any:
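The hunk above moves get_test_units() off duck-typed hasattr checks and onto narwhals' own type guards, collecting LazyFrames before calling len() and raising for unsupported table types. A minimal standalone sketch of that row-counting pattern (the helper name and example data are illustrative, not part of pointblank):

    import narwhals as nw
    from narwhals.dependencies import is_narwhals_dataframe, is_narwhals_lazyframe

    def count_rows(native_tbl) -> int:
        dfn = nw.from_native(native_tbl)
        if is_narwhals_lazyframe(dfn):
            # LazyFrames have no len(); materialize first
            dfn = dfn.collect()
        assert is_narwhals_dataframe(dfn)
        return len(dfn)

    # Example (assumes Polars is installed):
    # import polars as pl
    # count_rows(pl.LazyFrame({"a": [1, 2, 3]}))  # -> 3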
@@ -555,28 +568,25 @@ def _get_compare_expr_nw(compare: Any) -> Any:
  return compare


- def _column_has_null_values(table: FrameT, column: str) -> bool:
+ def _column_has_null_values(table: nw.DataFrame[Any] | nw.LazyFrame[Any], column: str) -> bool:
  try:
- # Try the standard null_count() method
- null_count = (table.select(column).null_count())[column][0]
+ # Try the standard null_count() method (DataFrame)
+ null_count = (table.select(column).null_count())[column][0] # type: ignore[union-attr]
  except AttributeError:
  # For LazyFrames, collect first then get null count
  try:
- collected = table.select(column).collect()
+ collected = table.select(column).collect() # type: ignore[union-attr]
  null_count = (collected.null_count())[column][0]
  except Exception:
  # Fallback: check if any values are null
  try:
- result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
+ result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect() # type: ignore[union-attr]
  null_count = result["null_count"][0]
  except Exception:
  # Last resort: return False (assume no nulls)
  return False

- if null_count is None or null_count == 0:
- return False
-
- return True
+ return null_count is not None and null_count > 0


  def _check_nulls_across_columns_nw(table, columns_subset):
@@ -596,7 +606,7 @@ def _check_nulls_across_columns_nw(table, columns_subset):
  return result


- def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any:
+ def _modify_datetime_compare_val(tgt_column: Any, compare_val: Any) -> Any:
  tgt_col_dtype_str = str(tgt_column.dtype).lower()

  if compare_val is isinstance(compare_val, Column): # pragma: no cover
@@ -640,7 +650,7 @@ def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any:
  return compare_expr


- def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
+ def col_vals_expr(data_tbl: Any, expr: Any, tbl_type: str = "local") -> Any:
  """Check if values in a column evaluate to True for a given predicate expression."""
  if tbl_type == "local":
  # Check the type of expression provided
@@ -670,21 +680,19 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
  return data_tbl # pragma: no cover


- def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
+ def rows_complete(data_tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
  """
  Check if rows in a DataFrame are complete (no null values).

  This function replaces the RowsComplete dataclass for direct usage.
  """
- tbl = _convert_to_narwhals(df=data_tbl)
-
  return interrogate_rows_complete(
- tbl=tbl,
+ tbl=data_tbl,
  columns_subset=columns_subset,
  )


- def col_exists(data_tbl: FrameT, column: str) -> bool:
+ def col_exists(data_tbl: IntoFrame, column: str) -> bool:
  """
  Check if a column exists in a DataFrame.

@@ -705,8 +713,8 @@ def col_exists(data_tbl: FrameT, column: str) -> bool:


  def col_schema_match(
- data_tbl: FrameT,
- schema,
+ data_tbl: IntoFrame,
+ schema: Any,
  complete: bool,
  in_order: bool,
  case_sensitive_colnames: bool,
@@ -730,7 +738,9 @@ def col_schema_match(
  )


- def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> bool:
+ def row_count_match(
+ data_tbl: IntoFrame, count: Any, inverse: bool, abs_tol_bounds: AbsoluteBounds
+ ) -> bool:
  """
  Check if DataFrame row count matches expected count.
  """
@@ -748,28 +758,33 @@ def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> b


  def col_pct_null(
- data_tbl: FrameT, column: str, p: float, bound_finder: Callable[[int], AbsoluteBounds]
+ data_tbl: IntoFrame, column: str, p: float, bound_finder: Callable[[int], AbsoluteBounds]
  ) -> bool:
  """Check if the percentage of null vales are within p given the absolute bounds."""
- # Convert to narwhals for consistent API across backends
- nw_tbl = nw.from_native(data_tbl)
-
+ nw_frame = nw.from_native(data_tbl)
  # Handle LazyFrames by collecting them first
- if hasattr(nw_tbl, "collect"):
- nw_tbl = nw_tbl.collect()
+ if is_narwhals_lazyframe(nw_frame):
+ nw_frame = nw_frame.collect()
+
+ assert is_narwhals_dataframe(nw_frame)
+
+ # We cast as int because it could come back as an arbitary type. For example if the backend
+ # is numpy-like, we might get a scalar from `item()`. `int()` expects a certain signature though
+ # and `object` does not satisfy so we have to go with the type ignore.
+ total_rows: object = nw_frame.select(nw.len()).item()
+ total_rows: int = int(total_rows) # type: ignore

- # Get total rows using narwhals
- total_rows: int = nw_tbl.select(nw.len()).item()
  abs_target: float = round(total_rows * p)
  lower_bound, upper_bound = bound_finder(abs_target)

- # Count null values
- n_null: int = nw_tbl.select(nw.col(column).is_null().sum()).item()
+ # Count null values (see above comment on typing shenanigans)
+ n_null: object = nw_frame.select(nw.col(column).is_null().sum()).item()
+ n_null: int = int(n_null) # type: ignore

  return n_null >= (abs_target - lower_bound) and n_null <= (abs_target + upper_bound)


- def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
+ def col_count_match(data_tbl: IntoFrame, count: Any, inverse: bool) -> bool:
  """
  Check if DataFrame column count matches expected count.
  """
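The col_pct_null() change above materializes LazyFrames and coerces the item() scalars to int before applying the tolerance test. The test itself is plain arithmetic; a small sketch, with the (lower, upper) pair standing in for pointblank's AbsoluteBounds (an assumption made for illustration):

    def pct_null_within_bounds(total_rows: int, n_null: int, p: float,
                               lower: int, upper: int) -> bool:
        abs_target = round(total_rows * p)  # expected number of null values
        return (abs_target - lower) <= n_null <= (abs_target + upper)

    # 1,000 rows, p = 0.10, tolerance of +/- 5 around the target of 100 nulls:
    # pct_null_within_bounds(1000, 97, 0.10, lower=5, upper=5)   # True
    # pct_null_within_bounds(1000, 110, 0.10, lower=5, upper=5)  # False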
@@ -781,7 +796,7 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
  return get_column_count(data=data_tbl) != count


- def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[FrameT, FrameT]:
+ def _coerce_to_common_backend(data_tbl: Any, tbl_compare: Any) -> tuple[Any, Any]:
  """
  Coerce two tables to the same backend if they differ.

@@ -798,7 +813,7 @@ def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[Fr

  Returns
  -------
- tuple[FrameT, FrameT]
+ tuple[Any, Any]
  Both tables, with tbl_compare potentially converted to data_tbl's backend.
  """
  # Get backend types for both tables
@@ -884,7 +899,7 @@ def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[Fr
  return data_tbl, tbl_compare


- def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
+ def tbl_match(data_tbl: IntoFrame, tbl_compare: IntoFrame) -> bool:
  """
  Check if two tables match exactly in schema, row count, and data.

@@ -998,33 +1013,37 @@ def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:

  # Convert to native format for comparison
  # We need to collect if lazy frames
- if hasattr(col_data_1, "collect"):
+ if is_narwhals_lazyframe(col_data_1):
  col_data_1 = col_data_1.collect()

- if hasattr(col_data_2, "collect"):
+ if is_narwhals_lazyframe(col_data_2):
  col_data_2 = col_data_2.collect()

  # Convert to native and then to lists for comparison
- col_1_native = col_data_1.to_native()
- col_2_native = col_data_2.to_native()
+ # Native frames could be Polars, Pandas, or Ibis - use Any for dynamic access
+ col_1_native: Any = col_data_1.to_native()
+ col_2_native: Any = col_data_2.to_native()

  # Extract values as lists for comparison
- if hasattr(col_1_native, "to_list"): # Polars Series
- values_1 = col_1_native[col_name].to_list()
- values_2 = col_2_native[col_name].to_list()
+ # Note: We use hasattr for runtime detection but maintain Any typing
+ values_1: list[Any]
+ values_2: list[Any]
+ if hasattr(col_1_native, "to_list"): # Polars DataFrame
+ values_1 = col_1_native[col_name].to_list() # type: ignore[index]
+ values_2 = col_2_native[col_name].to_list() # type: ignore[index]

- elif hasattr(col_1_native, "tolist"): # Pandas Series/DataFrame
- values_1 = col_1_native[col_name].tolist()
- values_2 = col_2_native[col_name].tolist()
+ elif hasattr(col_1_native, "tolist"): # Pandas DataFrame
+ values_1 = col_1_native[col_name].tolist() # type: ignore[index]
+ values_2 = col_2_native[col_name].tolist() # type: ignore[index]

  elif hasattr(col_1_native, "collect"): # Ibis
- values_1 = col_1_native[col_name].to_pandas().tolist()
- values_2 = col_2_native[col_name].to_pandas().tolist()
+ values_1 = col_1_native[col_name].to_pandas().tolist() # type: ignore[index]
+ values_2 = col_2_native[col_name].to_pandas().tolist() # type: ignore[index]

  else:
  # Fallback: try direct comparison
- values_1 = list(col_1_native[col_name])
- values_2 = list(col_2_native[col_name])
+ values_1 = list(col_1_native[col_name]) # type: ignore[index]
+ values_2 = list(col_2_native[col_name]) # type: ignore[index]

  # Compare the two lists element by element, handling NaN/None
  if len(values_1) != len(values_2):
@@ -1086,7 +1105,9 @@ def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
  return True


- def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_type: str = "local"):
+ def conjointly_validation(
+ data_tbl: IntoFrame, expressions: Any, threshold: int, tbl_type: str = "local"
+ ) -> Any:
  """
  Perform conjoint validation using multiple expressions.
  """
@@ -1101,30 +1122,32 @@ def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_typ
  return conjointly_instance.get_test_results()


- def interrogate_gt(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
+ # TODO: we can certainly simplify this
+ def interrogate_gt(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
  """Greater than interrogation."""
  return _interrogate_comparison_base(tbl, column, compare, na_pass, "gt")


- def interrogate_lt(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
+ def interrogate_lt(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
  """Less than interrogation."""
  return _interrogate_comparison_base(tbl, column, compare, na_pass, "lt")


- def interrogate_ge(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
+ def interrogate_ge(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
  """Greater than or equal interrogation."""
  return _interrogate_comparison_base(tbl, column, compare, na_pass, "ge")


- def interrogate_le(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
+ def interrogate_le(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
  """Less than or equal interrogation."""
  return _interrogate_comparison_base(tbl, column, compare, na_pass, "le")


- def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
+ def interrogate_eq(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
  """Equal interrogation."""

  nw_tbl = nw.from_native(tbl)
+ assert is_narwhals_dataframe(nw_tbl) or is_narwhals_lazyframe(nw_tbl)

  if isinstance(compare, Column):
  compare_expr = _get_compare_expr_nw(compare=compare)
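The interrogate_gt/lt/ge/le helpers above all delegate to _interrogate_comparison_base(), which builds a pb_is_good_ column from the comparison result plus the na_pass rule (a null passes only when na_pass is True). A self-contained sketch of that general pattern with narwhals; the column name, threshold, and helper name are made up, and this is not pointblank's exact expression:

    import narwhals as nw

    def flag_greater_than(native_tbl, column: str, compare, na_pass: bool):
        nw_tbl = nw.from_native(native_tbl)
        result_tbl = nw_tbl.with_columns(
            pb_is_good_=(
                (nw.col(column) > compare).fill_null(False)
                | (nw.col(column).is_null() & na_pass)
            )
        )
        return result_tbl.to_native()

    # Example (assumes Polars):
    # import polars as pl
    # flag_greater_than(pl.DataFrame({"x": [1, 5, None]}), "x", 2, na_pass=True)
    # pb_is_good_ -> [False, True, True]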
@@ -1170,10 +1193,10 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
  )
  result_tbl = result_tbl.rename({"pb_is_good_4_tmp": "pb_is_good_4"})
  elif "cannot compare" in str(e).lower():
- # Handle genuine type incompatibility
+ # Handle genuine type incompatibility - native_df type varies by backend
  native_df = result_tbl.to_native()
- col_dtype = str(native_df[column].dtype)
- compare_dtype = str(native_df[compare.name].dtype)
+ col_dtype = str(native_df[column].dtype) # type: ignore[index]
+ compare_dtype = str(native_df[compare.name].dtype) # type: ignore[index]

  raise TypeError(
  f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
@@ -1208,21 +1231,19 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
  or "conversion" in error_msg
  and "failed" in error_msg
  ):
- # Get column types for a descriptive error message
+ # Get column types for a descriptive error message - native type varies by backend
+ col_dtype = "unknown"
+ compare_dtype = "unknown"
  try:
  native_df = result_tbl.to_native()
  if hasattr(native_df, "dtypes"):
- col_dtype = str(native_df.dtypes.get(column, "unknown"))
- compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
+ col_dtype = str(native_df.dtypes.get(column, "unknown")) # type: ignore[union-attr]
+ compare_dtype = str(native_df.dtypes.get(compare.name, "unknown")) # type: ignore[union-attr]
  elif hasattr(native_df, "schema"):
- col_dtype = str(native_df.schema.get(column, "unknown"))
- compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
- else:
- col_dtype = "unknown"
- compare_dtype = "unknown"
+ col_dtype = str(native_df.schema.get(column, "unknown")) # type: ignore[union-attr]
+ compare_dtype = str(native_df.schema.get(compare.name, "unknown")) # type: ignore[union-attr]
  except Exception:
- col_dtype = "unknown"
- compare_dtype = "unknown"
+ pass

  raise TypeError(
  f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
@@ -1271,17 +1292,16 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
  or "conversion" in error_msg
  and "failed" in error_msg
  ):
- # Get column type for a descriptive error message
+ # Get column type for a descriptive error message - native type varies by backend
+ col_dtype = "unknown"
  try:
  native_df = result_tbl.to_native()
  if hasattr(native_df, "dtypes"):
- col_dtype = str(native_df.dtypes.get(column, "unknown"))
+ col_dtype = str(native_df.dtypes.get(column, "unknown")) # type: ignore[union-attr]
  elif hasattr(native_df, "schema"):
- col_dtype = str(native_df.schema.get(column, "unknown"))
- else:
- col_dtype = "unknown"
+ col_dtype = str(native_df.schema.get(column, "unknown")) # type: ignore[union-attr]
  except Exception:
- col_dtype = "unknown"
+ pass

  compare_type = type(compare).__name__
  compare_value = str(compare)
@@ -1311,10 +1331,11 @@ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra
  return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()


- def interrogate_ne(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
+ def interrogate_ne(tbl: IntoFrame, column: str, compare: Any, na_pass: bool) -> Any:
  """Not equal interrogation."""

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))

  # Determine if the reference and comparison columns have any null values
  ref_col_has_null_vals = _column_has_null_values(table=nw_tbl, column=column)
@@ -1867,14 +1888,15 @@ def interrogate_ne(tbl: FrameT, column: str, compare: any, na_pass: bool) -> Fra


  def interrogate_between(
- tbl: FrameT, column: str, low: any, high: any, inclusive: tuple, na_pass: bool
- ) -> FrameT:
+ tbl: IntoFrame, column: str, low: Any, high: Any, inclusive: tuple[bool, bool], na_pass: bool
+ ) -> Any:
  """Between interrogation."""

  low_val = _get_compare_expr_nw(compare=low)
  high_val = _get_compare_expr_nw(compare=high)

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
  low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
  high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)

@@ -1936,14 +1958,15 @@ def interrogate_between(


  def interrogate_outside(
- tbl: FrameT, column: str, low: any, high: any, inclusive: tuple, na_pass: bool
- ) -> FrameT:
+ tbl: IntoFrame, column: str, low: Any, high: Any, inclusive: tuple[bool, bool], na_pass: bool
+ ) -> Any:
  """Outside range interrogation."""

  low_val = _get_compare_expr_nw(compare=low)
  high_val = _get_compare_expr_nw(compare=high)

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
  low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
  high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)

@@ -2002,10 +2025,11 @@ def interrogate_outside(
  return result_tbl.to_native()


- def interrogate_isin(tbl: FrameT, column: str, set_values: any) -> FrameT:
+ def interrogate_isin(tbl: IntoFrame, column: str, set_values: Any) -> Any:
  """In set interrogation."""

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))

  can_be_null: bool = None in set_values
  base_expr: nw.Expr = nw.col(column).is_in(set_values)
@@ -2016,17 +2040,20 @@ def interrogate_isin(tbl: FrameT, column: str, set_values: any) -> FrameT:
  return result_tbl.to_native()


- def interrogate_notin(tbl: FrameT, column: str, set_values: any) -> FrameT:
+ def interrogate_notin(tbl: IntoFrame, column: str, set_values: Any) -> Any:
  """Not in set interrogation."""

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
  result_tbl = nw_tbl.with_columns(
  pb_is_good_=nw.col(column).is_in(set_values),
  ).with_columns(pb_is_good_=~nw.col("pb_is_good_"))
  return result_tbl.to_native()


- def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: bool) -> FrameT:
+ def interrogate_regex(
+ tbl: IntoFrame, column: str, values: dict[str, Any] | str, na_pass: bool
+ ) -> Any:
  """Regex interrogation."""

  # Handle both old and new formats for backward compatibility
@@ -2038,6 +2065,7 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
  inverse = values["inverse"]

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
  result_tbl = nw_tbl.with_columns(
  pb_is_good_1=nw.col(column).is_null() & na_pass,
  pb_is_good_2=nw.col(column).str.contains(pattern, literal=False).fill_null(False),
@@ -2057,7 +2085,9 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
  return result_tbl.to_native()


- def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: bool) -> FrameT:
+ def interrogate_within_spec(
+ tbl: IntoFrame, column: str, values: dict[str, Any], na_pass: bool
+ ) -> Any:
  """Within specification interrogation."""
  from pointblank._spec_utils import (
  regex_email,
@@ -2082,6 +2112,7 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo

  # Convert to Narwhals for cross-backend compatibility
  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))

  # Regex-based specifications can use Narwhals directly (no materialization needed)
  regex_specs = {
@@ -2135,18 +2166,18 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo

  # For non-Ibis tables or other specs, materialize data and use Python validation
  # Get the column data as a list
- col_data = nw_tbl.select(column).to_native()
+ col_data: Any = nw_tbl.select(column).to_native()

- # Convert to list based on backend
+ # Convert to list based on backend - type varies so use duck typing
  if hasattr(col_data, "to_list"): # Polars
- col_list = col_data[column].to_list()
+ col_list = col_data[column].to_list() # type: ignore[index]
  elif hasattr(col_data, "tolist"): # Pandas
- col_list = col_data[column].tolist()
+ col_list = col_data[column].tolist() # type: ignore[index]
  else: # For Ibis tables, we need to execute the query first
  try:
  # Try to execute if it's an Ibis table
  if hasattr(col_data, "execute"):
- col_data_exec = col_data.execute()
+ col_data_exec = col_data.execute() # type: ignore[operator]
  if hasattr(col_data_exec, "to_list"): # Polars result
  col_list = col_data_exec[column].to_list()
  elif hasattr(col_data_exec, "tolist"): # Pandas result
@@ -2159,6 +2190,8 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
  # Fallback to direct list conversion
  col_list = list(col_data[column])

+ assert isinstance(col_list, list)
+
  # Validate based on spec type (checksum-based validations)
  if spec_lower in ("isbn", "isbn-10", "isbn-13"):
  is_valid_list = check_isbn(col_list)
@@ -2205,7 +2238,9 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
  return result_tbl.to_native()


- def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass: bool) -> FrameT:
+ def interrogate_within_spec_db(
+ tbl: IntoFrame, column: str, values: dict[str, Any], na_pass: bool
+ ) -> Any:
  """
  Database-native specification validation (proof of concept).

@@ -2226,7 +2261,7 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:

  Returns
  -------
- FrameT
+ Any
  Result table with pb_is_good_ column indicating validation results.

  Notes
@@ -2239,9 +2274,9 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
  spec_lower = spec.lower()

  # Check if this is an Ibis table
- native_tbl = tbl
- if hasattr(tbl, "to_native"):
- native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl
+ native_tbl: Any = tbl
+ if is_narwhals_dataframe(tbl) or is_narwhals_lazyframe(tbl):
+ native_tbl = tbl.to_native()

  is_ibis = hasattr(native_tbl, "execute")

@@ -2308,7 +2343,7 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
  weights = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

  # Get the column as an Ibis expression
- col_expr = native_tbl[column]
+ col_expr = native_tbl[column] # type: ignore[index]

  # Basic checks: length must be 17, no invalid characters (I, O, Q)
  valid_length = col_expr.length() == 17
@@ -2335,11 +2370,11 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
  value = ibis.cases(*conditions, else_=0) # Default: invalid char = 0 (will fail validation)

  # Multiply by weight and add to checksum
- checksum = checksum + (value * weights[pos])
+ checksum = checksum + (value * weights[pos]) # type: ignore[operator]

  # Check digit calculation: checksum % 11
  # If result is 10, check digit should be 'X', otherwise it's the digit itself
- expected_check = checksum % 11
+ expected_check = checksum % 11 # type: ignore[operator]
  actual_check_char = col_expr.upper().substr(8, 1) # Position 9 (0-indexed 8)

  # Validate check digit using ibis.cases()
@@ -2362,14 +2397,14 @@ def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass:
  is_valid = is_valid.fill_null(False)

  # Add validation column to table
- result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
+ result_tbl = native_tbl.mutate(pb_is_good_=is_valid) # type: ignore[union-attr]

  return result_tbl


  def interrogate_credit_card_db(
- tbl: FrameT, column: str, values: dict[str, str], na_pass: bool
- ) -> FrameT:
+ tbl: IntoFrame, column: str, values: dict[str, str], na_pass: bool
+ ) -> Any:
  """
  Database-native credit card validation using Luhn algorithm in SQL.

@@ -2391,7 +2426,7 @@ def interrogate_credit_card_db(

  Returns
  -------
- FrameT
+ Any
  Result table with pb_is_good_ column indicating validation results.

  Notes
@@ -2408,7 +2443,7 @@ def interrogate_credit_card_db(
  # Check if this is an Ibis table
  native_tbl = tbl
  if hasattr(tbl, "to_native"):
- native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl
+ native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl # type: ignore[operator]

  is_ibis = hasattr(native_tbl, "execute")

@@ -2422,7 +2457,7 @@ def interrogate_credit_card_db(
  raise ImportError("Ibis is required for database-native validation")

  # Get the column as an Ibis expression
- col_expr = native_tbl[column]
+ col_expr = native_tbl[column] # type: ignore[index]

  # Step 1: Clean the input and remove spaces and hyphens
  # First check format: only digits, spaces, and hyphens allowed
@@ -2475,7 +2510,7 @@ def interrogate_credit_card_db(

  # Calculate contribution to checksum
  # If should_double: double the digit, then if > 9 subtract 9
- doubled = digit_val * 2
+ doubled = digit_val * 2 # type: ignore[operator]
  adjusted = ibis.cases(
  (should_double & (doubled > 9), doubled - 9),
  (should_double, doubled),
@@ -2488,10 +2523,10 @@ def interrogate_credit_card_db(
  else_=0,
  )

- checksum = checksum + contribution
+ checksum = checksum + contribution # type: ignore[operator]

  # Step 4: Valid if checksum % 10 == 0
- luhn_valid = (checksum % 10) == 0
+ luhn_valid = (checksum % 10) == 0 # type: ignore[operator]

  # Combine all validation checks
  is_valid = valid_chars & valid_length & luhn_valid
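The Ibis expression assembled above is the standard Luhn checksum: strip separators, double every second digit from the right, subtract 9 when a doubled value exceeds 9, and require the total to be divisible by 10. The same check in plain Python, for reference only (the card number in the comment is a commonly published test value):

    def luhn_is_valid(number: str) -> bool:
        digits = [int(ch) for ch in number if ch.isdigit()]
        if not digits:
            return False
        checksum = 0
        for i, digit in enumerate(reversed(digits)):
            if i % 2 == 1:  # every second digit from the right
                digit *= 2
                if digit > 9:
                    digit -= 9
            checksum += digit
        return checksum % 10 == 0

    # luhn_is_valid("4539 1488 0343 6467")  # True  (published test number)
    # luhn_is_valid("4539 1488 0343 6468")  # False (last digit changed)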
@@ -2505,30 +2540,32 @@ def interrogate_credit_card_db(
  is_valid = is_valid.fill_null(False)

  # Add validation column to table
- result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
+ result_tbl = native_tbl.mutate(pb_is_good_=is_valid) # type: ignore[union-attr]

  return result_tbl


- def interrogate_null(tbl: FrameT, column: str) -> FrameT:
+ def interrogate_null(tbl: IntoFrame, column: str) -> Any:
  """Null interrogation."""

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
  result_tbl = nw_tbl.with_columns(pb_is_good_=nw.col(column).is_null())
  return result_tbl.to_native()


- def interrogate_not_null(tbl: FrameT, column: str) -> FrameT:
+ def interrogate_not_null(tbl: IntoFrame, column: str) -> Any:
  """Not null interrogation."""

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
  result_tbl = nw_tbl.with_columns(pb_is_good_=~nw.col(column).is_null())
  return result_tbl.to_native()


  def interrogate_increasing(
- tbl: FrameT, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool
- ) -> FrameT:
+ tbl: IntoFrame, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool
+ ) -> Any:
  """
  Increasing interrogation.

@@ -2549,10 +2586,11 @@ def interrogate_increasing(

  Returns
  -------
- FrameT
+ Any
  The table with a `pb_is_good_` column indicating pass/fail for each row.
  """
  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))

  # Create a lagged difference column
  result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
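interrogate_increasing() and interrogate_decreasing() work from a lagged difference: each value minus its predecessor, computed with shift(1). A hedged sketch of that idea with narwhals; how pointblank treats the first row, na_pass, and the decreasing_tol tolerance is not shown in this hunk, so the fill_null(True) choice below is purely an assumption of the sketch:

    import narwhals as nw

    def flag_increasing(native_tbl, column: str, allow_stationary: bool = False):
        nw_tbl = nw.from_native(native_tbl)
        diff = nw.col(column) - nw.col(column).shift(1)
        passing = (diff > 0) | ((diff == 0) & allow_stationary)
        # The first row has no predecessor (null difference); pass it here.
        return nw_tbl.with_columns(pb_is_good_=passing.fill_null(True)).to_native()

    # Example (assumes Polars):
    # import polars as pl
    # flag_increasing(pl.DataFrame({"t": [1, 2, 2, 3]}), "t", allow_stationary=True)
    # pb_is_good_ -> [True, True, True, True]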
@@ -2585,8 +2623,8 @@ def interrogate_increasing(


  def interrogate_decreasing(
- tbl: FrameT, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool
- ) -> FrameT:
+ tbl: IntoFrame, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool
+ ) -> Any:
  """
  Decreasing interrogation.

@@ -2607,10 +2645,11 @@ def interrogate_decreasing(

  Returns
  -------
- FrameT
+ Any
  The table with a `pb_is_good_` column indicating pass/fail for each row.
  """
  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))

  # Create a lagged difference column
  result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
@@ -2643,8 +2682,8 @@ def interrogate_decreasing(


  def _interrogate_comparison_base(
- tbl: FrameT, column: str, compare: any, na_pass: bool, operator: str
- ) -> FrameT:
+ tbl: IntoFrame, column: str, compare: Any, na_pass: bool, operator: str
+ ) -> Any:
  """
  Unified base function for comparison operations (gt, ge, lt, le, eq, ne).

@@ -2663,13 +2702,14 @@ def _interrogate_comparison_base(

  Returns
  -------
- FrameT
+ Any
  The result table with `pb_is_good_` column indicating the passing test units.
  """

  compare_expr = _get_compare_expr_nw(compare=compare)

  nw_tbl = nw.from_native(tbl)
+ assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame))
  compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare_expr)

  # Create the comparison expression based on the operator
@@ -2716,7 +2756,7 @@ def _interrogate_comparison_base(
  return result_tbl.to_native()


- def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None) -> FrameT:
+ def interrogate_rows_distinct(data_tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
  """
  Check if rows in a DataFrame are distinct.

@@ -2733,10 +2773,11 @@ def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None

  Returns
  -------
- FrameT
+ Any
  A DataFrame with a `pb_is_good_` column indicating which rows pass the test.
  """
  tbl = nw.from_native(data_tbl)
+ assert is_narwhals_dataframe(tbl) or is_narwhals_lazyframe(tbl)

  # Get the column subset to use for the test
  if columns_subset is None:
@@ -2744,18 +2785,23 @@ def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None

  # Create a count of duplicates using group_by approach
  # Group by the columns of interest and count occurrences
- count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
-
- # Join back to original table to get count for each row
- tbl = tbl.join(count_tbl, on=columns_subset, how="left")
-
- # Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
- tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
-
- return tbl.to_native()
+ # Handle DataFrame and LazyFrame separately for proper type narrowing
+ if is_narwhals_dataframe(tbl):
+ count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
+ result = tbl.join(count_tbl, on=columns_subset, how="left")
+ result = result.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
+ return result.to_native()
+ elif is_narwhals_lazyframe(tbl):
+ count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
+ result = tbl.join(count_tbl, on=columns_subset, how="left")
+ result = result.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
+ return result.to_native()
+ else:
+ msg = f"Expected DataFrame or LazyFrame, got {type(tbl)}"
+ raise TypeError(msg)


- def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) -> FrameT:
+ def interrogate_rows_complete(tbl: IntoFrame, columns_subset: list[str] | None) -> Any:
  """Rows complete interrogation."""
  nw_tbl = nw.from_native(tbl)

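The rewritten interrogate_rows_distinct() keeps the same strategy on both branches: count occurrences per group, join the counts back onto the table, and pass only rows whose count is 1. A compact sketch of that approach (the helper name and example data are made up):

    import narwhals as nw

    def flag_distinct_rows(native_tbl, columns_subset: list[str]):
        tbl = nw.from_native(native_tbl)
        counts = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
        joined = tbl.join(counts, on=columns_subset, how="left")
        out = joined.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
        return out.to_native()

    # Example (assumes Polars):
    # import polars as pl
    # flag_distinct_rows(pl.DataFrame({"a": [1, 1, 2]}), ["a"])
    # the two a == 1 rows get pb_is_good_ == False, the a == 2 row gets True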
@@ -2771,12 +2817,25 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
  return result_tbl.to_native()


- def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config: dict) -> FrameT:
+ def interrogate_prompt(
+ tbl: IntoFrame, columns_subset: list[str] | None, ai_config: dict[str, Any]
+ ) -> Any:
  """AI-powered interrogation of rows."""
  import logging

  logger = logging.getLogger(__name__)

+ # Convert to narwhals early for consistent row counting
+ nw_tbl = nw.from_native(tbl)
+ # Get row count - for LazyFrame we need to use select/collect
+ if is_narwhals_lazyframe(nw_tbl):
+ row_count = nw_tbl.select(nw.len()).collect().item()
+ assert isinstance(row_count, int)
+ total_rows = row_count
+ else:
+ assert is_narwhals_dataframe(nw_tbl)
+ total_rows = len(nw_tbl)
+
  try:
  # Import AI validation modules
  from pointblank._utils_ai import (
@@ -2833,28 +2892,25 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
  )

  # Parse and combine results with signature mapping optimization
- parser = _ValidationResponseParser(total_rows=len(tbl))
+ parser = _ValidationResponseParser(total_rows=total_rows)
  combined_results = parser.combine_batch_results(batch_results, signature_mapping)

  # Debug: Log table info and combined results
  logger.debug("🏁 Final result conversion:")
- logger.debug(f" - Table length: {len(tbl)}")
+ logger.debug(f" - Table length: {total_rows}")
  logger.debug(
  f" - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
  )

- # Convert results to narwhals format
- nw_tbl = nw.from_native(tbl)
-
  # Create a boolean column for validation results
  validation_results = []
- for i in range(len(tbl)):
+ for i in range(total_rows):
  # Default to False if row wasn't processed
  result = combined_results.get(i, False)
  validation_results.append(result)

  # Debug: Log first few conversions
- if i < 5 or len(tbl) - i <= 2:
+ if i < 5 or total_rows - i <= 2:
  logger.debug(f" Row {i}: {result} (from combined_results.get({i}, False))")

  logger.debug(f" - Final validation_results length: {len(validation_results)}")
@@ -2893,10 +2949,9 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
  logger.error(f"Missing dependencies for AI validation: {e}")
  logger.error("Install required packages: pip install openai anthropic aiohttp")

- # Return all False results as fallback
- nw_tbl = nw.from_native(tbl)
+ # Return all False results as fallback (nw_tbl and total_rows defined at function start)
  native_tbl = nw_tbl.to_native()
- validation_results = [False] * len(tbl)
+ validation_results = [False] * total_rows

  if hasattr(native_tbl, "with_columns"): # Polars
  import polars as pl
@@ -2918,10 +2973,9 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
  except Exception as e:
  logger.error(f"AI validation failed: {e}")

- # Return all False results as fallback
- nw_tbl = nw.from_native(tbl)
+ # Return all False results as fallback (nw_tbl and total_rows defined at function start)
  native_tbl = nw_tbl.to_native()
- validation_results = [False] * len(tbl)
+ validation_results = [False] * total_rows

  if hasattr(native_tbl, "with_columns"): # Polars
  import polars as pl
@@ -2939,3 +2993,206 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
  result_tbl["pb_is_good_"] = validation_results

  return result_tbl
+
+
+ def data_freshness(
+ data_tbl: IntoFrame,
+ column: str,
+ max_age: Any, # datetime.timedelta
+ reference_time: Any | None, # datetime.datetime | None
+ timezone: str | None,
+ allow_tz_mismatch: bool,
+ ) -> dict:
+ """
+ Check if the most recent datetime value in a column is within the allowed max_age.
+
+ Parameters
+ ----------
+ data_tbl
+ The data table to check.
+ column
+ The datetime column to check.
+ max_age
+ The maximum allowed age as a timedelta.
+ reference_time
+ The reference time to compare against (None = use current time).
+ timezone
+ The timezone to use for interpretation.
+ allow_tz_mismatch
+ Whether to suppress timezone mismatch warnings.
+
+ Returns
+ -------
+ dict
+ A dictionary containing:
+ - 'passed': bool, whether the validation passed
+ - 'max_datetime': the maximum datetime found in the column
+ - 'reference_time': the reference time used
+ - 'age': the calculated age (timedelta)
+ - 'max_age': the maximum allowed age
+ - 'tz_warning': any timezone warning message
+ """
+ import datetime
+
+ nw_frame = nw.from_native(data_tbl)
+
+ # Handle LazyFrames by collecting them first
+ if is_narwhals_lazyframe(nw_frame):
+ nw_frame = nw_frame.collect()
+
+ assert is_narwhals_dataframe(nw_frame)
+
+ result = {
+ "passed": False,
+ "max_datetime": None,
+ "reference_time": None,
+ "age": None,
+ "max_age": max_age,
+ "tz_warning": None,
+ "column_empty": False,
+ }
+
+ # Get the maximum datetime value from the column
+ try:
+ # Use narwhals to get max value
+ max_val_result = nw_frame.select(nw.col(column).max())
+ max_datetime_raw = max_val_result.item()
+
+ if max_datetime_raw is None:
+ result["column_empty"] = True
+ result["passed"] = False
+ return result
+
+ # Convert to Python datetime if needed
+ if hasattr(max_datetime_raw, "to_pydatetime"):
+ # Pandas Timestamp
+ max_datetime = max_datetime_raw.to_pydatetime()
+ elif hasattr(max_datetime_raw, "isoformat"):
+ # Already a datetime-like object
+ max_datetime = max_datetime_raw
+ else:
+ # Try to parse as string or handle other types
+ max_datetime = datetime.datetime.fromisoformat(str(max_datetime_raw))
+
+ result["max_datetime"] = max_datetime
+
+ except Exception as e:
+ result["error"] = str(e)
+ result["passed"] = False
+ return result
+
+ # Determine the reference time
+ # We'll set the reference time after we know the timezone awareness of the data
+ if reference_time is None:
+ ref_time = None # Will be set below based on data timezone awareness
+ else:
+ ref_time = reference_time
+
+ # Handle timezone awareness/naivete
+ max_dt_aware = _is_datetime_aware(max_datetime)
+
+ # Helper to parse timezone string (supports IANA names and offsets like "-7", "-07:00")
+ def _get_tz_from_string(tz_str: str) -> datetime.tzinfo:
+ import re
+
+ # Check for offset formats: "-7", "+5", "-07:00", "+05:30", etc.
+ offset_pattern = r"^([+-]?)(\d{1,2})(?::(\d{2}))?$"
+ match = re.match(offset_pattern, tz_str.strip())
+
+ if match:
+ sign_str, hours_str, minutes_str = match.groups()
+ hours = int(hours_str)
+ minutes = int(minutes_str) if minutes_str else 0
+
+ total_minutes = hours * 60 + minutes
+ if sign_str == "-":
+ total_minutes = -total_minutes
+
+ return datetime.timezone(datetime.timedelta(minutes=total_minutes))
+
+ # Try IANA timezone names (zoneinfo is standard in Python 3.9+)
+ try:
+ return ZoneInfo(tz_str)
+ except KeyError:
+ # Invalid timezone name, fall back to UTC
+ return datetime.timezone.utc
+
+ # If ref_time is None (no reference_time provided), set it based on data awareness
+ if ref_time is None:
+ if max_dt_aware:
+ # Data is timezone-aware, use timezone-aware now
+ if timezone:
+ ref_time = datetime.datetime.now(_get_tz_from_string(timezone))
+ else:
+ # Default to UTC when data is aware but no timezone specified
+ ref_time = datetime.datetime.now(datetime.timezone.utc)
+ else:
+ # Data is naive, use naive local time for comparison
+ if timezone:
+ # If user specified timezone, use it for reference
+ ref_time = datetime.datetime.now(_get_tz_from_string(timezone))
+ else:
+ # No timezone specified and data is naive -> use naive local time
+ ref_time = datetime.datetime.now()
+
+ result["reference_time"] = ref_time
+ ref_dt_aware = _is_datetime_aware(ref_time)
+
+ # Track timezone warnings - use keys for translation lookup
+ tz_warning_key = None
+
+ if max_dt_aware != ref_dt_aware:
+ if not allow_tz_mismatch:
+ if max_dt_aware and not ref_dt_aware:
+ tz_warning_key = "data_freshness_tz_warning_aware_naive"
+ else:
+ tz_warning_key = "data_freshness_tz_warning_naive_aware"
+ result["tz_warning_key"] = tz_warning_key
+
+ # Make both comparable
+ try:
+ if max_dt_aware and not ref_dt_aware:
+ # Add timezone to reference time
+ if timezone:
+ try:
+ ref_time = ref_time.replace(tzinfo=ZoneInfo(timezone))
+ except KeyError:
+ ref_time = ref_time.replace(tzinfo=datetime.timezone.utc)
+ else:
+ # Assume UTC
+ ref_time = ref_time.replace(tzinfo=datetime.timezone.utc)
+
+ elif not max_dt_aware and ref_dt_aware:
+ # Localize the max_datetime if we have a timezone
+ if timezone:
+ try:
+ max_datetime = max_datetime.replace(tzinfo=ZoneInfo(timezone))
+ except KeyError:
+ # Remove timezone from reference for comparison
+ ref_time = ref_time.replace(tzinfo=None)
+ else:
+ # Remove timezone from reference for comparison
+ ref_time = ref_time.replace(tzinfo=None)
+
+ # Calculate the age
+ age = ref_time - max_datetime
+ result["age"] = age
+ result["reference_time"] = ref_time
+
+ # Check if within max_age
+ result["passed"] = age <= max_age
+
+ except Exception as e:
+ result["error"] = str(e)
+ result["passed"] = False
+
+ return result
+
+
+ def _is_datetime_aware(dt: Any) -> bool:
+ """Check if a datetime object is timezone-aware."""
+ if dt is None:
+ return False
+ if hasattr(dt, "tzinfo"):
+ return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
+ return False
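The inner _get_tz_from_string helper added in data_freshness() accepts both bare UTC offsets (e.g. "-7", "+05:30") and IANA zone names, falling back to UTC when a name is unknown. A standalone sketch of that parsing logic, shown only to illustrate the behavior:

    import datetime
    import re
    from zoneinfo import ZoneInfo

    def parse_tz(tz_str: str) -> datetime.tzinfo:
        match = re.match(r"^([+-]?)(\d{1,2})(?::(\d{2}))?$", tz_str.strip())
        if match:
            sign, hours, minutes = match.groups()
            total = int(hours) * 60 + (int(minutes) if minutes else 0)
            if sign == "-":
                total = -total
            return datetime.timezone(datetime.timedelta(minutes=total))
        try:
            return ZoneInfo(tz_str)  # ZoneInfoNotFoundError subclasses KeyError
        except KeyError:
            return datetime.timezone.utc

    # parse_tz("-7")             # UTC-07:00
    # parse_tz("+05:30")         # UTC+05:30
    # parse_tz("Europe/Berlin")  # IANA zone
    # parse_tz("not-a-zone")     # falls back to UTC

In data_freshness() the resulting tzinfo then feeds datetime.datetime.now(...) when no reference_time is supplied.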