pointblank 0.17.0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -15,8 +15,9 @@ from enum import Enum
15
15
  from functools import partial
16
16
  from importlib.metadata import version
17
17
  from pathlib import Path
18
- from typing import TYPE_CHECKING, Any, Callable, Literal
18
+ from typing import TYPE_CHECKING, Any, Callable, Literal, NoReturn, ParamSpec, TypeVar
19
19
  from zipfile import ZipFile
20
+ from zoneinfo import ZoneInfo
20
21
 
21
22
  import commonmark
22
23
  import narwhals as nw
@@ -24,8 +25,8 @@ from great_tables import GT, from_column, google_font, html, loc, md, style, val
24
25
  from great_tables.gt import _get_column_of_values
25
26
  from great_tables.vals import fmt_integer, fmt_number
26
27
  from importlib_resources import files
27
- from narwhals.typing import FrameT
28
28
 
29
+ from pointblank._agg import is_valid_agg, load_validation_method_grid, resolve_agg_registries
29
30
  from pointblank._constants import (
30
31
  ASSERTION_TYPE_METHOD_MAP,
31
32
  CHECK_MARK_SPAN,
@@ -92,6 +93,8 @@ from pointblank._utils import (
92
93
  _is_lib_present,
93
94
  _is_narwhals_table,
94
95
  _is_value_a_df,
96
+ _PBUnresolvedColumn,
97
+ _resolve_columns,
95
98
  _select_df_lib,
96
99
  )
97
100
  from pointblank._utils_check_args import (
@@ -102,7 +105,14 @@ from pointblank._utils_check_args import (
102
105
  _check_thresholds,
103
106
  )
104
107
  from pointblank._utils_html import _create_table_dims_html, _create_table_type_html
105
- from pointblank.column import Column, ColumnLiteral, ColumnSelector, ColumnSelectorNarwhals, col
108
+ from pointblank.column import (
109
+ Column,
110
+ ColumnLiteral,
111
+ ColumnSelector,
112
+ ColumnSelectorNarwhals,
113
+ ReferenceColumn,
114
+ col,
115
+ )
106
116
  from pointblank.schema import Schema, _get_schema_validation_info
107
117
  from pointblank.segments import Segment
108
118
  from pointblank.thresholds import (
@@ -113,10 +123,18 @@ from pointblank.thresholds import (
113
123
  _normalize_thresholds_creation,
114
124
  )
115
125
 
126
+ P = ParamSpec("P")
127
+ R = TypeVar("R")
128
+
116
129
  if TYPE_CHECKING:
117
130
  from collections.abc import Collection
131
+ from typing import Any
132
+
133
+ import polars as pl
134
+ from narwhals.typing import IntoDataFrame, IntoFrame
135
+
136
+ from pointblank._typing import AbsoluteBounds, Tolerance, _CompliantValue, _CompliantValues
118
137
 
119
- from pointblank._typing import AbsoluteBounds, Tolerance
120
138
 
121
139
  __all__ = [
122
140
  "Validate",
@@ -135,6 +153,7 @@ __all__ = [
135
153
  "get_validation_summary",
136
154
  ]
137
155
 
156
+
138
157
  # Create a thread-local storage for the metadata
139
158
  _action_context = threading.local()
140
159
 
@@ -424,12 +443,13 @@ def config(
424
443
  global_config.report_incl_footer_timings = report_incl_footer_timings # pragma: no cover
425
444
  global_config.report_incl_footer_notes = report_incl_footer_notes # pragma: no cover
426
445
  global_config.preview_incl_header = preview_incl_header # pragma: no cover
446
+ return global_config # pragma: no cover
427
447
 
428
448
 
429
449
  def load_dataset(
430
450
  dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
431
451
  tbl_type: Literal["polars", "pandas", "duckdb"] = "polars",
432
- ) -> FrameT | Any:
452
+ ) -> Any:
433
453
  """
434
454
  Load a dataset hosted in the library as specified table type.
435
455
 
@@ -450,7 +470,7 @@ def load_dataset(
450
470
 
451
471
  Returns
452
472
  -------
453
- FrameT | Any
473
+ Any
454
474
  The dataset for the `Validate` object. This could be a Polars DataFrame, a Pandas DataFrame,
455
475
  or a DuckDB table as an Ibis table.
456
476
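As the updated signature and return description above indicate, `load_dataset()` still hands back a Polars or Pandas DataFrame, or an Ibis table for DuckDB; only the annotation changed from `FrameT | Any` to `Any`. A minimal usage sketch (assuming pointblank is installed with the Polars and DuckDB extras):

```python
import pointblank as pb

# Bundled datasets: "small_table", "game_revenue", "nycflights", "global_sales"
small_tbl = pb.load_dataset(dataset="small_table", tbl_type="polars")
flights = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")  # Ibis table
```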
 
@@ -1523,7 +1543,7 @@ def get_data_path(
1523
1543
  return tmp_file.name
1524
1544
 
1525
1545
 
1526
- def _process_data(data: FrameT | Any) -> FrameT | Any:
1546
+ def _process_data(data: Any) -> Any:
1527
1547
  """
1528
1548
  Centralized data processing pipeline that handles all supported input types.
1529
1549
 
@@ -1540,7 +1560,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:
1540
1560
 
1541
1561
  Parameters
1542
1562
  ----------
1543
- data : FrameT | Any
1563
+ data
1544
1564
  The input data which could be:
1545
1565
  - a DataFrame object (Polars, Pandas, Ibis, etc.)
1546
1566
  - a GitHub URL pointing to a CSV or Parquet file
@@ -1551,7 +1571,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:
1551
1571
 
1552
1572
  Returns
1553
1573
  -------
1554
- FrameT | Any
1574
+ Any
1555
1575
  Processed data as a DataFrame if input was a supported data source type,
1556
1576
  otherwise the original data unchanged.
1557
1577
  """
@@ -1570,7 +1590,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:
1570
1590
  return data
1571
1591
 
1572
1592
 
1573
- def _process_github_url(data: FrameT | Any) -> FrameT | Any:
1593
+ def _process_github_url(data: Any) -> Any:
1574
1594
  """
1575
1595
  Process data parameter to handle GitHub URLs pointing to CSV or Parquet files.
1576
1596
 
@@ -1585,12 +1605,12 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:
1585
1605
 
1586
1606
  Parameters
1587
1607
  ----------
1588
- data : FrameT | Any
1608
+ data
1589
1609
  The data parameter which may be a GitHub URL string or any other data type.
1590
1610
 
1591
1611
  Returns
1592
1612
  -------
1593
- FrameT | Any
1613
+ Any
1594
1614
  If the input is a supported GitHub URL, returns a DataFrame loaded from the downloaded file.
1595
1615
  Otherwise, returns the original data unchanged.
1596
1616
 
@@ -1675,7 +1695,7 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:
1675
1695
  return data
1676
1696
 
1677
1697
 
1678
- def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
1698
+ def _process_connection_string(data: Any) -> Any:
1679
1699
  """
1680
1700
  Process data parameter to handle database connection strings.
1681
1701
 
@@ -1702,7 +1722,7 @@ def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
1702
1722
  return connect_to_table(data)
1703
1723
 
1704
1724
 
1705
- def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
1725
+ def _process_csv_input(data: Any) -> Any:
1706
1726
  """
1707
1727
  Process data parameter to handle CSV file inputs.
1708
1728
 
@@ -1760,7 +1780,7 @@ def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
1760
1780
  )
1761
1781
 
1762
1782
 
1763
- def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
1783
+ def _process_parquet_input(data: Any) -> Any:
1764
1784
  """
1765
1785
  Process data parameter to handle Parquet file inputs.
1766
1786
 
@@ -1903,7 +1923,7 @@ def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
1903
1923
 
1904
1924
 
1905
1925
  def preview(
1906
- data: FrameT | Any,
1926
+ data: Any,
1907
1927
  columns_subset: str | list[str] | Column | None = None,
1908
1928
  n_head: int = 5,
1909
1929
  n_tail: int = 5,
@@ -1911,7 +1931,7 @@ def preview(
1911
1931
  show_row_numbers: bool = True,
1912
1932
  max_col_width: int = 250,
1913
1933
  min_tbl_width: int = 500,
1914
- incl_header: bool = None,
1934
+ incl_header: bool | None = None,
1915
1935
  ) -> GT:
1916
1936
  """
1917
1937
  Display a table preview that shows some rows from the top, some from the bottom.
@@ -2169,7 +2189,7 @@ def preview(
2169
2189
 
2170
2190
 
2171
2191
  def _generate_display_table(
2172
- data: FrameT | Any,
2192
+ data: Any,
2173
2193
  columns_subset: str | list[str] | Column | None = None,
2174
2194
  n_head: int = 5,
2175
2195
  n_tail: int = 5,
@@ -2177,7 +2197,7 @@ def _generate_display_table(
2177
2197
  show_row_numbers: bool = True,
2178
2198
  max_col_width: int = 250,
2179
2199
  min_tbl_width: int = 500,
2180
- incl_header: bool = None,
2200
+ incl_header: bool | None = None,
2181
2201
  mark_missing_values: bool = True,
2182
2202
  row_number_list: list[int] | None = None,
2183
2203
  ) -> GT:
@@ -2274,7 +2294,8 @@ def _generate_display_table(
2274
2294
  tbl_schema = Schema(tbl=data)
2275
2295
 
2276
2296
  # Get the row count for the table
2277
- ibis_rows = data.count()
2297
+ # Note: ibis tables have count(), to_polars(), to_pandas() methods
2298
+ ibis_rows = data.count() # type: ignore[union-attr]
2278
2299
  n_rows = ibis_rows.to_polars() if df_lib_name_gt == "polars" else int(ibis_rows.to_pandas())
2279
2300
 
2280
2301
  # If n_head + n_tail is greater than the row count, display the entire table
@@ -2283,11 +2304,11 @@ def _generate_display_table(
2283
2304
  data_subset = data
2284
2305
 
2285
2306
  if row_number_list is None:
2286
- row_number_list = range(1, n_rows + 1)
2307
+ row_number_list = list(range(1, n_rows + 1))
2287
2308
  else:
2288
2309
  # Get the first n and last n rows of the table
2289
- data_head = data.head(n_head)
2290
- data_tail = data.filter(
2310
+ data_head = data.head(n_head) # type: ignore[union-attr]
2311
+ data_tail = data.filter( # type: ignore[union-attr]
2291
2312
  [ibis.row_number() >= (n_rows - n_tail), ibis.row_number() <= n_rows]
2292
2313
  )
2293
2314
  data_subset = data_head.union(data_tail)
@@ -2299,9 +2320,9 @@ def _generate_display_table(
2299
2320
 
2300
2321
  # Convert either to Polars or Pandas depending on the available library
2301
2322
  if df_lib_name_gt == "polars":
2302
- data = data_subset.to_polars()
2323
+ data = data_subset.to_polars() # type: ignore[union-attr]
2303
2324
  else:
2304
- data = data_subset.to_pandas()
2325
+ data = data_subset.to_pandas() # type: ignore[union-attr]
2305
2326
 
2306
2327
  # From a DataFrame:
2307
2328
  # - get the row count
@@ -2312,17 +2333,18 @@ def _generate_display_table(
2312
2333
  tbl_schema = Schema(tbl=data)
2313
2334
 
2314
2335
  if tbl_type == "polars":
2315
- n_rows = int(data.height)
2336
+ # Note: polars DataFrames have height, head(), tail() attributes
2337
+ n_rows = int(data.height) # type: ignore[union-attr]
2316
2338
 
2317
2339
  # If n_head + n_tail is greater than the row count, display the entire table
2318
2340
  if n_head + n_tail >= n_rows:
2319
2341
  full_dataset = True
2320
2342
 
2321
2343
  if row_number_list is None:
2322
- row_number_list = range(1, n_rows + 1)
2344
+ row_number_list = list(range(1, n_rows + 1))
2323
2345
 
2324
2346
  else:
2325
- data = pl.concat([data.head(n=n_head), data.tail(n=n_tail)])
2347
+ data = pl.concat([data.head(n=n_head), data.tail(n=n_tail)]) # type: ignore[union-attr]
2326
2348
 
2327
2349
  if row_number_list is None:
2328
2350
  row_number_list = list(range(1, n_head + 1)) + list(
@@ -2330,40 +2352,42 @@ def _generate_display_table(
2330
2352
  )
2331
2353
 
2332
2354
  if tbl_type == "pandas":
2333
- n_rows = data.shape[0]
2355
+ # Note: pandas DataFrames have shape, head(), tail() attributes
2356
+ n_rows = data.shape[0] # type: ignore[union-attr]
2334
2357
 
2335
2358
  # If n_head + n_tail is greater than the row count, display the entire table
2336
2359
  if n_head + n_tail >= n_rows:
2337
2360
  full_dataset = True
2338
2361
  data_subset = data
2339
2362
 
2340
- row_number_list = range(1, n_rows + 1)
2363
+ row_number_list = list(range(1, n_rows + 1))
2341
2364
  else:
2342
- data = pd.concat([data.head(n=n_head), data.tail(n=n_tail)])
2365
+ data = pd.concat([data.head(n=n_head), data.tail(n=n_tail)]) # type: ignore[union-attr]
2343
2366
 
2344
2367
  row_number_list = list(range(1, n_head + 1)) + list(
2345
2368
  range(n_rows - n_tail + 1, n_rows + 1)
2346
2369
  )
2347
2370
 
2348
2371
  if tbl_type == "pyspark":
2349
- n_rows = data.count()
2372
+ # Note: pyspark DataFrames have count(), toPandas(), limit(), tail(), sparkSession
2373
+ n_rows = data.count() # type: ignore[union-attr]
2350
2374
 
2351
2375
  # If n_head + n_tail is greater than the row count, display the entire table
2352
2376
  if n_head + n_tail >= n_rows:
2353
2377
  full_dataset = True
2354
2378
  # Convert to pandas for Great Tables compatibility
2355
- data = data.toPandas()
2379
+ data = data.toPandas() # type: ignore[union-attr]
2356
2380
 
2357
- row_number_list = range(1, n_rows + 1)
2381
+ row_number_list = list(range(1, n_rows + 1))
2358
2382
  else:
2359
2383
  # Get head and tail samples, then convert to pandas
2360
- head_data = data.limit(n_head).toPandas()
2384
+ head_data = data.limit(n_head).toPandas() # type: ignore[union-attr]
2361
2385
 
2362
2386
  # PySpark tail() returns a list of Row objects, need to convert to DataFrame
2363
- tail_rows = data.tail(n_tail)
2387
+ tail_rows = data.tail(n_tail) # type: ignore[union-attr]
2364
2388
  if tail_rows:
2365
2389
  # Convert list of Row objects back to DataFrame, then to pandas
2366
- tail_df = data.sparkSession.createDataFrame(tail_rows, data.schema)
2390
+ tail_df = data.sparkSession.createDataFrame(tail_rows, data.schema) # type: ignore[union-attr]
2367
2391
  tail_data = tail_df.toPandas()
2368
2392
  else:
2369
2393
  # If no tail data, create empty DataFrame with same schema
@@ -2391,14 +2415,14 @@ def _generate_display_table(
2391
2415
  tbl_schema = Schema(tbl=data)
2392
2416
 
2393
2417
  # From the table schema, get a list of tuples containing column names and data types
2394
- col_dtype_dict = tbl_schema.columns
2418
+ col_dtype_list = tbl_schema.columns or []
2395
2419
 
2396
2420
  # Extract the column names from the list of tuples (first element of each tuple)
2397
- col_names = [col[0] for col in col_dtype_dict]
2421
+ col_names = [col[0] for col in col_dtype_list]
2398
2422
 
2399
2423
  # Iterate over the list of tuples and create a new dictionary with the
2400
2424
  # column names and data types
2401
- col_dtype_dict = {k: v for k, v in col_dtype_dict}
2425
+ col_dtype_dict = {k: v for k, v in col_dtype_list}
2402
2426
 
2403
2427
  # Create short versions of the data types by omitting any text in parentheses
2404
2428
  col_dtype_dict_short = {
@@ -2497,21 +2521,21 @@ def _generate_display_table(
2497
2521
  # Prepend a column that contains the row numbers if `show_row_numbers=True`
2498
2522
  if show_row_numbers or has_leading_row_num_col:
2499
2523
  if has_leading_row_num_col:
2500
- row_number_list = data["_row_num_"].to_list()
2524
+ row_number_list = data["_row_num_"].to_list() # type: ignore[union-attr]
2501
2525
 
2502
2526
  else:
2503
2527
  if df_lib_name_gt == "polars":
2504
2528
  import polars as pl
2505
2529
 
2506
2530
  row_number_series = pl.Series("_row_num_", row_number_list)
2507
- data = data.insert_column(0, row_number_series)
2531
+ data = data.insert_column(0, row_number_series) # type: ignore[union-attr]
2508
2532
 
2509
2533
  if df_lib_name_gt == "pandas":
2510
- data.insert(0, "_row_num_", row_number_list)
2534
+ data.insert(0, "_row_num_", row_number_list) # type: ignore[union-attr]
2511
2535
 
2512
2536
  if df_lib_name_gt == "pyspark":
2513
2537
  # For PySpark converted to pandas, use pandas method
2514
- data.insert(0, "_row_num_", row_number_list)
2538
+ data.insert(0, "_row_num_", row_number_list) # type: ignore[union-attr]
2515
2539
 
2516
2540
  # Get the highest number in the `row_number_list` and calculate a width that will
2517
2541
  # safely fit a number of that magnitude
@@ -2620,7 +2644,7 @@ def _generate_display_table(
2620
2644
  return gt_tbl
2621
2645
 
2622
2646
 
2623
- def missing_vals_tbl(data: FrameT | Any) -> GT:
2647
+ def missing_vals_tbl(data: Any) -> GT:
2624
2648
  """
2625
2649
  Display a table that shows the missing values in the input table.
2626
2650
 
@@ -3221,7 +3245,7 @@ def _get_column_names_safe(data: Any) -> list[str]:
3221
3245
  return list(data.columns) # pragma: no cover
3222
3246
 
3223
3247
 
3224
- def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
3248
+ def _get_column_names(data: Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
3225
3249
  if ibis_tbl:
3226
3250
  return data.columns if df_lib_name_gt == "polars" else list(data.columns)
3227
3251
 
@@ -3245,12 +3269,10 @@ def _validate_columns_subset(
3245
3269
  )
3246
3270
  return columns_subset
3247
3271
 
3248
- return columns_subset.resolve(columns=col_names)
3272
+ return columns_subset.resolve(columns=col_names) # type: ignore[union-attr]
3249
3273
 
3250
3274
 
3251
- def _select_columns(
3252
- data: FrameT | Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str
3253
- ) -> FrameT | Any:
3275
+ def _select_columns(data: Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str) -> Any:
3254
3276
  if ibis_tbl:
3255
3277
  return data[resolved_columns]
3256
3278
  if tbl_type == "polars":
@@ -3258,7 +3280,7 @@ def _select_columns(
3258
3280
  return data[resolved_columns]
3259
3281
 
3260
3282
 
3261
- def get_column_count(data: FrameT | Any) -> int:
3283
+ def get_column_count(data: Any) -> int:
3262
3284
  """
3263
3285
  Get the number of columns in a table.
3264
3286
 
@@ -3470,7 +3492,7 @@ def _extract_enum_values(set_values: Any) -> list[Any]:
3470
3492
  return [set_values]
3471
3493
 
3472
3494
 
3473
- def get_row_count(data: FrameT | Any) -> int:
3495
+ def get_row_count(data: Any) -> int:
3474
3496
  """
3475
3497
  Get the number of rows in a table.
3476
3498
 
@@ -3723,18 +3745,46 @@ class _ValidationInfo:
3723
3745
  insertion order, ensuring notes appear in a consistent sequence in reports and logs.
3724
3746
  """
3725
3747
 
3748
+ @classmethod
3749
+ def from_agg_validator(
3750
+ cls,
3751
+ assertion_type: str,
3752
+ columns: _PBUnresolvedColumn,
3753
+ value: float | Column | ReferenceColumn,
3754
+ tol: Tolerance = 0,
3755
+ thresholds: float | bool | tuple | dict | Thresholds | None = None,
3756
+ brief: str | bool = False,
3757
+ actions: Actions | None = None,
3758
+ active: bool = True,
3759
+ ) -> _ValidationInfo:
3760
+ # This factory method creates a `_ValidationInfo` instance for aggregate
3761
+ # methods. All agg methods share the same signature, so rather than
3762
+ # instantiating the class directly each time, this factory reduces
3763
+ # redundancy, boilerplate, and the chance of mistakes.
3764
+ _check_thresholds(thresholds=thresholds)
3765
+
3766
+ return cls(
3767
+ assertion_type=assertion_type,
3768
+ column=_resolve_columns(columns),
3769
+ values={"value": value, "tol": tol},
3770
+ thresholds=_normalize_thresholds_creation(thresholds),
3771
+ brief=_transform_auto_brief(brief=brief),
3772
+ actions=actions,
3773
+ active=active,
3774
+ )
3775
+
3726
3776
  # Validation plan
3727
3777
  i: int | None = None
3728
3778
  i_o: int | None = None
3729
3779
  step_id: str | None = None
3730
3780
  sha1: str | None = None
3731
3781
  assertion_type: str | None = None
3732
- column: any | None = None
3733
- values: any | list[any] | tuple | None = None
3782
+ column: Any | None = None
3783
+ values: Any | list[Any] | tuple | None = None
3734
3784
  inclusive: tuple[bool, bool] | None = None
3735
3785
  na_pass: bool | None = None
3736
3786
  pre: Callable | None = None
3737
- segments: any | None = None
3787
+ segments: Any | None = None
3738
3788
  thresholds: Thresholds | None = None
3739
3789
  actions: Actions | None = None
3740
3790
  label: str | None = None
@@ -3753,14 +3803,14 @@ class _ValidationInfo:
3753
3803
  error: bool | None = None
3754
3804
  critical: bool | None = None
3755
3805
  failure_text: str | None = None
3756
- tbl_checked: FrameT | None = None
3757
- extract: FrameT | None = None
3758
- val_info: dict[str, any] | None = None
3806
+ tbl_checked: Any = None
3807
+ extract: Any = None
3808
+ val_info: dict[str, Any] | None = None
3759
3809
  time_processed: str | None = None
3760
3810
  proc_duration_s: float | None = None
3761
3811
  notes: dict[str, dict[str, str]] | None = None
3762
3812
 
3763
- def get_val_info(self) -> dict[str, any]:
3813
+ def get_val_info(self) -> dict[str, Any] | None:
3764
3814
  return self.val_info
3765
3815
 
3766
3816
  def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
@@ -3936,7 +3986,7 @@ class _ValidationInfo:
3936
3986
  return self.notes is not None and len(self.notes) > 0
3937
3987
 
3938
3988
 
3939
- def _handle_connection_errors(e: Exception, connection_string: str) -> None:
3989
+ def _handle_connection_errors(e: Exception, connection_string: str) -> NoReturn:
3940
3990
  """
3941
3991
  Shared error handling for database connection failures.
3942
3992
 
@@ -4301,6 +4351,18 @@ class Validate:
4301
4351
  locale's rules. Examples include `"en-US"` for English (United States) and `"fr-FR"` for
4302
4352
  French (France). More simply, this can be a language identifier without a designation of
4303
4353
  territory, like `"es"` for Spanish.
4354
+ owner
4355
+ An optional string identifying the owner of the data being validated. This is useful for
4356
+ governance purposes, indicating who is responsible for the quality and maintenance of the
4357
+ data. For example, `"data-platform-team"` or `"analytics-engineering"`.
4358
+ consumers
4359
+ An optional string or list of strings identifying who depends on or consumes this data.
4360
+ This helps document data dependencies and can be useful for impact analysis when data
4361
+ quality issues are detected. For example, `"ml-team"` or `["ml-team", "analytics"]`.
4362
+ version
4363
+ An optional string representing the version of the validation plan or data contract. This
4364
+ supports semantic versioning (e.g., `"1.0.0"`, `"2.1.0"`) and is useful for tracking changes
4365
+ to validation rules over time and for organizational governance.
4304
4366
 
4305
4367
  Returns
4306
4368
  -------
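A short sketch of how the three new governance parameters documented above are passed at construction time (the table comes from `load_dataset()` purely for illustration):

```python
import pointblank as pb

validation = pb.Validate(
    data=pb.load_dataset("small_table", tbl_type="polars"),
    tbl_name="small_table",
    owner="data-platform-team",          # responsible team
    consumers=["ml-team", "analytics"],  # downstream users of the data
    version="1.0.0",                     # version of this validation plan
)
```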
@@ -4777,7 +4839,8 @@ class Validate:
4777
4839
  when table specifications are missing or backend dependencies are not installed.
4778
4840
  """
4779
4841
 
4780
- data: FrameT | Any
4842
+ data: IntoDataFrame
4843
+ reference: IntoFrame | None = None
4781
4844
  tbl_name: str | None = None
4782
4845
  label: str | None = None
4783
4846
  thresholds: int | float | bool | tuple | dict | Thresholds | None = None
@@ -4786,11 +4849,18 @@ class Validate:
4786
4849
  brief: str | bool | None = None
4787
4850
  lang: str | None = None
4788
4851
  locale: str | None = None
4852
+ owner: str | None = None
4853
+ consumers: str | list[str] | None = None
4854
+ version: str | None = None
4789
4855
 
4790
4856
  def __post_init__(self):
4791
4857
  # Process data through the centralized data processing pipeline
4792
4858
  self.data = _process_data(self.data)
4793
4859
 
4860
+ # Process reference data if provided
4861
+ if self.reference is not None:
4862
+ self.reference = _process_data(self.reference)
4863
+
4794
4864
  # Check input of the `thresholds=` argument
4795
4865
  _check_thresholds(thresholds=self.thresholds)
4796
4866
 
@@ -4826,6 +4896,36 @@ class Validate:
4826
4896
  # Transform any shorthands of `brief` to string representations
4827
4897
  self.brief = _transform_auto_brief(brief=self.brief)
4828
4898
 
4899
+ # Validate and normalize the `owner` parameter
4900
+ if self.owner is not None and not isinstance(self.owner, str):
4901
+ raise TypeError(
4902
+ "The `owner=` parameter must be a string representing the owner of the data. "
4903
+ f"Received type: {type(self.owner).__name__}"
4904
+ )
4905
+
4906
+ # Validate and normalize the `consumers` parameter
4907
+ if self.consumers is not None:
4908
+ if isinstance(self.consumers, str):
4909
+ self.consumers = [self.consumers]
4910
+ elif isinstance(self.consumers, list):
4911
+ if not all(isinstance(c, str) for c in self.consumers):
4912
+ raise TypeError(
4913
+ "The `consumers=` parameter must be a string or a list of strings. "
4914
+ "All elements in the list must be strings."
4915
+ )
4916
+ else:
4917
+ raise TypeError(
4918
+ "The `consumers=` parameter must be a string or a list of strings. "
4919
+ f"Received type: {type(self.consumers).__name__}"
4920
+ )
4921
+
4922
+ # Validate the `version` parameter
4923
+ if self.version is not None and not isinstance(self.version, str):
4924
+ raise TypeError(
4925
+ "The `version=` parameter must be a string representing the version. "
4926
+ f"Received type: {type(self.version).__name__}"
4927
+ )
4928
+
4829
4929
  # TODO: Add functionality to obtain the column names and types from the table
4830
4930
  self.col_names = None
4831
4931
  self.col_types = None
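To make the type checks above concrete: a single string for `consumers=` is normalized to a one-element list, while anything that is not a string or a list of strings raises `TypeError`. An illustrative sketch based on the `__post_init__` logic shown:

```python
import pointblank as pb

tbl = pb.load_dataset("small_table")

v = pb.Validate(data=tbl, consumers="ml-team")
print(v.consumers)  # ['ml-team'] after normalization

try:
    pb.Validate(data=tbl, consumers=42)  # not a str or list[str]
except TypeError as exc:
    print(exc)  # "The `consumers=` parameter must be a string or a list of strings. ..."
```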
@@ -4835,9 +4935,107 @@ class Validate:
4835
4935
 
4836
4936
  self.validation_info = []
4837
4937
 
4938
+ def _add_agg_validation(
4939
+ self,
4940
+ *,
4941
+ assertion_type: str,
4942
+ columns: str | Collection[str],
4943
+ value,
4944
+ tol=0,
4945
+ thresholds=None,
4946
+ brief=False,
4947
+ actions=None,
4948
+ active=True,
4949
+ ):
4950
+ """
4951
+ Add an aggregation-based validation step to the validation plan.
4952
+
4953
+ This internal method is used by all aggregation-based column validation methods
4954
+ (e.g., `col_sum_eq`, `col_avg_gt`, `col_sd_le`) to create and register validation
4955
+ steps. It relies heavily on the `_ValidationInfo.from_agg_validator()` class method.
4956
+
4957
+ Automatic Reference Inference
4958
+ -----------------------------
4959
+ When `value` is None and reference data has been set on the Validate object,
4960
+ this method automatically creates a `ReferenceColumn` pointing to the same
4961
+ column name in the reference data. This enables a convenient shorthand:
4962
+
4963
+ .. code-block:: python
4964
+
4965
+ # Instead of writing:
4966
+ Validate(data=df, reference=ref_df).col_sum_eq("a", ref("a"))
4967
+
4968
+ # You can simply write:
4969
+ Validate(data=df, reference=ref_df).col_sum_eq("a")
4970
+
4971
+ If `value` is None and no reference data is set, a `ValueError` is raised
4972
+ immediately to provide clear feedback to the user.
4973
+
4974
+ Parameters
4975
+ ----------
4976
+ assertion_type
4977
+ The type of assertion (e.g., "col_sum_eq", "col_avg_gt").
4978
+ columns
4979
+ Column name or collection of column names to validate.
4980
+ value
4981
+ The target value to compare against. Can be:
4982
+ - A numeric literal (int or float)
4983
+ - A `Column` object for cross-column comparison
4984
+ - A `ReferenceColumn` object for reference data comparison
4985
+ - None to automatically use `ref(column)` when reference data is set
4986
+ tol
4987
+ Tolerance for the comparison. Defaults to 0.
4988
+ thresholds
4989
+ Custom thresholds for the validation step.
4990
+ brief
4991
+ Brief description or auto-generate flag.
4992
+ actions
4993
+ Actions to take based on validation results.
4994
+ active
4995
+ Whether this validation step is active.
4996
+
4997
+ Returns
4998
+ -------
4999
+ Validate
5000
+ The Validate instance for method chaining.
5001
+
5002
+ Raises
5003
+ ------
5004
+ ValueError
5005
+ If `value` is None and no reference data is set on the Validate object.
5006
+ """
5007
+ if isinstance(columns, str):
5008
+ columns = [columns]
5009
+ for column in columns:
5010
+ # If value is None, default to referencing the same column from reference data
5011
+ resolved_value = value
5012
+ if value is None:
5013
+ if self.reference is None:
5014
+ raise ValueError(
5015
+ f"The 'value' parameter is required for {assertion_type}() "
5016
+ "when no reference data is set. Either provide a value, or "
5017
+ "set reference data on the Validate object using "
5018
+ "Validate(data=..., reference=...)."
5019
+ )
5020
+ resolved_value = ReferenceColumn(column_name=column)
5021
+
5022
+ val_info = _ValidationInfo.from_agg_validator(
5023
+ assertion_type=assertion_type,
5024
+ columns=column,
5025
+ value=resolved_value,
5026
+ tol=tol,
5027
+ thresholds=self.thresholds if thresholds is None else thresholds,
5028
+ actions=self.actions if actions is None else actions,
5029
+ brief=self.brief if brief is None else brief,
5030
+ active=active,
5031
+ )
5032
+ self._add_validation(validation_info=val_info)
5033
+
5034
+ return self
5035
+
4838
5036
  def set_tbl(
4839
5037
  self,
4840
- tbl: FrameT | Any,
5038
+ tbl: Any,
4841
5039
  tbl_name: str | None = None,
4842
5040
  label: str | None = None,
4843
5041
  ) -> Validate:
@@ -4980,7 +5178,7 @@ class Validate:
4980
5178
  na_pass: bool = False,
4981
5179
  pre: Callable | None = None,
4982
5180
  segments: SegmentSpec | None = None,
4983
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
5181
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
4984
5182
  actions: Actions | None = None,
4985
5183
  brief: str | bool | None = None,
4986
5184
  active: bool = True,
@@ -5214,7 +5412,6 @@ class Validate:
5214
5412
  - Row 1: `c` is `1` and `b` is `2`.
5215
5413
  - Row 3: `c` is `2` and `b` is `2`.
5216
5414
  """
5217
-
5218
5415
  assertion_type = _get_fn_name()
5219
5416
 
5220
5417
  _check_column(column=columns)
@@ -5234,14 +5431,7 @@ class Validate:
5234
5431
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
5235
5432
  )
5236
5433
 
5237
- # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
5238
- # resolve the columns
5239
- if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
5240
- columns = col(columns)
5241
-
5242
- # If `columns` is Column value or a string, place it in a list for iteration
5243
- if isinstance(columns, (Column, str)):
5244
- columns = [columns]
5434
+ columns = _resolve_columns(columns)
5245
5435
 
5246
5436
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
5247
5437
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
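The inline branches deleted above hint at what the new `_resolve_columns()` helper (imported from `pointblank._utils` near the top of this diff) is likely doing. The following is a reconstruction inferred from that removed code, not the helper's actual implementation:

```python
import narwhals as nw
from pointblank.column import Column, ColumnSelector, col


def _resolve_columns_sketch(columns):
    # A ColumnSelector or Narwhals selector is wrapped with `col()` so the
    # matching column names can be resolved later, at interrogation time.
    if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
        columns = col(columns)

    # A single Column object or plain string is wrapped in a list for iteration.
    if isinstance(columns, (Column, str)):
        columns = [columns]

    return columns
```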
@@ -5272,7 +5462,7 @@ class Validate:
5272
5462
  na_pass: bool = False,
5273
5463
  pre: Callable | None = None,
5274
5464
  segments: SegmentSpec | None = None,
5275
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
5465
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
5276
5466
  actions: Actions | None = None,
5277
5467
  brief: str | bool | None = None,
5278
5468
  active: bool = True,
@@ -5563,7 +5753,7 @@ class Validate:
5563
5753
  na_pass: bool = False,
5564
5754
  pre: Callable | None = None,
5565
5755
  segments: SegmentSpec | None = None,
5566
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
5756
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
5567
5757
  actions: Actions | None = None,
5568
5758
  brief: str | bool | None = None,
5569
5759
  active: bool = True,
@@ -5854,7 +6044,7 @@ class Validate:
5854
6044
  na_pass: bool = False,
5855
6045
  pre: Callable | None = None,
5856
6046
  segments: SegmentSpec | None = None,
5857
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
6047
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
5858
6048
  actions: Actions | None = None,
5859
6049
  brief: str | bool | None = None,
5860
6050
  active: bool = True,
@@ -6143,7 +6333,7 @@ class Validate:
6143
6333
  na_pass: bool = False,
6144
6334
  pre: Callable | None = None,
6145
6335
  segments: SegmentSpec | None = None,
6146
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
6336
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
6147
6337
  actions: Actions | None = None,
6148
6338
  brief: str | bool | None = None,
6149
6339
  active: bool = True,
@@ -6435,7 +6625,7 @@ class Validate:
6435
6625
  na_pass: bool = False,
6436
6626
  pre: Callable | None = None,
6437
6627
  segments: SegmentSpec | None = None,
6438
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
6628
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
6439
6629
  actions: Actions | None = None,
6440
6630
  brief: str | bool | None = None,
6441
6631
  active: bool = True,
@@ -6729,7 +6919,7 @@ class Validate:
6729
6919
  na_pass: bool = False,
6730
6920
  pre: Callable | None = None,
6731
6921
  segments: SegmentSpec | None = None,
6732
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
6922
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
6733
6923
  actions: Actions | None = None,
6734
6924
  brief: str | bool | None = None,
6735
6925
  active: bool = True,
@@ -7049,7 +7239,7 @@ class Validate:
7049
7239
  na_pass: bool = False,
7050
7240
  pre: Callable | None = None,
7051
7241
  segments: SegmentSpec | None = None,
7052
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
7242
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
7053
7243
  actions: Actions | None = None,
7054
7244
  brief: str | bool | None = None,
7055
7245
  active: bool = True,
@@ -7366,7 +7556,7 @@ class Validate:
7366
7556
  set: Collection[Any],
7367
7557
  pre: Callable | None = None,
7368
7558
  segments: SegmentSpec | None = None,
7369
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
7559
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
7370
7560
  actions: Actions | None = None,
7371
7561
  brief: str | bool | None = None,
7372
7562
  active: bool = True,
@@ -7683,7 +7873,7 @@ class Validate:
7683
7873
  set: Collection[Any],
7684
7874
  pre: Callable | None = None,
7685
7875
  segments: SegmentSpec | None = None,
7686
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
7876
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
7687
7877
  actions: Actions | None = None,
7688
7878
  brief: str | bool | None = None,
7689
7879
  active: bool = True,
@@ -7974,7 +8164,7 @@ class Validate:
7974
8164
  na_pass: bool = False,
7975
8165
  pre: Callable | None = None,
7976
8166
  segments: SegmentSpec | None = None,
7977
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
8167
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
7978
8168
  actions: Actions | None = None,
7979
8169
  brief: str | bool | None = None,
7980
8170
  active: bool = True,
@@ -8162,7 +8352,7 @@ class Validate:
8162
8352
  na_pass: bool = False,
8163
8353
  pre: Callable | None = None,
8164
8354
  segments: SegmentSpec | None = None,
8165
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
8355
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
8166
8356
  actions: Actions | None = None,
8167
8357
  brief: str | bool | None = None,
8168
8358
  active: bool = True,
@@ -8347,7 +8537,7 @@ class Validate:
8347
8537
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
8348
8538
  pre: Callable | None = None,
8349
8539
  segments: SegmentSpec | None = None,
8350
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
8540
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
8351
8541
  actions: Actions | None = None,
8352
8542
  brief: str | bool | None = None,
8353
8543
  active: bool = True,
@@ -8590,7 +8780,7 @@ class Validate:
8590
8780
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
8591
8781
  pre: Callable | None = None,
8592
8782
  segments: SegmentSpec | None = None,
8593
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
8783
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
8594
8784
  actions: Actions | None = None,
8595
8785
  brief: str | bool | None = None,
8596
8786
  active: bool = True,
@@ -8836,7 +9026,7 @@ class Validate:
8836
9026
  inverse: bool = False,
8837
9027
  pre: Callable | None = None,
8838
9028
  segments: SegmentSpec | None = None,
8839
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
9029
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
8840
9030
  actions: Actions | None = None,
8841
9031
  brief: str | bool | None = None,
8842
9032
  active: bool = True,
@@ -9099,7 +9289,7 @@ class Validate:
9099
9289
  na_pass: bool = False,
9100
9290
  pre: Callable | None = None,
9101
9291
  segments: SegmentSpec | None = None,
9102
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
9292
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
9103
9293
  actions: Actions | None = None,
9104
9294
  brief: str | bool | None = None,
9105
9295
  active: bool = True,
@@ -9379,10 +9569,10 @@ class Validate:
9379
9569
 
9380
9570
  def col_vals_expr(
9381
9571
  self,
9382
- expr: any,
9572
+ expr: Any,
9383
9573
  pre: Callable | None = None,
9384
9574
  segments: SegmentSpec | None = None,
9385
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
9575
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
9386
9576
  actions: Actions | None = None,
9387
9577
  brief: str | bool | None = None,
9388
9578
  active: bool = True,
@@ -9600,7 +9790,7 @@ class Validate:
9600
9790
  def col_exists(
9601
9791
  self,
9602
9792
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
9603
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
9793
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
9604
9794
  actions: Actions | None = None,
9605
9795
  brief: str | bool | None = None,
9606
9796
  active: bool = True,
@@ -10072,7 +10262,7 @@ class Validate:
10072
10262
  columns_subset: str | list[str] | None = None,
10073
10263
  pre: Callable | None = None,
10074
10264
  segments: SegmentSpec | None = None,
10075
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
10265
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
10076
10266
  actions: Actions | None = None,
10077
10267
  brief: str | bool | None = None,
10078
10268
  active: bool = True,
@@ -10313,7 +10503,7 @@ class Validate:
10313
10503
  columns_subset: str | list[str] | None = None,
10314
10504
  pre: Callable | None = None,
10315
10505
  segments: SegmentSpec | None = None,
10316
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
10506
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
10317
10507
  actions: Actions | None = None,
10318
10508
  brief: str | bool | None = None,
10319
10509
  active: bool = True,
@@ -10558,7 +10748,7 @@ class Validate:
10558
10748
  max_concurrent: int = 3,
10559
10749
  pre: Callable | None = None,
10560
10750
  segments: SegmentSpec | None = None,
10561
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
10751
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
10562
10752
  actions: Actions | None = None,
10563
10753
  brief: str | bool | None = None,
10564
10754
  active: bool = True,
@@ -10953,7 +11143,7 @@ class Validate:
10953
11143
  case_sensitive_dtypes: bool = True,
10954
11144
  full_match_dtypes: bool = True,
10955
11145
  pre: Callable | None = None,
10956
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
11146
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
10957
11147
  actions: Actions | None = None,
10958
11148
  brief: str | bool | None = None,
10959
11149
  active: bool = True,
@@ -11169,11 +11359,11 @@ class Validate:
11169
11359
 
11170
11360
  def row_count_match(
11171
11361
  self,
11172
- count: int | FrameT | Any,
11362
+ count: int | Any,
11173
11363
  tol: Tolerance = 0,
11174
11364
  inverse: bool = False,
11175
11365
  pre: Callable | None = None,
11176
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
11366
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11177
11367
  actions: Actions | None = None,
11178
11368
  brief: str | bool | None = None,
11179
11369
  active: bool = True,
@@ -11386,12 +11576,375 @@ class Validate:
11386
11576
 
11387
11577
  return self
11388
11578
 
11579
+ def data_freshness(
11580
+ self,
11581
+ column: str,
11582
+ max_age: str | datetime.timedelta,
11583
+ reference_time: datetime.datetime | str | None = None,
11584
+ timezone: str | None = None,
11585
+ allow_tz_mismatch: bool = False,
11586
+ pre: Callable | None = None,
11587
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11588
+ actions: Actions | None = None,
11589
+ brief: str | bool | None = None,
11590
+ active: bool = True,
11591
+ ) -> Validate:
11592
+ """
11593
+ Validate that data in a datetime column is not older than a specified maximum age.
11594
+
11595
+ The `data_freshness()` validation method checks whether the most recent timestamp in the
11596
+ specified datetime column is within the allowed `max_age=` from the `reference_time=` (which
11597
+ defaults to the current time). This is useful for ensuring data pipelines are delivering
11598
+ fresh data and for enforcing data SLAs.
11599
+
11600
+ This method helps detect stale data by comparing the maximum (most recent) value in a
11601
+ datetime column against an expected freshness threshold.
11602
+
11603
+ Parameters
11604
+ ----------
11605
+ column
11606
+ The name of the datetime column to check for freshness. This column should contain
11607
+ date or datetime values.
11608
+ max_age
11609
+ The maximum allowed age of the data. Can be specified as: (1) a string with a
11610
+ human-readable duration like `"24 hours"`, `"1 day"`, `"30 minutes"`, `"2 weeks"`, etc.
11611
+ (supported units: `seconds`, `minutes`, `hours`, `days`, `weeks`), or (2) a
11612
+ `datetime.timedelta` object for precise control.
11613
+ reference_time
11614
+ The reference point in time to compare against. Defaults to `None`, which uses the
11615
+ current time (UTC if `timezone=` is not specified). Can be: (1) a `datetime.datetime`
11616
+ object (timezone-aware recommended), (2) a string in ISO 8601 format (e.g.,
11617
+ `"2024-01-15T10:30:00"` or `"2024-01-15T10:30:00+05:30"`), or (3) `None` to use the
11618
+ current time.
11619
+ timezone
11620
+ The timezone to use for interpreting the data and reference time. Accepts IANA
11621
+ timezone names (e.g., `"America/New_York"`), hour offsets (e.g., `"-7"`), or ISO 8601
11622
+ offsets (e.g., `"-07:00"`). When `None` (default), naive datetimes are treated as UTC.
11623
+ See the *The `timezone=` Parameter* section for details.
11624
+ allow_tz_mismatch
11625
+ Whether to allow timezone mismatches between the column data and reference time.
11626
+ By default (`False`), a warning note is added when comparing timezone-naive with
11627
+ timezone-aware datetimes. Set to `True` to suppress these warnings.
11628
+ pre
11629
+ An optional preprocessing function or lambda to apply to the data table during
11630
+ interrogation. This function should take a table as input and return a modified table.
11631
+ thresholds
11632
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
11633
+ The thresholds are set at the step level and will override any global thresholds set in
11634
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
11635
+ be set locally and global thresholds (if any) will take effect.
11636
+ actions
11637
+ Optional actions to take when the validation step meets or exceeds any set threshold
11638
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
11639
+ define the actions.
11640
+ brief
11641
+ An optional brief description of the validation step that will be displayed in the
11642
+ reporting table. You can use the templating elements like `"{step}"` to insert
11643
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
11644
+ the entire brief will be automatically generated. If `None` (the default) then there
11645
+ won't be a brief.
11646
+ active
11647
+ A boolean value indicating whether the validation step should be active. Using `False`
11648
+ will make the validation step inactive (still reporting its presence and keeping indexes
11649
+ for the steps unchanged).
11650
+
11651
+ Returns
11652
+ -------
11653
+ Validate
11654
+ The `Validate` object with the added validation step.
11655
+
11656
+ How Timezones Affect Freshness Checks
11657
+ -------------------------------------
11658
+ Freshness validation involves comparing two times: the **data time** (the most recent
11659
+ timestamp in your column) and the **execution time** (when and where the validation runs).
11660
+ Timezone confusion typically arises because these two times may originate from different
11661
+ contexts.
11662
+
11663
+ Consider these common scenarios:
11664
+
11665
+ - your data timestamps are stored in UTC (common for databases), but you're running
11666
+ validation on your laptop in New York (Eastern Time)
11667
+ - you develop and test validation locally, then deploy it to a cloud workflow that runs
11668
+ in UTC—suddenly your 'same' validation behaves differently
11669
+ - your data comes from servers in multiple regions, each recording timestamps in their
11670
+ local timezone
11671
+
11672
+ The `timezone=` parameter exists to solve this problem by establishing a single, explicit
11673
+ timezone context for the freshness comparison. When you specify a timezone, Pointblank
11674
+ interprets both the data timestamps (if naive) and the execution time in that timezone,
11675
+ ensuring consistent behavior whether you run validation on your laptop or in a cloud
11676
+ workflow.
11677
+
11678
+ **Scenario 1: Data has timezone-aware datetimes**
11679
+
11680
+ ```python
11681
+ # Your data column has values like: 2024-01-15 10:30:00+00:00 (UTC)
11682
+ # Comparison is straightforward as both sides have explicit timezones
11683
+ .data_freshness(column="updated_at", max_age="24 hours")
11684
+ ```
11685
+
11686
+ **Scenario 2: Data has naive datetimes (no timezone)**
11687
+
11688
+ ```python
11689
+ # Your data column has values like: 2024-01-15 10:30:00 (no timezone)
11690
+ # Specify the timezone the data was recorded in:
11691
+ .data_freshness(column="updated_at", max_age="24 hours", timezone="America/New_York")
11692
+ ```
11693
+
11694
+ **Scenario 3: Ensuring consistent behavior across environments**
11695
+
11696
+ ```python
11697
+ # Pin the timezone to ensure identical results whether running locally or in the cloud
11698
+ .data_freshness(
11699
+ column="updated_at",
11700
+ max_age="24 hours",
11701
+ timezone="UTC", # Explicit timezone removes environment dependence
11702
+ )
11703
+ ```
11704
+
11705
+ The `timezone=` Parameter
11706
+ ---------------------------
11707
+ The `timezone=` parameter accepts several convenient formats, making it easy to specify
11708
+ timezones in whatever way is most natural for your use case. The following examples
11709
+ illustrate the three supported input styles.
11710
+
11711
+ **IANA Timezone Names** (recommended for regions with daylight saving time):
11712
+
11713
+ ```python
11714
+ timezone="America/New_York" # Eastern Time (handles DST automatically)
11715
+ timezone="Europe/London" # UK time
11716
+ timezone="Asia/Tokyo" # Japan Standard Time
11717
+ timezone="Australia/Sydney" # Australian Eastern Time
11718
+ timezone="UTC" # Coordinated Universal Time
11719
+ ```
11720
+
11721
+ **Simple Hour Offsets** (quick and easy):
11722
+
11723
+ ```python
11724
+ timezone="-7" # UTC-7 (e.g., Mountain Standard Time)
11725
+ timezone="+5" # UTC+5 (e.g., Pakistan Standard Time)
11726
+ timezone="0" # UTC
11727
+ timezone="-12" # UTC-12
11728
+ ```
11729
+
11730
+ **ISO 8601 Offset Format** (precise, including fractional hours):
11731
+
11732
+ ```python
11733
+ timezone="-07:00" # UTC-7
11734
+ timezone="+05:30" # UTC+5:30 (e.g., India Standard Time)
11735
+ timezone="+00:00" # UTC
11736
+ timezone="-09:30" # UTC-9:30
11737
+ ```
11738
+
11739
+ When a timezone is specified:
11740
+
11741
+ - naive datetime values in the column are assumed to be in this timezone.
11742
+ - the reference time (if naive) is assumed to be in this timezone.
11743
+ - the validation report will show times in this timezone.
11744
+
11745
+ When `None` (default):
11746
+
11747
+ - if your column has timezone-aware datetimes, those timezones are used
11748
+ - if your column has naive datetimes, they're treated as UTC
11749
+ - the current time reference uses UTC
11750
+
11751
+ Note that IANA timezone names are preferred when daylight saving time transitions matter, as
11752
+ they automatically handle the offset changes. Fixed offsets like `"-7"` or `"-07:00"` do not
11753
+ account for DST.
11754
+
11755
+ Recommendations for Working with Timestamps
11756
+ -------------------------------------------
11757
+ When working with datetime data, storing timestamps in UTC in your databases is strongly
11758
+ recommended since it provides a consistent reference point regardless of where your data
11759
+ originates or where it's consumed. Using timezone-aware datetimes whenever possible helps
11760
+ avoid ambiguity—when a datetime has an explicit timezone, there's no guessing about what
11761
+ time it actually represents.
11762
+
11763
+ If you're working with naive datetimes (which lack timezone information), always specify the
11764
+ `timezone=` parameter so Pointblank knows how to interpret those values. When providing
11765
+ `reference_time=` as a string, use ISO 8601 format with the timezone offset included (e.g.,
11766
+ `"2024-01-15T10:30:00+00:00"`) to ensure unambiguous parsing. Finally, prefer IANA timezone
11767
+ names (like `"America/New_York"`) over fixed offsets (like `"-05:00"`) when daylight saving
11768
+ time transitions matter, since IANA names automatically handle the twice-yearly offset
11769
+ changes. To see all available IANA timezone names in Python, use
11770
+ `zoneinfo.available_timezones()` from the standard library's `zoneinfo` module.
11771
+
11772
+ Examples
11773
+ --------
11774
+ ```{python}
11775
+ #| echo: false
11776
+ #| output: false
11777
+ import pointblank as pb
11778
+ pb.config(report_incl_header=False, report_incl_footer=False)
11779
+ ```
11780
+
11781
+ The simplest use of `data_freshness()` requires just two arguments: the `column=` containing
11782
+ your timestamps and `max_age=` specifying how old the data can be. In this first example,
11783
+ we create sample data with an `"updated_at"` column containing timestamps from 1, 12, and
11784
+ 20 hours ago. By setting `max_age="24 hours"`, we're asserting that the most recent
11785
+ timestamp should be within 24 hours of the current time. Since the newest record is only
11786
+ 1 hour old, this validation passes.
11787
+
11788
+ ```{python}
11789
+ import pointblank as pb
11790
+ import polars as pl
11791
+ from datetime import datetime, timedelta
11792
+
11793
+ # Create sample data with recent timestamps
11794
+ recent_data = pl.DataFrame({
11795
+ "id": [1, 2, 3],
11796
+ "updated_at": [
11797
+ datetime.now() - timedelta(hours=1),
11798
+ datetime.now() - timedelta(hours=12),
11799
+ datetime.now() - timedelta(hours=20),
11800
+ ]
11801
+ })
11802
+
11803
+ validation = (
11804
+ pb.Validate(data=recent_data)
11805
+ .data_freshness(column="updated_at", max_age="24 hours")
11806
+ .interrogate()
11807
+ )
11808
+
11809
+ validation
11810
+ ```
11811
+
11812
+ The `max_age=` parameter accepts human-readable strings with various time units. You can
11813
+ chain multiple `data_freshness()` calls to check different freshness thresholds
11814
+ simultaneously—useful for tiered SLAs where you might want warnings at 30 minutes but
11815
+ errors at 2 days.
11816
+
11817
+ ```{python}
11818
+ # Check data is fresh within different time windows
11819
+ validation = (
11820
+ pb.Validate(data=recent_data)
11821
+ .data_freshness(column="updated_at", max_age="30 minutes") # Very fresh
11822
+ .data_freshness(column="updated_at", max_age="2 days") # Reasonably fresh
11823
+ .data_freshness(column="updated_at", max_age="1 week") # Within a week
11824
+ .interrogate()
11825
+ )
11826
+
11827
+ validation
11828
+ ```
11829
+
11830
+ When your data contains naive datetimes (timestamps without timezone information), use the
11831
+ `timezone=` parameter to specify what timezone those values represent. Here we have event
11832
+ data recorded in Eastern Time, so we set `timezone="America/New_York"` to ensure the
11833
+ freshness comparison is done correctly.
11834
+
11835
+ ```{python}
11836
+ # Data with naive datetimes (assume they're in Eastern Time)
11837
+ eastern_data = pl.DataFrame({
11838
+ "event_time": [
11839
+ datetime.now() - timedelta(hours=2),
11840
+ datetime.now() - timedelta(hours=5),
11841
+ ]
11842
+ })
11843
+
11844
+ validation = (
11845
+ pb.Validate(data=eastern_data)
11846
+ .data_freshness(
11847
+ column="event_time",
11848
+ max_age="12 hours",
11849
+ timezone="America/New_York" # Interpret times as Eastern
11850
+ )
11851
+ .interrogate()
11852
+ )
11853
+
11854
+ validation
11855
+ ```
11856
+
11857
+ For reproducible validations or historical checks, you can use `reference_time=` to compare
11858
+ against a specific point in time instead of the current time. This is particularly useful
11859
+ for testing or when validating data snapshots. The reference time should include a timezone
11860
+ offset (like `+00:00` for UTC) to avoid ambiguity.
11861
+
11862
+ ```{python}
11863
+ validation = (
11864
+ pb.Validate(data=recent_data)
11865
+ .data_freshness(
11866
+ column="updated_at",
11867
+ max_age="24 hours",
11868
+ reference_time="2024-01-15T12:00:00+00:00"
11869
+ )
11870
+ .interrogate()
11871
+ )
11872
+
11873
+ validation
11874
+ ```
11875
+ """
11876
+
11877
+ assertion_type = _get_fn_name()
11878
+
11879
+ _check_pre(pre=pre)
11880
+ _check_thresholds(thresholds=thresholds)
11881
+ _check_boolean_input(param=active, param_name="active")
11882
+ _check_boolean_input(param=allow_tz_mismatch, param_name="allow_tz_mismatch")
11883
+
11884
+ # Validate and parse the max_age parameter
11885
+ max_age_td = _parse_max_age(max_age)
11886
+
11887
+ # Validate the column parameter
11888
+ if not isinstance(column, str):
11889
+ raise TypeError(
11890
+ f"The `column` parameter must be a string, got {type(column).__name__}."
11891
+ )
11892
+
11893
+ # Validate the timezone parameter if provided
11894
+ if timezone is not None:
11895
+ _validate_timezone(timezone)
11896
+
11897
+ # Parse reference_time if it's a string
11898
+ parsed_reference_time = None
11899
+ if reference_time is not None:
11900
+ if isinstance(reference_time, str):
11901
+ parsed_reference_time = _parse_reference_time(reference_time)
11902
+ elif isinstance(reference_time, datetime.datetime):
11903
+ parsed_reference_time = reference_time
11904
+ else:
11905
+ raise TypeError(
11906
+ f"The `reference_time` parameter must be a string or datetime object, "
11907
+ f"got {type(reference_time).__name__}."
11908
+ )
11909
+
11910
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
11911
+ thresholds = (
11912
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
11913
+ )
11914
+
11915
+ # Package up the parameters for later interrogation
11916
+ values = {
11917
+ "max_age": max_age_td,
11918
+ "max_age_str": max_age if isinstance(max_age, str) else str(max_age),
11919
+ "reference_time": parsed_reference_time,
11920
+ "timezone": timezone,
11921
+ "allow_tz_mismatch": allow_tz_mismatch,
11922
+ }
11923
+
11924
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
11925
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
11926
+
11927
+ val_info = _ValidationInfo(
11928
+ assertion_type=assertion_type,
11929
+ column=column,
11930
+ values=values,
11931
+ pre=pre,
11932
+ thresholds=thresholds,
11933
+ actions=actions,
11934
+ brief=brief,
11935
+ active=active,
11936
+ )
11937
+
11938
+ self._add_validation(validation_info=val_info)
11939
+
11940
+ return self
11941
+
11389
11942
  def col_count_match(
11390
11943
  self,
11391
- count: int | FrameT | Any,
11944
+ count: int | Any,
11392
11945
  inverse: bool = False,
11393
11946
  pre: Callable | None = None,
11394
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
11947
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11395
11948
  actions: Actions | None = None,
11396
11949
  brief: str | bool | None = None,
11397
11950
  active: bool = True,
@@ -11564,9 +12117,9 @@ class Validate:
11564
12117
 
11565
12118
  def tbl_match(
11566
12119
  self,
11567
- tbl_compare: FrameT | Any,
12120
+ tbl_compare: Any,
11568
12121
  pre: Callable | None = None,
11569
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
12122
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11570
12123
  actions: Actions | None = None,
11571
12124
  brief: str | bool | None = None,
11572
12125
  active: bool = True,
@@ -11835,7 +12388,7 @@ class Validate:
11835
12388
  self,
11836
12389
  *exprs: Callable,
11837
12390
  pre: Callable | None = None,
11838
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
12391
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11839
12392
  actions: Actions | None = None,
11840
12393
  brief: str | bool | None = None,
11841
12394
  active: bool = True,
@@ -12083,7 +12636,7 @@ class Validate:
12083
12636
  self,
12084
12637
  expr: Callable,
12085
12638
  pre: Callable | None = None,
12086
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
12639
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
12087
12640
  actions: Actions | None = None,
12088
12641
  brief: str | bool | None = None,
12089
12642
  active: bool = True,
@@ -12577,7 +13130,7 @@ class Validate:
12577
13130
  segment = validation.segments
12578
13131
 
12579
13132
  # Get compatible data types for this assertion type
12580
- assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
13133
+ assertion_method = ASSERTION_TYPE_METHOD_MAP.get(assertion_type, assertion_type)
12581
13134
  compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
12582
13135
 
12583
13136
  # Process the `brief` text for the validation step by including template variables to
@@ -12632,7 +13185,11 @@ class Validate:
12632
13185
 
12633
13186
  # Make a deep copy of the table for this step to ensure proper isolation
12634
13187
  # This prevents modifications from one validation step affecting others
12635
- data_tbl_step = _copy_dataframe(data_tbl)
13188
+ try:
13189
+ # TODO: This copying should be scrutinized further
13190
+ data_tbl_step: IntoDataFrame = _copy_dataframe(data_tbl)
13191
+ except Exception as e: # pragma: no cover
13192
+ data_tbl_step: IntoDataFrame = data_tbl # pragma: no cover
12636
13193
 
12637
13194
  # Capture original table dimensions and columns before preprocessing
12638
13195
  # (only if preprocessing is present - we'll set these inside the preprocessing block)
@@ -12793,6 +13350,8 @@ class Validate:
12793
13350
  "col_schema_match",
12794
13351
  "row_count_match",
12795
13352
  "col_count_match",
13353
+ "data_freshness",
13354
+ "tbl_match",
12796
13355
  ]
12797
13356
 
12798
13357
  if validation.n == 0 and assertion_type not in table_level_assertions:
@@ -13053,8 +13612,107 @@ class Validate:
13053
13612
 
13054
13613
  results_tbl = None
13055
13614
 
13056
- elif assertion_type == "tbl_match":
13057
- from pointblank._interrogation import tbl_match
13615
+ elif assertion_type == "data_freshness":
13616
+ from pointblank._interrogation import data_freshness as data_freshness_check
13617
+
13618
+ freshness_result = data_freshness_check(
13619
+ data_tbl=data_tbl_step,
13620
+ column=column,
13621
+ max_age=value["max_age"],
13622
+ reference_time=value["reference_time"],
13623
+ timezone=value["timezone"],
13624
+ allow_tz_mismatch=value["allow_tz_mismatch"],
13625
+ )
13626
+
13627
+ result_bool = freshness_result["passed"]
13628
+ validation.all_passed = result_bool
13629
+ validation.n = 1
13630
+ validation.n_passed = int(result_bool)
13631
+ validation.n_failed = 1 - int(result_bool)
13632
+
13633
+ # Store the freshness check details for reporting
13634
+ validation.val_info = freshness_result
13635
+
13636
+ # Update the values dict with actual computed values for failure text
13637
+ if freshness_result.get("age") is not None:
13638
+ value["age"] = freshness_result["age"]
13639
+
13640
+ # Add timezone warning note if applicable
13641
+ if freshness_result.get("tz_warning_key"):
13642
+ tz_key = freshness_result["tz_warning_key"]
13643
+ tz_warning_text = NOTES_TEXT.get(tz_key, {}).get(
13644
+ self.locale, NOTES_TEXT.get(tz_key, {}).get("en", "")
13645
+ )
13646
+ validation._add_note(
13647
+ key="tz_warning",
13648
+ markdown=f"⚠️ {tz_warning_text}",
13649
+ text=tz_warning_text,
13650
+ )
13651
+
13652
+ # Add note about column being empty if applicable
13653
+ if freshness_result.get("column_empty"):
13654
+ column_empty_text = NOTES_TEXT.get(
13655
+ "data_freshness_column_empty", {}
13656
+ ).get(
13657
+ self.locale,
13658
+ NOTES_TEXT.get("data_freshness_column_empty", {}).get(
13659
+ "en", "The datetime column is empty (no values to check)."
13660
+ ),
13661
+ )
13662
+ validation._add_note(
13663
+ key="column_empty",
13664
+ markdown=f"⚠️ {column_empty_text}",
13665
+ text=column_empty_text,
13666
+ )
13667
+
13668
+ # Add informational note about the freshness check
13669
+ if freshness_result.get("max_datetime") and freshness_result.get("age"):
13670
+ max_dt = freshness_result["max_datetime"]
13671
+ # Format datetime without microseconds for cleaner display
13672
+ if hasattr(max_dt, "replace"):
13673
+ max_dt_display = max_dt.replace(microsecond=0)
13674
+ else:
13675
+ max_dt_display = max_dt
13676
+ age = freshness_result["age"]
13677
+ age_str = _format_timedelta(age)
13678
+ max_age_str = _format_timedelta(value["max_age"])
13679
+
13680
+ # Get translated template for pass/fail
13681
+ if result_bool:
13682
+ details_key = "data_freshness_details_pass"
13683
+ prefix = "✓"
13684
+ else:
13685
+ details_key = "data_freshness_details_fail"
13686
+ prefix = "✗"
13687
+
13688
+ details_template = NOTES_TEXT.get(details_key, {}).get(
13689
+ self.locale,
13690
+ NOTES_TEXT.get(details_key, {}).get(
13691
+ "en",
13692
+ "Most recent data: `{max_dt}` (age: {age}, max allowed: {max_age})",
13693
+ ),
13694
+ )
13695
+
13696
+ # Format the template with values
13697
+ note_text = details_template.format(
13698
+ max_dt=max_dt_display, age=age_str, max_age=max_age_str
13699
+ )
13700
+ # For markdown, make the age bold
13701
+ note_md_template = details_template.replace(
13702
+ "(age: {age}", "(age: **{age}**"
13703
+ )
13704
+ note_md = f"{prefix} {note_md_template.format(max_dt=max_dt_display, age=age_str, max_age=max_age_str)}"
13705
+
13706
+ validation._add_note(
13707
+ key="freshness_details",
13708
+ markdown=note_md,
13709
+ text=note_text,
13710
+ )
13711
+
13712
+ results_tbl = None
13713
+
13714
+ elif assertion_type == "tbl_match":
13715
+ from pointblank._interrogation import tbl_match
13058
13716
 
13059
13717
  # Get the comparison table (could be callable or actual table)
13060
13718
  tbl_compare = value["tbl_compare"]
@@ -13080,6 +13738,53 @@ class Validate:
13080
13738
  tbl_type=tbl_type,
13081
13739
  )
13082
13740
 
13741
+ elif is_valid_agg(assertion_type):
13742
+ agg, comp = resolve_agg_registries(assertion_type)
13743
+
13744
+ # Produce a 1-column Narwhals DataFrame
13745
+ # TODO: Should be able to take lazy too
13746
+ vec: nw.DataFrame = nw.from_native(data_tbl_step).select(column)
13747
+ real = agg(vec)
13748
+
13749
+ raw_value = value["value"]
13750
+ tol = value["tol"]
13751
+
13752
+ # Handle ReferenceColumn: compute target from reference data
13753
+ if isinstance(raw_value, ReferenceColumn):
13754
+ if self.reference is None:
13755
+ raise ValueError(
13756
+ f"Cannot use ref('{raw_value.column_name}') without "
13757
+ "setting reference data on the Validate object. "
13758
+ "Use Validate(data=..., reference=...) to set reference data."
13759
+ )
13760
+ ref_vec: nw.DataFrame = nw.from_native(self.reference).select(
13761
+ raw_value.column_name
13762
+ )
13763
+ target: float | int = agg(ref_vec)
13764
+ else:
13765
+ target = raw_value
13766
+
13767
+ lower_diff, upper_diff = _derive_bounds(target, tol)
13768
+
13769
+ lower_bound = target - lower_diff
13770
+ upper_bound = target + upper_diff
13771
+ result_bool: bool = comp(real, lower_bound, upper_bound)
13772
+
13773
+ validation.all_passed = result_bool
13774
+ validation.n = 1
13775
+ validation.n_passed = int(result_bool)
13776
+ validation.n_failed = 1 - result_bool
13777
+
13778
+ # Store computed values for step reports
13779
+ validation.val_info = {
13780
+ "actual": real,
13781
+ "target": target,
13782
+ "tol": tol,
13783
+ "lower_bound": lower_bound,
13784
+ "upper_bound": upper_bound,
13785
+ }
13786
+
13787
+ results_tbl = None
13083
13788
  else:
13084
13789
  raise ValueError(
13085
13790
  f"Unknown assertion type: {assertion_type}"
@@ -13822,12 +14527,14 @@ class Validate:
13822
14527
  )
13823
14528
 
13824
14529
  # Get the threshold status using the appropriate method
14530
+ # Note: scalar=False (default) always returns a dict
14531
+ status: dict[int, bool]
13825
14532
  if level == "warning":
13826
- status = self.warning(i=i)
14533
+ status = self.warning(i=i) # type: ignore[assignment]
13827
14534
  elif level == "error":
13828
- status = self.error(i=i)
13829
- elif level == "critical":
13830
- status = self.critical(i=i)
14535
+ status = self.error(i=i) # type: ignore[assignment]
14536
+ else: # level == "critical"
14537
+ status = self.critical(i=i) # type: ignore[assignment]
13831
14538
 
13832
14539
  # Find any steps that exceeded the threshold
13833
14540
  failures = []
@@ -13981,12 +14688,14 @@ class Validate:
13981
14688
  )
13982
14689
 
13983
14690
  # Get the threshold status using the appropriate method
14691
+ # Note: scalar=False (default) always returns a dict
14692
+ status: dict[int, bool]
13984
14693
  if level == "warning":
13985
- status = self.warning(i=i)
14694
+ status = self.warning(i=i) # type: ignore[assignment]
13986
14695
  elif level == "error":
13987
- status = self.error(i=i)
13988
- elif level == "critical":
13989
- status = self.critical(i=i)
14696
+ status = self.error(i=i) # type: ignore[assignment]
14697
+ else: # level == "critical"
14698
+ status = self.critical(i=i) # type: ignore[assignment]
13990
14699
 
13991
14700
  # Return True if any steps exceeded the threshold
13992
14701
  return any(status.values())
@@ -14759,7 +15468,7 @@ class Validate:
14759
15468
 
14760
15469
  def get_data_extracts(
14761
15470
  self, i: int | list[int] | None = None, frame: bool = False
14762
- ) -> dict[int, FrameT | None] | FrameT | None:
15471
+ ) -> dict[int, Any] | Any:
14763
15472
  """
14764
15473
  Get the rows that failed for each validation step.
14765
15474
 
@@ -14782,7 +15491,7 @@ class Validate:
14782
15491
 
14783
15492
  Returns
14784
15493
  -------
14785
- dict[int, FrameT | None] | FrameT | None
15494
+ dict[int, Any] | Any
14786
15495
  A dictionary of tables containing the rows that failed in every compatible validation
14787
15496
  step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
14788
15497
 
@@ -15072,7 +15781,7 @@ class Validate:
15072
15781
 
15073
15782
  return json.dumps(report, indent=4, default=str)
15074
15783
 
15075
- def get_sundered_data(self, type="pass") -> FrameT:
15784
+ def get_sundered_data(self, type="pass") -> Any:
15076
15785
  """
15077
15786
  Get the data that passed or failed the validation steps.
15078
15787
 
@@ -15108,7 +15817,7 @@ class Validate:
15108
15817
 
15109
15818
  Returns
15110
15819
  -------
15111
- FrameT
15820
+ Any
15112
15821
  A table containing the data that passed or failed the validation steps.
15113
15822
 
15114
15823
  Examples
@@ -15200,6 +15909,7 @@ class Validate:
15200
15909
  # Get all validation step result tables and join together the `pb_is_good_` columns
15201
15910
  # ensuring that the columns are named uniquely (e.g., `pb_is_good_1`, `pb_is_good_2`, ...)
15202
15911
  # and that the index is reset
15912
+ labeled_tbl_nw: nw.DataFrame | nw.LazyFrame | None = None
15203
15913
  for i, validation in enumerate(validation_info):
15204
15914
  results_tbl = nw.from_native(validation.tbl_checked)
15205
15915
 
@@ -15220,7 +15930,7 @@ class Validate:
15220
15930
  )
15221
15931
 
15222
15932
  # Add the results table to the list of tables
15223
- if i == 0:
15933
+ if labeled_tbl_nw is None:
15224
15934
  labeled_tbl_nw = results_tbl
15225
15935
  else:
15226
15936
  labeled_tbl_nw = labeled_tbl_nw.join(results_tbl, on=index_name, how="left")
@@ -15396,10 +16106,10 @@ class Validate:
15396
16106
  def get_tabular_report(
15397
16107
  self,
15398
16108
  title: str | None = ":default:",
15399
- incl_header: bool = None,
15400
- incl_footer: bool = None,
15401
- incl_footer_timings: bool = None,
15402
- incl_footer_notes: bool = None,
16109
+ incl_header: bool | None = None,
16110
+ incl_footer: bool | None = None,
16111
+ incl_footer_timings: bool | None = None,
16112
+ incl_footer_notes: bool | None = None,
15403
16113
  ) -> GT:
15404
16114
  """
15405
16115
  Validation report as a GT table.
@@ -15767,10 +16477,16 @@ class Validate:
15767
16477
  elif assertion_type[i] in ["conjointly", "specially"]:
15768
16478
  column_text = ""
15769
16479
  else:
15770
- column_text = str(column)
16480
+ # Handle both string columns and list columns
16481
+ # For single-element lists like ['a'], display as 'a'
16482
+ # For multi-element lists, display as comma-separated values
16483
+ if isinstance(column, list):
16484
+ column_text = ", ".join(str(c) for c in column)
16485
+ else:
16486
+ column_text = str(column)
15771
16487
 
15772
- # Apply underline styling for synthetic columns (using the purple color from the icon)
15773
- # Only apply styling if column_text is not empty and not a special marker
16488
+ # Apply underline styling for synthetic columns; only apply styling if column_text is
16489
+ # not empty and not a special marker
15774
16490
  if (
15775
16491
  has_synthetic_column
15776
16492
  and column_text
@@ -15848,6 +16564,69 @@ class Validate:
15848
16564
  tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
15849
16565
  values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
15850
16566
 
16567
+ elif assertion_type[i] in ["data_freshness"]:
16568
+ # Format max_age nicely for display
16569
+ max_age = value.get("max_age")
16570
+ max_age_str = _format_timedelta(max_age) if max_age else "&mdash;"
16571
+
16572
+ # Build additional lines with non-default parameters
16573
+ extra_lines = []
16574
+
16575
+ if value.get("reference_time") is not None:
16576
+ ref_time = value["reference_time"]
16577
+
16578
+ # Format datetime across two lines: date and time+tz
16579
+ if hasattr(ref_time, "strftime"):
16580
+ date_str = ref_time.strftime("@%Y-%m-%d")
16581
+ time_str = " " + ref_time.strftime("%H:%M:%S")
16582
+
16583
+ # Add timezone offset if present
16584
+ if hasattr(ref_time, "tzinfo") and ref_time.tzinfo is not None:
16585
+ tz_offset = ref_time.strftime("%z")
16586
+ if tz_offset:
16587
+ time_str += tz_offset
16588
+ extra_lines.append(date_str)
16589
+ extra_lines.append(time_str)
16590
+ else:
16591
+ extra_lines.append(f"@{ref_time}")
16592
+
16593
+ # Timezone and allow_tz_mismatch on same line
16594
+ tz_line_parts = []
16595
+ if value.get("timezone") is not None:
16596
+ # Convert timezone name to ISO 8601 offset format
16597
+ tz_name = value["timezone"]
16598
+
16599
+ try:
16600
+ tz_obj = ZoneInfo(tz_name)
16601
+
16602
+ # Get the current offset for this timezone
16603
+ now = datetime.datetime.now(tz_obj)
16604
+ offset = now.strftime("%z")
16605
+
16606
+ # Format as ISO 8601 extended: -07:00 (insert colon)
16607
+ if len(offset) == 5:
16608
+ tz_display = f"{offset[:3]}:{offset[3:]}"
16609
+ else:
16610
+ tz_display = offset
16611
+
16612
+ except Exception:
16613
+ tz_display = tz_name
16614
+ tz_line_parts.append(tz_display)
16615
+
16616
+ if value.get("allow_tz_mismatch"):
16617
+ tz_line_parts.append("~tz")
16618
+
16619
+ if tz_line_parts:
16620
+ extra_lines.append(" ".join(tz_line_parts))
16621
+
16622
+ if extra_lines:
16623
+ extra_html = "<br/>".join(extra_lines)
16624
+ values_upd.append(
16625
+ f'{max_age_str}<br/><span style="font-size: 9px;">{extra_html}</span>'
16626
+ )
16627
+ else:
16628
+ values_upd.append(max_age_str)
16629
+
15851
16630
  elif assertion_type[i] in ["col_schema_match"]:
15852
16631
  values_upd.append("SCHEMA")
15853
16632
 
@@ -15889,6 +16668,32 @@ class Validate:
15889
16668
  else: # pragma: no cover
15890
16669
  values_upd.append(str(value)) # pragma: no cover
15891
16670
 
16671
+ # Handle aggregation methods (col_sum_gt, col_avg_eq, etc.)
16672
+ elif is_valid_agg(assertion_type[i]):
16673
+ # Extract the value and tolerance from the values dict
16674
+ agg_value = value.get("value")
16675
+ tol_value = value.get("tol", 0)
16676
+
16677
+ # Format the value (could be a number, Column, or ReferenceColumn)
16678
+ if hasattr(agg_value, "__repr__"):
16679
+ # For Column or ReferenceColumn objects, use their repr
16680
+ value_str = repr(agg_value)
16681
+ else:
16682
+ value_str = str(agg_value)
16683
+
16684
+ # Format tolerance - only show on second line if non-zero
16685
+ if tol_value != 0:
16686
+ # Format tolerance based on its type
16687
+ if isinstance(tol_value, tuple):
16688
+ # Asymmetric bounds: (lower, upper)
16689
+ tol_str = f"tol=({tol_value[0]}, {tol_value[1]})"
16690
+ else:
16691
+ # Symmetric tolerance
16692
+ tol_str = f"tol={tol_value}"
16693
+ values_upd.append(f"{value_str}<br/>{tol_str}")
16694
+ else:
16695
+ values_upd.append(value_str)
16696
+
15892
16697
  # If the assertion type is not recognized, add the value as a string
15893
16698
  else: # pragma: no cover
15894
16699
  values_upd.append(str(value)) # pragma: no cover
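The report-cell formatting above renders an IANA timezone name as an ISO 8601 offset (via `ZoneInfo`, `strftime("%z")`, and a colon insertion). A standalone sketch of that conversion; the original uses the current time, while this uses a fixed January date so the standard-time offset is deterministic:

```python
import datetime
from zoneinfo import ZoneInfo

tz_obj = ZoneInfo("America/New_York")
offset = datetime.datetime(2024, 1, 15, tzinfo=tz_obj).strftime("%z")  # "-0500" (EST)
tz_display = f"{offset[:3]}:{offset[3:]}" if len(offset) == 5 else offset
assert tz_display == "-05:00"
```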
@@ -16327,6 +17132,15 @@ class Validate:
16327
17132
  if incl_footer_timings:
16328
17133
  gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
16329
17134
 
17135
+ # Add governance metadata as source note if any metadata is present
17136
+ governance_html = _create_governance_metadata_html(
17137
+ owner=self.owner,
17138
+ consumers=self.consumers,
17139
+ version=self.version,
17140
+ )
17141
+ if governance_html:
17142
+ gt_tbl = gt_tbl.tab_source_note(source_note=html(governance_html))
17143
+
16330
17144
  # Create notes markdown from validation steps and add as separate source note if enabled
16331
17145
  if incl_footer_notes:
16332
17146
  notes_markdown = _create_notes_html(self.validation_info)
@@ -16675,6 +17489,18 @@ class Validate:
16675
17489
  debug_return_df=debug_return_df,
16676
17490
  )
16677
17491
 
17492
+ elif is_valid_agg(assertion_type):
17493
+ step_report = _step_report_aggregate(
17494
+ assertion_type=assertion_type,
17495
+ i=i,
17496
+ column=column,
17497
+ values=values,
17498
+ all_passed=all_passed,
17499
+ val_info=val_info,
17500
+ header=header,
17501
+ lang=lang,
17502
+ )
17503
+
16678
17504
  else:
16679
17505
  step_report = None # pragma: no cover
16680
17506
 
@@ -16738,7 +17564,7 @@ class Validate:
16738
17564
  table = validation.pre(self.data)
16739
17565
 
16740
17566
  # Get the columns from the table as a list
16741
- columns = list(table.columns)
17567
+ columns = list(table.columns) # type: ignore[union-attr]
16742
17568
 
16743
17569
  # Evaluate the column expression
16744
17570
  if isinstance(column_expr, ColumnSelectorNarwhals):
@@ -17116,7 +17942,7 @@ def _convert_string_to_datetime(value: str) -> datetime.datetime:
17116
17942
  return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
17117
17943
 
17118
17944
 
17119
- def _string_date_dttm_conversion(value: any) -> any:
17945
+ def _string_date_dttm_conversion(value: Any) -> Any:
17120
17946
  """
17121
17947
  Convert a string to a date or datetime object if it is in the correct format.
17122
17948
  If the value is not a string, it is returned as is.
@@ -17151,8 +17977,8 @@ def _string_date_dttm_conversion(value: any) -> any:
17151
17977
 
17152
17978
 
17153
17979
  def _conditional_string_date_dttm_conversion(
17154
- value: any, allow_regular_strings: bool = False
17155
- ) -> any:
17980
+ value: Any, allow_regular_strings: bool = False
17981
+ ) -> Any:
17156
17982
  """
17157
17983
  Conditionally convert a string to a date or datetime object if it is in the correct format. If
17158
17984
  `allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
@@ -17196,9 +18022,9 @@ def _process_brief(
17196
18022
  brief: str | None,
17197
18023
  step: int,
17198
18024
  col: str | list[str] | None,
17199
- values: any | None,
17200
- thresholds: any | None,
17201
- segment: any | None,
18025
+ values: Any | None,
18026
+ thresholds: Any | None,
18027
+ segment: Any | None,
17202
18028
  ) -> str:
17203
18029
  # If there is no brief, return `None`
17204
18030
  if brief is None:
@@ -17271,6 +18097,265 @@ def _process_brief(
17271
18097
  return brief
17272
18098
 
17273
18099
 
18100
+ def _parse_max_age(max_age: str | datetime.timedelta) -> datetime.timedelta:
18101
+ """
18102
+ Parse a max_age specification into a timedelta.
18103
+
18104
+ Parameters
18105
+ ----------
18106
+ max_age
18107
+ Either a timedelta object or a string like "24 hours", "1 day", "30 minutes",
18108
+ or compound expressions like "2 hours 15 minutes", "1 day 6 hours", etc.
18109
+
18110
+ Returns
18111
+ -------
18112
+ datetime.timedelta
18113
+ The parsed timedelta.
18114
+
18115
+ Raises
18116
+ ------
18117
+ ValueError
18118
+ If the string format is invalid or the unit is not recognized.
18119
+ """
18120
+ if isinstance(max_age, datetime.timedelta):
18121
+ return max_age
18122
+
18123
+ if not isinstance(max_age, str):
18124
+ raise TypeError(
18125
+ f"The `max_age` parameter must be a string or timedelta, got {type(max_age).__name__}."
18126
+ )
18127
+
18128
+ # Parse string format like "24 hours", "1 day", "30 minutes", etc.
18129
+ max_age_str = max_age.strip().lower()
18130
+
18131
+ # Define unit mappings (singular and plural forms)
18132
+ unit_mappings = {
18133
+ "second": "seconds",
18134
+ "seconds": "seconds",
18135
+ "sec": "seconds",
18136
+ "secs": "seconds",
18137
+ "s": "seconds",
18138
+ "minute": "minutes",
18139
+ "minutes": "minutes",
18140
+ "min": "minutes",
18141
+ "mins": "minutes",
18142
+ "m": "minutes",
18143
+ "hour": "hours",
18144
+ "hours": "hours",
18145
+ "hr": "hours",
18146
+ "hrs": "hours",
18147
+ "h": "hours",
18148
+ "day": "days",
18149
+ "days": "days",
18150
+ "d": "days",
18151
+ "week": "weeks",
18152
+ "weeks": "weeks",
18153
+ "wk": "weeks",
18154
+ "wks": "weeks",
18155
+ "w": "weeks",
18156
+ }
18157
+
18158
+ import re
18159
+
18160
+ # Pattern to find all number+unit pairs (supports compound expressions)
18161
+ # Matches: "2 hours 15 minutes", "1day6h", "30 min", etc.
18162
+ compound_pattern = r"(\d+(?:\.\d+)?)\s*([a-zA-Z]+)"
18163
+ matches = re.findall(compound_pattern, max_age_str)
18164
+
18165
+ if not matches:
18166
+ raise ValueError(
18167
+ f"Invalid max_age format: '{max_age}'. Expected format like '24 hours', "
18168
+ f"'1 day', '30 minutes', '2 hours 15 minutes', etc."
18169
+ )
18170
+
18171
+ # Accumulate timedelta from all matched components
18172
+ total_td = datetime.timedelta()
18173
+ valid_units = ["seconds", "minutes", "hours", "days", "weeks"]
18174
+
18175
+ for value_str, unit in matches:
18176
+ value = float(value_str)
18177
+
18178
+ # Normalize the unit
18179
+ unit_lower = unit.lower()
18180
+ if unit_lower not in unit_mappings:
18181
+ raise ValueError(
18182
+ f"Unknown time unit '{unit}' in max_age '{max_age}'. "
18183
+ f"Valid units are: {', '.join(valid_units)} (or their abbreviations)."
18184
+ )
18185
+
18186
+ normalized_unit = unit_mappings[unit_lower]
18187
+
18188
+ # Add to total timedelta
18189
+ if normalized_unit == "seconds":
18190
+ total_td += datetime.timedelta(seconds=value)
18191
+ elif normalized_unit == "minutes":
18192
+ total_td += datetime.timedelta(minutes=value)
18193
+ elif normalized_unit == "hours":
18194
+ total_td += datetime.timedelta(hours=value)
18195
+ elif normalized_unit == "days":
18196
+ total_td += datetime.timedelta(days=value)
18197
+ elif normalized_unit == "weeks":
18198
+ total_td += datetime.timedelta(weeks=value)
18199
+
18200
+ return total_td
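A few illustrative expectations for the parser above (not part of the package's test suite), covering simple, compound, and pass-through inputs:

```python
import datetime

assert _parse_max_age("24 hours") == datetime.timedelta(hours=24)
assert _parse_max_age("2 hours 15 minutes") == datetime.timedelta(hours=2, minutes=15)
assert _parse_max_age("1 day 6 hours") == datetime.timedelta(days=1, hours=6)
# A timedelta passes through unchanged
assert _parse_max_age(datetime.timedelta(minutes=30)) == datetime.timedelta(minutes=30)
```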
18201
+
18202
+
18203
+ def _parse_timezone(timezone: str) -> datetime.tzinfo:
18204
+ """
18205
+ Parse a timezone string into a tzinfo object.
18206
+
18207
+ Supports:
18208
+ - IANA timezone names: "America/New_York", "Europe/London", "UTC"
18209
+ - Offset strings: "-7", "+5", "-07:00", "+05:30"
18210
+
18211
+ Parameters
18212
+ ----------
18213
+ timezone
18214
+ The timezone string to parse.
18215
+
18216
+ Returns
18217
+ -------
18218
+ datetime.tzinfo
18219
+ The parsed timezone object.
18220
+
18221
+ Raises
18222
+ ------
18223
+ ValueError
18224
+ If the timezone is not valid.
18225
+ """
18226
+ import re
18227
+
18228
+ # Check for offset formats: "-7", "+5", "-07:00", "+05:30", etc.
18229
+ # Match: optional sign, 1-2 digits, optional colon and 2 more digits
18230
+ offset_pattern = r"^([+-]?)(\d{1,2})(?::(\d{2}))?$"
18231
+ match = re.match(offset_pattern, timezone.strip())
18232
+
18233
+ if match:
18234
+ sign_str, hours_str, minutes_str = match.groups()
18235
+ hours = int(hours_str)
18236
+ minutes = int(minutes_str) if minutes_str else 0
18237
+
18238
+ # Apply sign (default positive if not specified)
18239
+ total_minutes = hours * 60 + minutes
18240
+ if sign_str == "-":
18241
+ total_minutes = -total_minutes
18242
+
18243
+ return datetime.timezone(datetime.timedelta(minutes=total_minutes))
18244
+
18245
+ # Try IANA timezone names (zoneinfo is standard in Python 3.9+)
18246
+ try:
18247
+ return ZoneInfo(timezone)
18248
+ except KeyError:
18249
+ pass
18250
+
18251
+ raise ValueError(
18252
+ f"Invalid timezone: '{timezone}'. Use an IANA timezone name "
18253
+ f"(e.g., 'America/New_York', 'UTC') or an offset (e.g., '-7', '+05:30')."
18254
+ )
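Illustrative expectations for the two branches above, one for offset strings and one for IANA names:

```python
import datetime

assert _parse_timezone("-7") == datetime.timezone(datetime.timedelta(hours=-7))
assert _parse_timezone("+05:30") == datetime.timezone(datetime.timedelta(hours=5, minutes=30))
# IANA names fall through to zoneinfo; str() of a ZoneInfo is its key
assert str(_parse_timezone("America/New_York")) == "America/New_York"
```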
18255
+
18256
+
18257
+ def _validate_timezone(timezone: str) -> None:
18258
+ """
18259
+ Validate that a timezone string is valid.
18260
+
18261
+ Parameters
18262
+ ----------
18263
+ timezone
18264
+ The timezone string to validate.
18265
+
18266
+ Raises
18267
+ ------
18268
+ ValueError
18269
+ If the timezone is not valid.
18270
+ """
18271
+ # Use _parse_timezone to validate - it will raise ValueError if invalid
18272
+ _parse_timezone(timezone)
18273
+
18274
+
18275
+ def _parse_reference_time(reference_time: str) -> datetime.datetime:
18276
+ """
18277
+ Parse a reference time string into a datetime object.
18278
+
18279
+ Parameters
18280
+ ----------
18281
+ reference_time
18282
+ An ISO 8601 formatted datetime string.
18283
+
18284
+ Returns
18285
+ -------
18286
+ datetime.datetime
18287
+ The parsed datetime object.
18288
+
18289
+ Raises
18290
+ ------
18291
+ ValueError
18292
+ If the string cannot be parsed.
18293
+ """
18294
+ # Try parsing with fromisoformat (handles most ISO 8601 formats)
18295
+ try:
18296
+ return datetime.datetime.fromisoformat(reference_time)
18297
+ except ValueError:
18298
+ pass
18299
+
18300
+ # Try parsing common formats
18301
+ formats = [
18302
+ "%Y-%m-%d %H:%M:%S",
18303
+ "%Y-%m-%d %H:%M:%S%z",
18304
+ "%Y-%m-%dT%H:%M:%S",
18305
+ "%Y-%m-%dT%H:%M:%S%z",
18306
+ "%Y-%m-%d",
18307
+ ]
18308
+
18309
+ for fmt in formats:
18310
+ try:
18311
+ return datetime.datetime.strptime(reference_time, fmt)
18312
+ except ValueError:
18313
+ continue
18314
+
18315
+ raise ValueError(
18316
+ f"Could not parse reference_time '{reference_time}'. "
18317
+ f"Please use ISO 8601 format like '2024-01-15T10:30:00' or '2024-01-15T10:30:00+00:00'."
18318
+ )
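Illustrative expectations for the ISO 8601 parsing above; the offset-bearing form keeps its timezone, and a bare date parses to midnight with no timezone attached:

```python
import datetime

dt = _parse_reference_time("2024-01-15T12:00:00+00:00")
assert dt.tzinfo is not None and dt.hour == 12

assert _parse_reference_time("2024-01-15") == datetime.datetime(2024, 1, 15)
```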
18319
+
18320
+
18321
+ def _format_timedelta(td: datetime.timedelta) -> str:
18322
+ """
18323
+ Format a timedelta into a human-readable string.
18324
+
18325
+ Parameters
18326
+ ----------
18327
+ td
18328
+ The timedelta to format.
18329
+
18330
+ Returns
18331
+ -------
18332
+ str
18333
+ A compact, human-readable string such as "45.0s", "1.5h", "2d 5.0h", or "3w".
18334
+ """
18335
+ total_seconds = td.total_seconds()
18336
+
18337
+ if total_seconds < 60:
18338
+ val = round(total_seconds, 1)
18339
+ return f"{val}s"
18340
+ elif total_seconds < 3600:
18341
+ val = round(total_seconds / 60, 1)
18342
+ return f"{val}m"
18343
+ elif total_seconds < 86400:
18344
+ val = round(total_seconds / 3600, 1)
18345
+ return f"{val}h"
18346
+ elif total_seconds < 604800:
18347
+ # For days, show "xd yh" format for better readability
18348
+ days = int(total_seconds // 86400)
18349
+ remaining_hours = round((total_seconds % 86400) / 3600, 1)
18350
+ if remaining_hours == 0:
18351
+ return f"{days}d"
18352
+ else:
18353
+ return f"{days}d {remaining_hours}h"
18354
+ else:
18355
+ val = round(total_seconds / 604800)
18356
+ return f"{val}w"
18357
+
18358
+
17274
18359
  def _transform_auto_brief(brief: str | bool | None) -> str | None:
17275
18360
  if isinstance(brief, bool):
17276
18361
  if brief:
@@ -17285,7 +18370,7 @@ def _process_action_str(
17285
18370
  action_str: str,
17286
18371
  step: int,
17287
18372
  col: str | None,
17288
- value: any,
18373
+ value: Any,
17289
18374
  type: str,
17290
18375
  level: str,
17291
18376
  time: str,
@@ -17337,8 +18422,8 @@ def _process_action_str(
17337
18422
  def _create_autobrief_or_failure_text(
17338
18423
  assertion_type: str,
17339
18424
  lang: str,
17340
- column: str | None,
17341
- values: str | None,
18425
+ column: str,
18426
+ values: Any,
17342
18427
  for_failure: bool,
17343
18428
  locale: str | None = None,
17344
18429
  n_rows: int | None = None,
@@ -17465,6 +18550,14 @@ def _create_autobrief_or_failure_text(
17465
18550
  for_failure=for_failure,
17466
18551
  )
17467
18552
 
18553
+ if assertion_type == "data_freshness":
18554
+ return _create_text_data_freshness(
18555
+ lang=lang,
18556
+ column=column,
18557
+ value=values,
18558
+ for_failure=for_failure,
18559
+ )
18560
+
17468
18561
  if assertion_type == "col_pct_null":
17469
18562
  return _create_text_col_pct_null(
17470
18563
  lang=lang,
@@ -17490,7 +18583,7 @@ def _create_autobrief_or_failure_text(
17490
18583
  for_failure=for_failure,
17491
18584
  )
17492
18585
 
17493
- return None # pragma: no cover
18586
+ return None
17494
18587
 
17495
18588
 
17496
18589
  def _expect_failure_type(for_failure: bool) -> str:
@@ -17500,7 +18593,7 @@ def _expect_failure_type(for_failure: bool) -> str:
17500
18593
  def _create_text_comparison(
17501
18594
  assertion_type: str,
17502
18595
  lang: str,
17503
- column: str | list[str] | None,
18596
+ column: str | list[str],
17504
18597
  values: str | None,
17505
18598
  for_failure: bool = False,
17506
18599
  ) -> str:
@@ -17526,7 +18619,7 @@ def _create_text_comparison(
17526
18619
 
17527
18620
  def _create_text_between(
17528
18621
  lang: str,
17529
- column: str | None,
18622
+ column: str,
17530
18623
  value_1: str,
17531
18624
  value_2: str,
17532
18625
  not_: bool = False,
@@ -17556,7 +18649,7 @@ def _create_text_between(
17556
18649
 
17557
18650
 
17558
18651
  def _create_text_set(
17559
- lang: str, column: str | None, values: list[any], not_: bool = False, for_failure: bool = False
18652
+ lang: str, column: str, values: list[Any], not_: bool = False, for_failure: bool = False
17560
18653
  ) -> str:
17561
18654
  type_ = _expect_failure_type(for_failure=for_failure)
17562
18655
 
@@ -17578,9 +18671,7 @@ def _create_text_set(
17578
18671
  return text
17579
18672
 
17580
18673
 
17581
- def _create_text_null(
17582
- lang: str, column: str | None, not_: bool = False, for_failure: bool = False
17583
- ) -> str:
18674
+ def _create_text_null(lang: str, column: str, not_: bool = False, for_failure: bool = False) -> str:
17584
18675
  type_ = _expect_failure_type(for_failure=for_failure)
17585
18676
 
17586
18677
  column_text = _prep_column_text(column=column)
@@ -17597,9 +18688,7 @@ def _create_text_null(
17597
18688
  return text
17598
18689
 
17599
18690
 
17600
- def _create_text_regex(
17601
- lang: str, column: str | None, pattern: str | dict, for_failure: bool = False
17602
- ) -> str:
18691
+ def _create_text_regex(lang: str, column: str, pattern: str, for_failure: bool = False) -> str:
17603
18692
  type_ = _expect_failure_type(for_failure=for_failure)
17604
18693
 
17605
18694
  column_text = _prep_column_text(column=column)
@@ -17631,7 +18720,7 @@ def _create_text_expr(lang: str, for_failure: bool) -> str:
17631
18720
  return EXPECT_FAIL_TEXT[f"col_vals_expr_{type_}_text"][lang]
17632
18721
 
17633
18722
 
17634
- def _create_text_col_exists(lang: str, column: str | None, for_failure: bool = False) -> str:
18723
+ def _create_text_col_exists(lang: str, column: str, for_failure: bool = False) -> str:
17635
18724
  type_ = _expect_failure_type(for_failure=for_failure)
17636
18725
 
17637
18726
  column_text = _prep_column_text(column=column)
@@ -17681,7 +18770,7 @@ def _create_text_rows_complete(
17681
18770
  return text
17682
18771
 
17683
18772
 
17684
- def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str:
18773
+ def _create_text_row_count_match(lang: str, value: dict, for_failure: bool = False) -> str:
17685
18774
  type_ = _expect_failure_type(for_failure=for_failure)
17686
18775
 
17687
18776
  values_text = _prep_values_text(value["count"], lang=lang)
@@ -17689,7 +18778,7 @@ def _create_text_row_count_match(lang: str, value: int, for_failure: bool = Fals
17689
18778
  return EXPECT_FAIL_TEXT[f"row_count_match_n_{type_}_text"][lang].format(values_text=values_text)
17690
18779
 
17691
18780
 
17692
- def _create_text_col_count_match(lang: str, value: int, for_failure: bool = False) -> str:
18781
+ def _create_text_col_count_match(lang: str, value: dict, for_failure: bool = False) -> str:
17693
18782
  type_ = _expect_failure_type(for_failure=for_failure)
17694
18783
 
17695
18784
  values_text = _prep_values_text(value["count"], lang=lang)
@@ -17697,6 +18786,33 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
17697
18786
  return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
17698
18787
 
17699
18788
 
18789
+ def _create_text_data_freshness(
18790
+ lang: str,
18791
+ column: str | None,
18792
+ value: dict,
18793
+ for_failure: bool = False,
18794
+ ) -> str:
18795
+ """Create text for data_freshness validation."""
18796
+ type_ = _expect_failure_type(for_failure=for_failure)
18797
+
18798
+ column_text = _prep_column_text(column=column)
18799
+ max_age_text = _format_timedelta(value.get("max_age"))
18800
+
18801
+ if for_failure:
18802
+ age = value.get("age")
18803
+ age_text = _format_timedelta(age) if age else "unknown"
18804
+ return EXPECT_FAIL_TEXT[f"data_freshness_{type_}_text"][lang].format(
18805
+ column_text=column_text,
18806
+ max_age_text=max_age_text,
18807
+ age_text=age_text,
18808
+ )
18809
+ else:
18810
+ return EXPECT_FAIL_TEXT[f"data_freshness_{type_}_text"][lang].format(
18811
+ column_text=column_text,
18812
+ max_age_text=max_age_text,
18813
+ )
18814
+
18815
+
17700
18816
  def _create_text_col_pct_null(
17701
18817
  lang: str,
17702
18818
  column: str | None,
@@ -17826,19 +18942,13 @@ def _create_text_prompt(lang: str, prompt: str, for_failure: bool = False) -> st
17826
18942
  def _prep_column_text(column: str | list[str]) -> str:
17827
18943
  if isinstance(column, list):
17828
18944
  return "`" + str(column[0]) + "`"
17829
- elif isinstance(column, str):
18945
+ if isinstance(column, str):
17830
18946
  return "`" + column + "`"
17831
- else:
17832
- return ""
18947
+ raise AssertionError
17833
18948
 
17834
18949
 
17835
18950
  def _prep_values_text(
17836
- values: str
17837
- | int
17838
- | float
17839
- | datetime.datetime
17840
- | datetime.date
17841
- | list[str | int | float | datetime.datetime | datetime.date],
18951
+ values: _CompliantValue | _CompliantValues,
17842
18952
  lang: str,
17843
18953
  limit: int = 3,
17844
18954
  ) -> str:
@@ -17886,7 +18996,7 @@ def _prep_values_text(
17886
18996
  return values_str
17887
18997
 
17888
18998
 
17889
- def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> list[tuple[str, str]]:
18999
+ def _seg_expr_from_string(data_tbl: Any, segments_expr: str) -> tuple[str, str]:
17890
19000
  """
17891
19001
  Obtain the segmentation categories from a table column.
17892
19002
 
@@ -17989,7 +19099,7 @@ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, Any]]:
17989
19099
  return seg_tuples
17990
19100
 
17991
19101
 
17992
- def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
19102
+ def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any:
17993
19103
  """
17994
19104
  Apply the segments expression to the data table.
17995
19105
 
@@ -18053,8 +19163,26 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
18053
19163
  except ValueError: # pragma: no cover
18054
19164
  pass # pragma: no cover
18055
19165
 
18056
- # Format 2: Datetime strings with UTC timezone like
18057
- # "2016-01-04 00:00:01 UTC.strict_cast(...)"
19166
+ # Format 2: Direct datetime strings like "2016-01-04 00:00:01" (Polars 1.36+)
19167
+ # These don't have UTC suffix anymore
19168
+ elif (
19169
+ " " in segment_str
19170
+ and "UTC" not in segment_str
19171
+ and "[" not in segment_str
19172
+ and ".alias" not in segment_str
19173
+ ):
19174
+ try:
19175
+ parsed_dt = datetime.fromisoformat(segment_str)
19176
+ # Convert midnight datetimes to dates for consistency
19177
+ if parsed_dt.time() == datetime.min.time():
19178
+ parsed_value = parsed_dt.date() # pragma: no cover
19179
+ else:
19180
+ parsed_value = parsed_dt
19181
+ except ValueError: # pragma: no cover
19182
+ pass # pragma: no cover
19183
+
19184
+ # Format 3: Datetime strings with UTC timezone like
19185
+ # "2016-01-04 00:00:01 UTC.strict_cast(...)" (Polars < 1.36)
18058
19186
  elif " UTC" in segment_str:
18059
19187
  try:
18060
19188
  # Extract just the datetime part before "UTC"
@@ -18069,7 +19197,7 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
18069
19197
  except (ValueError, IndexError): # pragma: no cover
18070
19198
  pass # pragma: no cover
18071
19199
 
18072
- # Format 3: Bracketed expressions like ['2016-01-04']
19200
+ # Format 4: Bracketed expressions like ['2016-01-04']
18073
19201
  elif segment_str.startswith("[") and segment_str.endswith("]"):
18074
19202
  try: # pragma: no cover
18075
19203
  # Remove [' and ']
@@ -18204,8 +19332,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
18204
19332
 
18205
19333
  def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
18206
19334
  # For each icon, get the assertion icon SVG test from SVG_ICONS_FOR_ASSERTION_TYPES dictionary
18207
- # TODO: No point in using `get` if we can't handle missing keys anyways
18208
- icon_svg = [SVG_ICONS_FOR_ASSERTION_TYPES.get(icon) for icon in icon]
19335
+ icon_svg: list[str] = [SVG_ICONS_FOR_ASSERTION_TYPES[icon] for icon in icon]
18209
19336
 
18210
19337
  # Replace the width and height in the SVG string
18211
19338
  for i in range(len(icon_svg)):
@@ -18214,11 +19341,9 @@ def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
18214
19341
  return icon_svg
18215
19342
 
18216
19343
 
18217
- def _replace_svg_dimensions(svg: list[str], height_width: int | float) -> list[str]:
19344
+ def _replace_svg_dimensions(svg: str, height_width: int | float) -> str:
18218
19345
  svg = re.sub(r'width="[0-9]*?px', f'width="{height_width}px', svg)
18219
- svg = re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)
18220
-
18221
- return svg
19346
+ return re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)
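A small usage sketch for the resized-SVG helper above; the sample SVG string is hypothetical, not an icon from the package:

```python
resized = _replace_svg_dimensions('<svg width="30px" height="30px"></svg>', 20)
assert resized == '<svg width="20px" height="20px"></svg>'
```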
18222
19347
 
18223
19348
 
18224
19349
  def _get_title_text(
@@ -18282,7 +19407,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
18282
19407
  return title_text
18283
19408
 
18284
19409
 
18285
- def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
19410
+ def _transform_tbl_preprocessed(pre: Any, seg: Any, interrogation_performed: bool) -> list[str]:
18286
19411
  # If no interrogation was performed, return a list of empty strings
18287
19412
  if not interrogation_performed:
18288
19413
  return ["" for _ in range(len(pre))]
@@ -18304,9 +19429,7 @@ def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: boo
18304
19429
 
18305
19430
  def _get_preprocessed_table_icon(icon: list[str]) -> list[str]:
18306
19431
  # For each icon, get the SVG icon from the SVG_ICONS_FOR_TBL_STATUS dictionary
18307
- icon_svg = [SVG_ICONS_FOR_TBL_STATUS.get(icon) for icon in icon]
18308
-
18309
- return icon_svg
19432
+ return [SVG_ICONS_FOR_TBL_STATUS[icon] for icon in icon]
18310
19433
 
18311
19434
 
18312
19435
  def _transform_eval(
@@ -18384,9 +19507,9 @@ def _transform_test_units(
18384
19507
  return _format_single_number_with_gt(
18385
19508
  value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
18386
19509
  )
18387
- else:
18388
- # Fallback to the original behavior
18389
- return str(vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0])
19510
+ formatted = vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)
19511
+ assert isinstance(formatted, list)
19512
+ return formatted[0]
18390
19513
 
18391
19514
  return [
18392
19515
  (
@@ -18590,22 +19713,21 @@ def _transform_assertion_str(
18590
19713
  return type_upd
18591
19714
 
18592
19715
 
18593
- def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str]:
19716
+ def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str] | None:
18594
19717
  if isinstance(pre, Callable):
18595
19718
  return _get_callable_source(fn=pre)
19719
+ return None
18596
19720
 
18597
19721
 
18598
19722
  def _get_callable_source(fn: Callable) -> str:
18599
- if isinstance(fn, Callable):
18600
- try:
18601
- source_lines, _ = inspect.getsourcelines(fn)
18602
- source = "".join(source_lines).strip()
18603
- # Extract the `pre` argument from the source code
18604
- pre_arg = _extract_pre_argument(source)
18605
- return pre_arg
18606
- except (OSError, TypeError): # pragma: no cover
18607
- return fn.__name__
18608
- return fn # pragma: no cover
19723
+ try:
19724
+ source_lines, _ = inspect.getsourcelines(fn)
19725
+ source = "".join(source_lines).strip()
19726
+ # Extract the `pre` argument from the source code
19727
+ pre_arg = _extract_pre_argument(source)
19728
+ return pre_arg
19729
+ except (OSError, TypeError): # pragma: no cover
19730
+ return fn.__name__ # ty: ignore
18609
19731
 
18610
19732
 
18611
19733
  def _extract_pre_argument(source: str) -> str:
@@ -18625,12 +19747,78 @@ def _extract_pre_argument(source: str) -> str:
18625
19747
  return pre_arg
18626
19748
 
18627
19749
 
19750
+ def _create_governance_metadata_html(
19751
+ owner: str | None,
19752
+ consumers: list[str] | None,
19753
+ version: str | None,
19754
+ ) -> str:
19755
+ """
19756
+ Create HTML for governance metadata display in the report footer.
19757
+
19758
+ Parameters
19759
+ ----------
19760
+ owner
19761
+ The owner of the data being validated.
19762
+ consumers
19763
+ List of consumers who depend on the data.
19764
+ version
19765
+ The version of the validation plan.
19766
+
19767
+ Returns
19768
+ -------
19769
+ str
19770
+ HTML string containing formatted governance metadata, or empty string if no metadata.
19771
+ """
19772
+ if owner is None and consumers is None and version is None:
19773
+ return ""
19774
+
19775
+ metadata_parts = []
19776
+
19777
+ # Common style for the metadata badges (similar to timing style but slightly smaller font)
19778
+ badge_style = (
19779
+ "background-color: #FFF; color: #444; padding: 0.5em 0.5em; position: inherit; "
19780
+ "margin-right: 5px; border: solid 1px #999999; font-variant-numeric: tabular-nums; "
19781
+ "border-radius: 0; padding: 2px 10px 2px 10px; font-size: 11px;"
19782
+ )
19783
+ label_style = (
19784
+ "color: #777; font-weight: bold; font-size: 9px; text-transform: uppercase; "
19785
+ "margin-right: 3px;"
19786
+ )
19787
+
19788
+ if owner is not None:
19789
+ metadata_parts.append(
19790
+ f"<span style='{badge_style}'><span style='{label_style}'>Owner:</span> {owner}</span>"
19791
+ )
19792
+
19793
+ if consumers is not None and len(consumers) > 0:
19794
+ consumers_str = ", ".join(consumers)
19795
+ metadata_parts.append(
19796
+ f"<span style='{badge_style}'>"
19797
+ f"<span style='{label_style}'>Consumers:</span> {consumers_str}"
19798
+ f"</span>"
19799
+ )
19800
+
19801
+ if version is not None:
19802
+ metadata_parts.append(
19803
+ f"<span style='{badge_style}'>"
19804
+ f"<span style='{label_style}'>Version:</span> {version}"
19805
+ f"</span>"
19806
+ )
19807
+
19808
+ return (
19809
+ f"<div style='margin-top: 5px; margin-bottom: 5px; margin-left: 10px;'>"
19810
+ f"{''.join(metadata_parts)}"
19811
+ f"</div>"
19812
+ )
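A minimal usage sketch of the governance footer helper above; the owner, consumer, and version values are hypothetical:

```python
badge_html = _create_governance_metadata_html(
    owner="Data Platform Team",
    consumers=["Finance", "Marketing"],
    version="1.2.0",
)
# With values set, the result is a <div> of styled <span> badges; with nothing set it is empty.
assert badge_html.startswith("<div")
assert _create_governance_metadata_html(None, None, None) == ""
```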
19813
+
19814
+
18628
19815
  def _create_table_time_html(
18629
19816
  time_start: datetime.datetime | None, time_end: datetime.datetime | None
18630
19817
  ) -> str:
18631
19818
  if time_start is None:
18632
19819
  return ""
18633
19820
 
19821
+ assert time_end is not None # typing
18634
19822
  # Get the time duration (difference between `time_end` and `time_start`) in seconds
18635
19823
  time_duration = (time_end - time_start).total_seconds()
18636
19824
 
@@ -18845,11 +20033,11 @@ def _format_number_safe(
18845
20033
  locale=locale,
18846
20034
  df_lib=df_lib,
18847
20035
  )
18848
- else:
18849
- # Fallback to the original behavior
18850
- return fmt_number(
18851
- value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
18852
- )[0] # pragma: no cover
20036
+ ints = fmt_number(
20037
+ value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
20038
+ )
20039
+ assert isinstance(ints, list)
20040
+ return ints[0]
18853
20041
 
18854
20042
 
18855
20043
  def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
@@ -18862,9 +20050,10 @@ def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
18862
20050
  if df_lib is not None and value is not None:
18863
20051
  # Use GT-based formatting to avoid Pandas dependency completely
18864
20052
  return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
18865
- else:
18866
- # Fallback to the original behavior
18867
- return fmt_integer(value, locale=locale)[0]
20053
+
20054
+ ints = fmt_integer(value, locale=locale)
20055
+ assert isinstance(ints, list)
20056
+ return ints[0]
18868
20057
 
18869
20058
 
18870
20059
  def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
@@ -18980,7 +20169,7 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
18980
20169
  HTML string containing the formatted threshold information.
18981
20170
  """
18982
20171
  if thresholds == Thresholds():
18983
- return ""
20172
+ return "" # pragma: no cover
18984
20173
 
18985
20174
  # Get df_lib for formatting
18986
20175
  df_lib = None
@@ -18988,10 +20177,10 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
18988
20177
  import polars as pl
18989
20178
 
18990
20179
  df_lib = pl
18991
- elif _is_lib_present("pandas"):
18992
- import pandas as pd
20180
+ elif _is_lib_present("pandas"): # pragma: no cover
20181
+ import pandas as pd # pragma: no cover
18993
20182
 
18994
- df_lib = pd
20183
+ df_lib = pd # pragma: no cover
18995
20184
 
18996
20185
  # Helper function to format threshold values using the shared formatting functions
18997
20186
  def _format_threshold_value(fraction: float | None, count: int | None) -> str:
@@ -18999,10 +20188,12 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
18999
20188
  # Format as fraction/percentage with locale formatting
19000
20189
  if fraction == 0:
19001
20190
  return "0"
19002
- elif fraction < 0.01:
20191
+ elif fraction < 0.01: # pragma: no cover
19003
20192
  # For very small fractions, show "<0.01" with locale formatting
19004
- formatted = _format_number_safe(0.01, decimals=2, locale=locale, df_lib=df_lib)
19005
- return f"&lt;{formatted}"
20193
+ formatted = _format_number_safe(
20194
+ 0.01, decimals=2, locale=locale, df_lib=df_lib
20195
+ ) # pragma: no cover
20196
+ return f"&lt;{formatted}" # pragma: no cover
19006
20197
  else:
19007
20198
  # Use shared formatting function with drop_trailing_zeros
19008
20199
  formatted = _format_number_safe(
@@ -19079,14 +20270,14 @@ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
19079
20270
  if fraction is not None:
19080
20271
  if fraction == 0:
19081
20272
  return "0"
19082
- elif fraction < 0.01:
19083
- return "<0.01"
20273
+ elif fraction < 0.01: # pragma: no cover
20274
+ return "<0.01" # pragma: no cover
19084
20275
  else:
19085
20276
  return f"{fraction:.2f}".rstrip("0").rstrip(".")
19086
20277
  elif count is not None:
19087
20278
  return str(count)
19088
20279
  else:
19089
- return "—"
20280
+ return "—" # pragma: no cover
19090
20281
 
19091
20282
  parts = []
19092
20283
 
@@ -19105,7 +20296,7 @@ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
19105
20296
  if parts:
19106
20297
  return "Step-specific thresholds set: " + ", ".join(parts)
19107
20298
  else:
19108
- return ""
20299
+ return "" # pragma: no cover
19109
20300
 
19110
20301
 
19111
20302
  def _create_threshold_reset_note_html(locale: str = "en") -> str:
@@ -19654,13 +20845,13 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") ->
19654
20845
  f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
19655
20846
  )
19656
20847
  else:
19657
- summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.'
20848
+ summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.' # pragma: no cover
19658
20849
 
19659
20850
  # Generate the step report table using the existing function
19660
20851
  # We'll call either _step_report_schema_in_order or _step_report_schema_any_order
19661
20852
  # depending on the in_order parameter
19662
- if in_order:
19663
- step_report_gt = _step_report_schema_in_order(
20853
+ if in_order: # pragma: no cover
20854
+ step_report_gt = _step_report_schema_in_order( # pragma: no cover
19664
20855
  step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19665
20856
  )
19666
20857
  else:
@@ -19691,7 +20882,7 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") ->
19691
20882
  """
19692
20883
 
19693
20884
  # Add the settings as an additional source note to the step report
19694
- step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html))
20885
+ step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html)) # type: ignore[union-attr]
19695
20886
 
19696
20887
  # Extract the HTML from the GT object
19697
20888
  step_report_html = step_report_gt._repr_html_()
@@ -19743,12 +20934,12 @@ def _step_report_row_based(
19743
20934
  column: str,
19744
20935
  column_position: int,
19745
20936
  columns_subset: list[str] | None,
19746
- values: any,
20937
+ values: Any,
19747
20938
  inclusive: tuple[bool, bool] | None,
19748
20939
  n: int,
19749
20940
  n_failed: int,
19750
20941
  all_passed: bool,
19751
- extract: any,
20942
+ extract: Any,
19752
20943
  tbl_preview: GT,
19753
20944
  header: str,
19754
20945
  limit: int | None,
@@ -19775,10 +20966,12 @@ def _step_report_row_based(
19775
20966
  elif assertion_type == "col_vals_le":
19776
20967
  text = f"{column} &le; {values}"
19777
20968
  elif assertion_type == "col_vals_between":
20969
+ assert inclusive is not None
19778
20970
  symbol_left = "&le;" if inclusive[0] else "&lt;"
19779
20971
  symbol_right = "&le;" if inclusive[1] else "&lt;"
19780
20972
  text = f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}"
19781
20973
  elif assertion_type == "col_vals_outside":
20974
+ assert inclusive is not None
19782
20975
  symbol_left = "&lt;" if inclusive[0] else "&le;"
19783
20976
  symbol_right = "&gt;" if inclusive[1] else "&ge;"
19784
20977
  text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}"
@@ -19999,7 +21192,7 @@ def _step_report_rows_distinct(
19999
21192
  n: int,
20000
21193
  n_failed: int,
20001
21194
  all_passed: bool,
20002
- extract: any,
21195
+ extract: Any,
20003
21196
  tbl_preview: GT,
20004
21197
  header: str,
20005
21198
  limit: int | None,
@@ -20125,9 +21318,299 @@ def _step_report_rows_distinct(
20125
21318
  return step_report
20126
21319
 
20127
21320
 
21321
+ def _step_report_aggregate(
21322
+ assertion_type: str,
21323
+ i: int,
21324
+ column: str,
21325
+ values: dict,
21326
+ all_passed: bool,
21327
+ val_info: dict | None,
21328
+ header: str,
21329
+ lang: str,
21330
+ ) -> GT:
21331
+ """
21332
+ Generate a step report for aggregate validation methods (col_sum_*, col_avg_*, col_sd_*).
21333
+
21334
+ This creates a 1-row table showing the computed aggregate value vs. the target value,
21335
+ along with tolerance and pass/fail status.
21336
+ """
21337
+
21338
+ # Determine whether the `lang` value represents a right-to-left language
21339
+ is_rtl_lang = lang in RTL_LANGUAGES
21340
+ direction_rtl = " direction: rtl;" if is_rtl_lang else ""
21341
+
21342
+ # Parse assertion type to get aggregate function and comparison operator
21343
+ # Format: col_{agg}_{comp} (e.g., col_sum_eq, col_avg_gt, col_sd_le)
21344
+ parts = assertion_type.split("_")
21345
+ agg_type = parts[1] # sum, avg, sd
21346
+ comp_type = parts[2] # eq, gt, ge, lt, le
21347
+
21348
+ # Map aggregate type to display name
21349
+ agg_display = {"sum": "SUM", "avg": "AVG", "sd": "SD"}.get(agg_type, agg_type.upper())
21350
+
21351
+ # Map comparison type to symbol
21352
+ comp_symbols = {
21353
+ "eq": "=",
21354
+ "gt": "&gt;",
21355
+ "ge": "&ge;",
21356
+ "lt": "&lt;",
21357
+ "le": "&le;",
21358
+ }
21359
+ comp_symbol = comp_symbols.get(comp_type, comp_type)
21360
+
21361
+ # Get computed values from val_info (stored during interrogation)
21362
+ if val_info is not None:
21363
+ actual = val_info.get("actual", None)
21364
+ target = val_info.get("target", None)
21365
+ tol = val_info.get("tol", 0)
21366
+ lower_bound = val_info.get("lower_bound", target)
21367
+ upper_bound = val_info.get("upper_bound", target)
21368
+ else:
21369
+ # Fallback if val_info is not available
21370
+ actual = None
21371
+ target = values.get("value", None)
21372
+ tol = values.get("tol", 0)
21373
+ lower_bound = target
21374
+ upper_bound = target
21375
+
21376
+ # Format column name for display (handle list vs string)
21377
+ if isinstance(column, list):
21378
+ column_display = column[0] if len(column) == 1 else ", ".join(column)
21379
+ else:
21380
+ column_display = str(column)
21381
+
21382
+ # Generate assertion text for header
21383
+ if target is not None:
21384
+ target_display = f"{target:,.6g}" if isinstance(target, float) else f"{target:,}"
21385
+ assertion_text = f"{agg_display}({column_display}) {comp_symbol} {target_display}"
21386
+ else:
21387
+ assertion_text = f"{agg_display}({column_display}) {comp_symbol} ?"
21388
+
21389
+ # Calculate difference from boundary
21390
+ if actual is not None and target is not None:
21391
+ if comp_type == "eq":
21392
+ # For equality, show distance from target (considering tolerance)
21393
+ if lower_bound == upper_bound:
21394
+ difference = actual - target
21395
+ else:
21396
+ # With tolerance, show distance from nearest bound
21397
+ if actual < lower_bound:
21398
+ difference = actual - lower_bound
21399
+ elif actual > upper_bound:
21400
+ difference = actual - upper_bound
21401
+ else:
21402
+ difference = 0 # Within bounds
21403
+ elif comp_type in ["gt", "ge"]:
21404
+ # Distance from lower bound (positive if passing)
21405
+ difference = actual - lower_bound
21406
+ elif comp_type in ["lt", "le"]:
21407
+ # Distance from upper bound (negative if passing)
21408
+ difference = actual - upper_bound
21409
+ else:
21410
+ difference = actual - target
21411
+ else:
21412
+ difference = None
21413
+
21414
+ # Format values for display
21415
+ def format_value(v):
21416
+ if v is None:
21417
+ return "&mdash;"
21418
+ if isinstance(v, float):
21419
+ return f"{v:,.6g}"
21420
+ return f"{v:,}"
21421
+
21422
+ # Format tolerance for display
21423
+ if tol == 0:
21424
+ tol_display = "&mdash;"
21425
+ elif isinstance(tol, tuple):
21426
+ tol_display = f"(-{tol[0]}, +{tol[1]})"
21427
+ else:
21428
+ tol_display = f"&plusmn;{tol}"
21429
+
21430
+ # Format difference with sign
21431
+ if difference is not None:
21432
+ if difference == 0:
21433
+ diff_display = "0"
21434
+ elif difference > 0:
21435
+ diff_display = (
21436
+ f"+{difference:,.6g}" if isinstance(difference, float) else f"+{difference:,}"
21437
+ )
21438
+ else:
21439
+ diff_display = (
21440
+ f"{difference:,.6g}" if isinstance(difference, float) else f"{difference:,}"
21441
+ )
21442
+ else:
21443
+ diff_display = "&mdash;"
21444
+
21445
+ # Create pass/fail indicator
21446
+ if all_passed:
21447
+ status_html = CHECK_MARK_SPAN
21448
+ status_color = "#4CA64C"
21449
+ else:
21450
+ status_html = CROSS_MARK_SPAN
21451
+ status_color = "#CF142B"
21452
+
21453
+ # Select DataFrame library (prefer Polars, fall back to Pandas)
21454
+ if _is_lib_present("polars"):
21455
+ import polars as pl
21456
+
21457
+ df_lib = pl
21458
+ elif _is_lib_present("pandas"): # pragma: no cover
21459
+ import pandas as pd # pragma: no cover
21460
+
21461
+ df_lib = pd # pragma: no cover
21462
+ else: # pragma: no cover
21463
+ raise ImportError(
21464
+ "Neither Polars nor Pandas is available for step report generation"
21465
+ ) # pragma: no cover
21466
+
21467
+ # Create the data for the 1-row table
21468
+ report_data = df_lib.DataFrame(
21469
+ {
21470
+ "actual": [format_value(actual)],
21471
+ "target": [format_value(target)],
21472
+ "tolerance": [tol_display],
21473
+ "difference": [diff_display],
21474
+ "status": [status_html],
21475
+ }
21476
+ )
21477
+
21478
+ # Create GT table with styling matching preview() and other step reports
21479
+ step_report = (
21480
+ GT(report_data, id="pb_step_tbl")
21481
+ .opt_table_font(font=google_font(name="IBM Plex Sans"))
21482
+ .opt_align_table_header(align="left")
21483
+ .cols_label(
21484
+ actual="ACTUAL",
21485
+ target="EXPECTED",
21486
+ tolerance="TOL",
21487
+ difference="DIFFERENCE",
21488
+ status="",
21489
+ )
21490
+ .cols_align(align="center")
21491
+ .fmt_markdown(columns=["actual", "target", "tolerance", "difference", "status"])
21492
+ .tab_style(
21493
+ style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="13px"),
21494
+ locations=loc.body(columns=["actual", "target", "tolerance", "difference"]),
21495
+ )
21496
+ .tab_style(
21497
+ style=style.text(size="13px"),
21498
+ locations=loc.body(columns="status"),
21499
+ )
21500
+ .tab_style(
21501
+ style=style.text(color="gray20", font=google_font(name="IBM Plex Mono"), size="12px"),
21502
+ locations=loc.column_labels(),
21503
+ )
21504
+ .tab_style(
21505
+ style=style.borders(
21506
+ sides=["top", "bottom"], color="#E9E9E9", style="solid", weight="1px"
21507
+ ),
21508
+ locations=loc.body(),
21509
+ )
21510
+ .tab_options(
21511
+ table_body_vlines_style="solid",
21512
+ table_body_vlines_width="1px",
21513
+ table_body_vlines_color="#E9E9E9",
21514
+ column_labels_vlines_style="solid",
21515
+ column_labels_vlines_width="1px",
21516
+ column_labels_vlines_color="#F2F2F2",
21517
+ )
21518
+ .cols_width(
21519
+ cases={
21520
+ "actual": "200px",
21521
+ "target": "200px",
21522
+ "tolerance": "150px",
21523
+ "difference": "200px",
21524
+ "status": "50px",
21525
+ }
21526
+ )
21527
+ )
21528
+
21529
+ # Apply styling based on pass/fail
21530
+ if all_passed:
21531
+ step_report = step_report.tab_style(
21532
+ style=[
21533
+ style.text(color="#006400"),
21534
+ style.fill(color="#4CA64C33"),
21535
+ ],
21536
+ locations=loc.body(columns="status"),
21537
+ )
21538
+ else:
21539
+ step_report = step_report.tab_style(
21540
+ style=[
21541
+ style.text(color="#B22222"),
21542
+ style.fill(color="#FFC1C159"),
21543
+ ],
21544
+ locations=loc.body(columns="status"),
21545
+ )
21546
+
21547
+ # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing
21548
+ if version("great_tables") >= "0.17.0":
21549
+ step_report = step_report.tab_options(quarto_disable_processing=True)
21550
+
21551
+ # If no header requested, return the table as-is
21552
+ if header is None:
21553
+ return step_report
21554
+
21555
+ # Create header content
21556
+ assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
21557
+
21558
+ # Wrap assertion text in styled code tag
21559
+ assertion_code = (
21560
+ f"<code style='color: #303030; font-family: monospace; font-size: smaller;'>"
21561
+ f"{assertion_text}</code>"
21562
+ )
21563
+
21564
+ if all_passed:
21565
+ title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
21566
+ result_stmt = STEP_REPORT_TEXT.get("agg_success_statement", {}).get(
21567
+ lang,
21568
+ f"The aggregate value for column <code>{column_display}</code> satisfies the condition.",
21569
+ )
21570
+ if isinstance(result_stmt, str) and "{column}" in result_stmt:
21571
+ result_stmt = result_stmt.format(column=column_display)
21572
+ else:
21573
+ title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CROSS_MARK_SPAN
21574
+ result_stmt = STEP_REPORT_TEXT.get("agg_failure_statement", {}).get(
21575
+ lang,
21576
+ f"The aggregate value for column <code>{column_display}</code> does not satisfy the condition.",
21577
+ )
21578
+ if isinstance(result_stmt, str) and "{column}" in result_stmt:
21579
+ result_stmt = result_stmt.format(column=column_display)
21580
+
21581
+ details = (
21582
+ f"<div style='font-size: 13.6px; {direction_rtl}'>"
21583
+ "<div style='padding-top: 7px;'>"
21584
+ f"{assertion_header_text} <span style='border-style: solid; border-width: thin; "
21585
+ "border-color: lightblue; padding-left: 2px; padding-right: 2px;'>"
21586
+ "<code style='color: #303030; background-color: transparent; "
21587
+ f"position: relative; bottom: 1px;'>{assertion_code}</code></span>"
21588
+ "</div>"
21589
+ "<div style='padding-top: 7px;'>"
21590
+ f"{result_stmt}"
21591
+ "</div>"
21592
+ "</div>"
21593
+ )
21594
+
21595
+ # Generate the default template text for the header when `":default:"` is used
21596
+ if header == ":default:":
21597
+ header = "{title}{details}"
21598
+
21599
+ # Use commonmark to convert the header text to HTML
21600
+ header = commonmark.commonmark(header)
21601
+
21602
+ # Place any templated text in the header
21603
+ header = header.format(title=title, details=details)
21604
+
21605
+ # Create the header with `header` string
21606
+ step_report = step_report.tab_header(title=md(header))
21607
+
21608
+ return step_report
21609
+
21610
+
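# A minimal, self-contained sketch of the boundary/difference logic used by the aggregate
# step report above. The helper name and sample numbers are illustrative only (not part of
# the package); `lower`/`upper` stand for the tolerance-adjusted bounds stored in `val_info`
# during interrogation.
def _difference_from_boundary(actual, target, lower, upper, comp_type):
    if comp_type == "eq":
        # For equality, report the distance from the nearest bound (0 when within bounds).
        if lower == upper:
            return actual - target
        if actual < lower:
            return actual - lower
        if actual > upper:
            return actual - upper
        return 0
    if comp_type in ("gt", "ge"):
        # Distance from the lower bound (positive when passing).
        return actual - lower
    if comp_type in ("lt", "le"):
        # Distance from the upper bound (negative when passing).
        return actual - upper
    return actual - target

# e.g. an observed SUM of 14 against a target of 15 with tol=2 (bounds 13..17) lies within
# the bounds, so the reported difference is 0.
assert _difference_from_boundary(14, 15, 13, 17, "eq") == 0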
20128
21611
  def _step_report_schema_in_order(
20129
- step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
20130
- ) -> GT | any:
21612
+ step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
21613
+ ) -> GT | Any:
20131
21614
  """
20132
21615
  This is the case for schema validation where the schema is supposed to have the same column
20133
21616
  order as the target table.
@@ -20195,22 +21678,22 @@ def _step_report_schema_in_order(
20195
21678
 
20196
21679
  # Check if this column exists in exp_columns_dict (it might not if it's a duplicate)
20197
21680
  # For duplicates, we need to handle them specially
20198
- if column_name_exp_i not in exp_columns_dict:
21681
+ if column_name_exp_i not in exp_columns_dict: # pragma: no cover
20199
21682
  # This is a duplicate or invalid column, mark it as incorrect
20200
- col_exp_correct.append(CROSS_MARK_SPAN)
21683
+ col_exp_correct.append(CROSS_MARK_SPAN) # pragma: no cover
20201
21684
 
20202
21685
  # For dtype, check if there's a dtype specified in the schema
20203
- if len(expect_schema[i]) > 1:
20204
- dtype_value = expect_schema[i][1]
20205
- if isinstance(dtype_value, list):
20206
- dtype_exp.append(" | ".join(dtype_value))
20207
- else:
20208
- dtype_exp.append(str(dtype_value))
20209
- else:
20210
- dtype_exp.append("&mdash;")
21686
+ if len(expect_schema[i]) > 1: # pragma: no cover
21687
+ dtype_value = expect_schema[i][1] # pragma: no cover
21688
+ if isinstance(dtype_value, list): # pragma: no cover
21689
+ dtype_exp.append(" | ".join(dtype_value)) # pragma: no cover
21690
+ else: # pragma: no cover
21691
+ dtype_exp.append(str(dtype_value)) # pragma: no cover
21692
+ else: # pragma: no cover
21693
+ dtype_exp.append("&mdash;") # pragma: no cover
20211
21694
 
20212
- dtype_exp_correct.append("&mdash;")
20213
- continue
21695
+ dtype_exp_correct.append("&mdash;") # pragma: no cover
21696
+ continue # pragma: no cover
20214
21697
 
20215
21698
  #
20216
21699
  # `col_exp_correct` values
@@ -20433,7 +21916,9 @@ def _step_report_schema_in_order(
20433
21916
  # Add a border below the row that terminates the target table schema
20434
21917
  step_report = step_report.tab_style(
20435
21918
  style=style.borders(sides="bottom", color="#6699CC80", style="solid", weight="1px"),
20436
- locations=loc.body(rows=len(colnames_tgt) - 1),
21919
+ locations=loc.body(
21920
+ rows=len(colnames_tgt) - 1 # ty: ignore (bug in GT, should allow an int)
21921
+ ),
20437
21922
  )
20438
21923
 
20439
21924
  # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing
@@ -20482,8 +21967,8 @@ def _step_report_schema_in_order(
20482
21967
 
20483
21968
 
20484
21969
  def _step_report_schema_any_order(
20485
- step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
20486
- ) -> GT | any:
21970
+ step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
21971
+ ) -> GT | pl.DataFrame:
20487
21972
  """
20488
21973
  This is the case for schema validation where the schema is permitted to not have to be in the
20489
21974
  same column order as the target table.
@@ -20902,9 +22387,7 @@ def _step_report_schema_any_order(
20902
22387
  header = header.format(title=title, details=details)
20903
22388
 
20904
22389
  # Create the header with `header` string
20905
- step_report = step_report.tab_header(title=md(header))
20906
-
20907
- return step_report
22390
+ return step_report.tab_header(title=md(header))
20908
22391
 
20909
22392
 
20910
22393
  def _create_label_text_html(
@@ -20993,3 +22476,321 @@ def _create_col_schema_match_params_html(
20993
22476
  f"{full_match_dtypes_text}"
20994
22477
  "</div>"
20995
22478
  )
22479
+
22480
+
22481
+ def _generate_agg_docstring(name: str) -> str:
22482
+ """Generate a comprehensive docstring for an aggregation validation method.
22483
+
22484
+ This function creates detailed documentation for dynamically generated methods like
22485
+ `col_sum_eq()`, `col_avg_gt()`, `col_sd_le()`, etc. The docstrings follow the same
22486
+ structure and quality as manually written validation methods like `col_vals_gt()`.
22487
+
22488
+ Parameters
22489
+ ----------
22490
+ name
22491
+ The method name (e.g., "col_sum_eq", "col_avg_gt", "col_sd_le").
22492
+
22493
+ Returns
22494
+ -------
22495
+ str
22496
+ A complete docstring for the method.
22497
+ """
22498
+ # Parse the method name to extract aggregation type and comparison operator
22499
+ # Format: col_{agg}_{comp} (e.g., col_sum_eq, col_avg_gt, col_sd_le)
22500
+ parts = name.split("_")
22501
+ agg_type = parts[1] # sum, avg, sd
22502
+ comp_type = parts[2] # eq, gt, ge, lt, le
22503
+
22504
+ # Human-readable names for aggregation types
22505
+ agg_names = {
22506
+ "sum": ("sum", "summed"),
22507
+ "avg": ("average", "averaged"),
22508
+ "sd": ("standard deviation", "computed for standard deviation"),
22509
+ }
22510
+
22511
+ # Human-readable descriptions for comparison operators (with article for title)
22512
+ comp_descriptions = {
22513
+ "eq": ("equal to", "equals", "an"),
22514
+ "gt": ("greater than", "is greater than", "a"),
22515
+ "ge": ("greater than or equal to", "is at least", "a"),
22516
+ "lt": ("less than", "is less than", "a"),
22517
+ "le": ("less than or equal to", "is at most", "a"),
22518
+ }
22519
+
22520
+ # Mathematical symbols for comparison operators
22521
+ comp_symbols = {
22522
+ "eq": "==",
22523
+ "gt": ">",
22524
+ "ge": ">=",
22525
+ "lt": "<",
22526
+ "le": "<=",
22527
+ }
22528
+
22529
+ agg_name, agg_verb = agg_names[agg_type]
22530
+ comp_desc, comp_phrase, comp_article = comp_descriptions[comp_type]
22531
+ comp_symbol = comp_symbols[comp_type]
22532
+
22533
+ # Determine the appropriate example values based on the aggregation and comparison
22534
+ if agg_type == "sum":
22535
+ example_value = "15"
22536
+ example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
22537
+ example_sum = "15" # sum of a
22538
+ example_ref_sum = "10" # sum of b
22539
+ elif agg_type == "avg":
22540
+ example_value = "3"
22541
+ example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
22542
+ example_sum = "3.0" # avg of a
22543
+ example_ref_sum = "2.0" # avg of b
22544
+ else: # sd
22545
+ example_value = "2"
22546
+ example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
22547
+ example_sum = "~1.58" # sd of a
22548
+ example_ref_sum = "0.0" # sd of b
22549
+
22550
+ # Build appropriate tolerance explanation based on comparison type
22551
+ if comp_type == "eq":
22552
+ tol_explanation = f"""The `tol=` parameter is particularly useful with `{name}()` since exact equality
22553
+ comparisons on floating-point aggregations can be problematic due to numerical precision.
22554
+ Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
22555
+ floating-point arithmetic."""
22556
+ else:
22557
+ tol_explanation = f"""The `tol=` parameter expands the acceptable range for the comparison. For
22558
+ `{name}()`, a tolerance of `tol=0.5` would mean the {agg_name} can be within `0.5` of the
22559
+ target value and still pass validation."""
22560
+
22561
+ docstring = f"""
22562
+ Does the column {agg_name} satisfy {comp_article} {comp_desc} comparison?
22563
+
22564
+ The `{name}()` validation method checks whether the {agg_name} of values in a column
22565
+ {comp_phrase} a specified `value=`. This is an aggregation-based validation where the entire
22566
+ column is reduced to a single {agg_name} value that is then compared against the target. The
22567
+ comparison used in this function is `{agg_name}(column) {comp_symbol} value`.
22568
+
22569
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
22570
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
22571
+ the comparison) or fails completely.
22572
+
22573
+ Parameters
22574
+ ----------
22575
+ columns
22576
+ A single column or a list of columns to validate. If multiple columns are supplied,
22577
+ there will be a separate validation step generated for each column. The columns must
22578
+ contain numeric data for the {agg_name} to be computed.
22579
+ value
22580
+ The value to compare the column {agg_name} against. This can be: (1) a numeric literal
22581
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
22582
+ whose {agg_name} will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
22583
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
22584
+ `None` to automatically compare against the same column in reference data (shorthand for
22585
+ `ref(column_name)` when reference data is set).
22586
+ tol
22587
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
22588
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
22589
+ a {agg_name} that differs from the target by up to `0.5` will still pass. {tol_explanation}
22590
+ thresholds
22591
+ Failure threshold levels so that the validation step can react accordingly when
22592
+ failing test units exceed the set levels. Since this is an aggregation-based validation with only
22593
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
22594
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
22595
+ acceptable.
22596
+ brief
22597
+ An optional brief description of the validation step that will be displayed in the
22598
+ reporting table. You can use the templating elements like `"{{step}}"` to insert
22599
+ the step number, or `"{{auto}}"` to include an automatically generated brief. If `True`
22600
+ the entire brief will be automatically generated. If `None` (the default) then there
22601
+ won't be a brief.
22602
+ actions
22603
+ Optional actions to take when the validation step meets or exceeds any set threshold
22604
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
22605
+ define the actions.
22606
+ active
22607
+ A boolean value indicating whether the validation step should be active. Using `False`
22608
+ will make the validation step inactive (still reporting its presence and keeping indexes
22609
+ for the steps unchanged).
22610
+
22611
+ Returns
22612
+ -------
22613
+ Validate
22614
+ The `Validate` object with the added validation step.
22615
+
22616
+ Using Reference Data
22617
+ --------------------
22618
+ The `{name}()` method supports comparing column aggregations against reference data. This
22619
+ is useful for validating that statistical properties remain consistent across different
22620
+ versions of a dataset, or for comparing current data against historical baselines.
22621
+
22622
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
22623
+
22624
+ ```python
22625
+ validation = (
22626
+ pb.Validate(data=current_data, reference=baseline_data)
22627
+ .{name}(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
22628
+ .interrogate()
22629
+ )
22630
+ ```
22631
+
22632
+ When `value=None` and reference data is set, the method automatically compares against the
22633
+ same column in the reference data. You can also explicitly specify reference columns using
22634
+ the `ref()` helper:
22635
+
22636
+ ```python
22637
+ .{name}(columns="revenue", value=pb.ref("baseline_revenue"))
22638
+ ```
22639
+
22640
+ Understanding Tolerance
22641
+ -----------------------
22642
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
22643
+ floating-point aggregations where exact equality is often unreliable.
22644
+
22645
+ {tol_explanation}
22646
+
22647
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
22648
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
22649
+ shifts the comparison boundary.
22650
+
22651
+ Thresholds
22652
+ ----------
22653
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
22654
+ step. If they are set here at the step level, these thresholds will override any thresholds
22655
+ set at the global level in `Validate(thresholds=...)`.
22656
+
22657
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
22658
+ validations operate on a single test unit (the aggregated value), threshold values are
22659
+ typically set as absolute counts:
22660
+
22661
+ - `thresholds=1` means any failure triggers a 'warning'
22662
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
22663
+
22664
+ Thresholds can be defined using one of these input schemes:
22665
+
22666
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
22667
+ thresholds)
22668
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
22669
+ the 'error' level, and position `2` is the 'critical' level
22670
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
22671
+ 'critical'
22672
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
22673
+ for the 'warning' level only
22674
+
22675
+ Examples
22676
+ --------
22677
+ ```{{python}}
22678
+ #| echo: false
22679
+ #| output: false
22680
+ import pointblank as pb
22681
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
22682
+ ```
22683
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
22684
+ shown below:
22685
+
22686
+ ```{{python}}
22687
+ import pointblank as pb
22688
+ import polars as pl
22689
+
22690
+ tbl = pl.DataFrame(
22691
+ {{
22692
+ "a": [1, 2, 3, 4, 5],
22693
+ "b": [2, 2, 2, 2, 2],
22694
+ }}
22695
+ )
22696
+
22697
+ pb.preview(tbl)
22698
+ ```
22699
+
22700
+ Let's validate that the {agg_name} of column `a` {comp_phrase} `{example_value}`:
22701
+
22702
+ ```{{python}}
22703
+ validation = (
22704
+ pb.Validate(data=tbl)
22705
+ .{name}(columns="a", value={example_value})
22706
+ .interrogate()
22707
+ )
22708
+
22709
+ validation
22710
+ ```
22711
+
22712
+ The validation result shows whether the {agg_name} comparison passed or failed. Since this
22713
+ is an aggregation-based validation, there is exactly one test unit per column.
22714
+
22715
+ When validating multiple columns, each column gets its own validation step:
22716
+
22717
+ ```{{python}}
22718
+ validation = (
22719
+ pb.Validate(data=tbl)
22720
+ .{name}(columns=["a", "b"], value={example_value})
22721
+ .interrogate()
22722
+ )
22723
+
22724
+ validation
22725
+ ```
22726
+
22727
+ Using tolerance for flexible comparisons:
22728
+
22729
+ ```{{python}}
22730
+ validation = (
22731
+ pb.Validate(data=tbl)
22732
+ .{name}(columns="a", value={example_value}, tol=1.0)
22733
+ .interrogate()
22734
+ )
22735
+
22736
+ validation
22737
+ ```
22738
+ """
22739
+
22740
+ return docstring.strip()
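# A short, standalone illustration (not part of the package) of how the name parsing and
# lookup tables above combine for one generated method; the literal dicts restate just the
# entries of `agg_names` and `comp_descriptions` needed here.
name = "col_avg_ge"
_, agg, comp = name.split("_")  # -> ("col", "avg", "ge")
assert (agg, comp) == ("avg", "ge")
agg_label = {"sum": "sum", "avg": "average", "sd": "standard deviation"}[agg]
comp_label = {
    "eq": "equal to",
    "gt": "greater than",
    "ge": "greater than or equal to",
    "lt": "less than",
    "le": "less than or equal to",
}[comp]
# The generated docstring for `col_avg_ge()` therefore opens with:
# "Does the column average satisfy a greater than or equal to comparison?"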
22741
+
22742
+
22743
+ def make_agg_validator(name: str):
22744
+ """Factory for dynamically generated aggregate validation methods.
22745
+
22746
+ Why this exists:
22747
+ Aggregate validators all share identical behavior. The only thing that differs
22748
+ between them is the semantic assertion type (their name). The implementation
22749
+ of each aggregate validator is fetched from `from_agg_validator`.
22750
+
22751
+ Instead of copy/pasting dozens of identical methods, we generate
22752
+ them dynamically and attach them to the Validate class. The type stubs are generated
22753
+ at build time with `make pyi` so that the methods are visible to type checkers,
22754
+ documentation builders, and IDEs/LSPs.
22755
+
22756
+ The returned function is a thin adapter that forwards all arguments to
22757
+ `_add_agg_validation`, supplying the assertion type explicitly.
22758
+ """
22759
+
22760
+ def agg_validator(
22761
+ self: Validate,
22762
+ columns: str | Collection[str],
22763
+ value: float | int | Column | ReferenceColumn | None = None,
22764
+ tol: float = 0,
22765
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
22766
+ brief: str | bool | None = None,
22767
+ actions: Actions | None = None,
22768
+ active: bool = True,
22769
+ ) -> Validate:
22770
+ # Dynamically generated aggregate validator.
22771
+ # This method is generated per assertion type and forwards all arguments
22772
+ # to the shared aggregate validation implementation.
22773
+ return self._add_agg_validation(
22774
+ assertion_type=name,
22775
+ columns=columns,
22776
+ value=value,
22777
+ tol=tol,
22778
+ thresholds=thresholds,
22779
+ brief=brief,
22780
+ actions=actions,
22781
+ active=active,
22782
+ )
22783
+
22784
+ # Manually set function identity so this behaves like a real method.
22785
+ # These must be set before attaching the function to the class.
22786
+ agg_validator.__name__ = name
22787
+ agg_validator.__qualname__ = f"Validate.{name}"
22788
+ agg_validator.__doc__ = _generate_agg_docstring(name)
22789
+
22790
+ return agg_validator
22791
+
22792
+
22793
+ # Finally, we grab all the valid aggregation method names and attach them to
22794
+ # the Validate class, registering each one appropriately.
22795
+ for method in load_validation_method_grid(): # -> `col_sum_*`, `col_avg_*`, `col_sd_*`, etc.
22796
+ setattr(Validate, method, make_agg_validator(method))
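# Usage sketch (not module code): once the loop above has attached the generated methods,
# they behave like any other validation step. This mirrors the examples embedded in the
# generated docstrings and assumes Polars is installed and that `col_sum_eq` is among the
# names returned by `load_validation_method_grid()`.
import polars as pl

import pointblank as pb

tbl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]})

validation = (
    pb.Validate(data=tbl)
    .col_sum_eq(columns="a", value=15)  # SUM(a) == 15 -> one test unit, which passes
    .interrogate()
)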