pointblank 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -12,9 +12,10 @@ import tempfile
  import threading
  from dataclasses import dataclass
  from enum import Enum
+ from functools import partial
  from importlib.metadata import version
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Callable, Literal
+ from typing import TYPE_CHECKING, Any, Callable, Literal, NoReturn, ParamSpec, TypeVar
  from zipfile import ZipFile

  import commonmark
@@ -23,8 +24,8 @@ from great_tables import GT, from_column, google_font, html, loc, md, style, val
  from great_tables.gt import _get_column_of_values
  from great_tables.vals import fmt_integer, fmt_number
  from importlib_resources import files
- from narwhals.typing import FrameT

+ from pointblank._agg import is_valid_agg, load_validation_method_grid, resolve_agg_registries
  from pointblank._constants import (
      ASSERTION_TYPE_METHOD_MAP,
      CHECK_MARK_SPAN,
@@ -54,6 +55,7 @@ from pointblank._interrogation import (
      SpeciallyValidation,
      col_count_match,
      col_exists,
+     col_pct_null,
      col_schema_match,
      col_vals_expr,
      conjointly_validation,
@@ -90,6 +92,8 @@ from pointblank._utils import (
      _is_lib_present,
      _is_narwhals_table,
      _is_value_a_df,
+     _PBUnresolvedColumn,
+     _resolve_columns,
      _select_df_lib,
  )
  from pointblank._utils_check_args import (
@@ -100,7 +104,14 @@ from pointblank._utils_check_args import (
      _check_thresholds,
  )
  from pointblank._utils_html import _create_table_dims_html, _create_table_type_html
- from pointblank.column import Column, ColumnLiteral, ColumnSelector, ColumnSelectorNarwhals, col
+ from pointblank.column import (
+     Column,
+     ColumnLiteral,
+     ColumnSelector,
+     ColumnSelectorNarwhals,
+     ReferenceColumn,
+     col,
+ )
  from pointblank.schema import Schema, _get_schema_validation_info
  from pointblank.segments import Segment
  from pointblank.thresholds import (
@@ -111,10 +122,18 @@ from pointblank.thresholds import (
      _normalize_thresholds_creation,
  )

+ P = ParamSpec("P")
+ R = TypeVar("R")
+
  if TYPE_CHECKING:
      from collections.abc import Collection
+     from typing import Any
+
+     import polars as pl
+     from narwhals.typing import IntoDataFrame, IntoFrame
+
+     from pointblank._typing import AbsoluteBounds, Tolerance, _CompliantValue, _CompliantValues

-     from pointblank._typing import AbsoluteBounds, Tolerance
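The new `P = ParamSpec("P")` / `R = TypeVar("R")` pair is the standard typing idiom for decorators that preserve a wrapped callable's signature. This diff doesn't show where `P` and `R` are used, so the following is only a generic sketch of the pattern, not the module's actual decorator:

```python
from functools import wraps
from typing import Callable, ParamSpec, TypeVar

P = ParamSpec("P")
R = TypeVar("R")


def passthrough(fn: Callable[P, R]) -> Callable[P, R]:
    # P captures fn's full parameter list and R its return type, so the
    # decorated function keeps its exact signature for type checkers.
    @wraps(fn)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        return fn(*args, **kwargs)

    return wrapper
```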
 
  __all__ = [
      "Validate",
@@ -133,6 +152,7 @@ __all__ = [
      "get_validation_summary",
  ]

+
  # Create a thread-local storage for the metadata
  _action_context = threading.local()

@@ -363,12 +383,16 @@ class PointblankConfig:

      report_incl_header: bool = True
      report_incl_footer: bool = True
+     report_incl_footer_timings: bool = True
+     report_incl_footer_notes: bool = True
      preview_incl_header: bool = True

      def __repr__(self):
          return (
              f"PointblankConfig(report_incl_header={self.report_incl_header}, "
              f"report_incl_footer={self.report_incl_footer}, "
+             f"report_incl_footer_timings={self.report_incl_footer_timings}, "
+             f"report_incl_footer_notes={self.report_incl_footer_notes}, "
              f"preview_incl_header={self.preview_incl_header})"
          )

@@ -380,6 +404,8 @@ global_config = PointblankConfig()
  def config(
      report_incl_header: bool = True,
      report_incl_footer: bool = True,
+     report_incl_footer_timings: bool = True,
+     report_incl_footer_notes: bool = True,
      preview_incl_header: bool = True,
  ) -> PointblankConfig:
      """
@@ -393,7 +419,13 @@ def config(
          threshold levels (if set).
      report_incl_footer
          Should the footer of the validation table report be displayed? The footer contains the
-         starting and ending times of the interrogation.
+         starting and ending times of the interrogation and any notes added to validation steps.
+     report_incl_footer_timings
+         Controls whether the validation timing information (start time, duration, and end time)
+         should be displayed in the footer. Only applies when `report_incl_footer=True`.
+     report_incl_footer_notes
+         Controls whether the notes from validation steps should be displayed in the footer. Only
+         applies when `report_incl_footer=True`.
      preview_incl_header
          Whether the header should be present in any preview table (generated via the
          [`preview()`](`pointblank.preview`) function).
@@ -407,13 +439,16 @@ def config(
      global global_config
      global_config.report_incl_header = report_incl_header  # pragma: no cover
      global_config.report_incl_footer = report_incl_footer  # pragma: no cover
+     global_config.report_incl_footer_timings = report_incl_footer_timings  # pragma: no cover
+     global_config.report_incl_footer_notes = report_incl_footer_notes  # pragma: no cover
      global_config.preview_incl_header = preview_incl_header  # pragma: no cover
+     return global_config  # pragma: no cover


  def load_dataset(
      dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
      tbl_type: Literal["polars", "pandas", "duckdb"] = "polars",
- ) -> FrameT | Any:
+ ) -> Any:
      """
      Load a dataset hosted in the library as specified table type.
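The two new footer flags are finer-grained switches layered under `report_incl_footer`. A short usage sketch, using the `config()` parameters shown in the hunks above:

```python
import pointblank as pb

# Keep the report footer but hide the timing line while retaining any
# step notes. Both flags only take effect when report_incl_footer=True.
pb.config(
    report_incl_footer=True,
    report_incl_footer_timings=False,
    report_incl_footer_notes=True,
)
```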
 
@@ -434,7 +469,7 @@ def load_dataset(

      Returns
      -------
-     FrameT | Any
+     Any
          The dataset for the `Validate` object. This could be a Polars DataFrame, a Pandas DataFrame,
          or a DuckDB table as an Ibis table.

@@ -1507,7 +1542,7 @@ def get_data_path(
          return tmp_file.name


- def _process_data(data: FrameT | Any) -> FrameT | Any:
+ def _process_data(data: Any) -> Any:
      """
      Centralized data processing pipeline that handles all supported input types.

@@ -1524,7 +1559,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:

      Parameters
      ----------
-     data : FrameT | Any
+     data
          The input data which could be:
          - a DataFrame object (Polars, Pandas, Ibis, etc.)
          - a GitHub URL pointing to a CSV or Parquet file
@@ -1535,7 +1570,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:

      Returns
      -------
-     FrameT | Any
+     Any
          Processed data as a DataFrame if input was a supported data source type,
          otherwise the original data unchanged.
      """
@@ -1554,7 +1589,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:
      return data


- def _process_github_url(data: FrameT | Any) -> FrameT | Any:
+ def _process_github_url(data: Any) -> Any:
      """
      Process data parameter to handle GitHub URLs pointing to CSV or Parquet files.

@@ -1569,12 +1604,12 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:

      Parameters
      ----------
-     data : FrameT | Any
+     data
          The data parameter which may be a GitHub URL string or any other data type.

      Returns
      -------
-     FrameT | Any
+     Any
          If the input is a supported GitHub URL, returns a DataFrame loaded from the downloaded file.
          Otherwise, returns the original data unchanged.
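As the `_process_data()` docstring describes, one pipeline now handles DataFrames, GitHub URLs to CSV/Parquet files, connection strings, and local CSV/Parquet paths. Since the diff below shows `Validate.__post_init__` routing its input through `_process_data()`, a hedged sketch of what that enables (the paths and URL are hypothetical placeholders):

```python
import pointblank as pb

# Each input below is normalized by _process_data() before validation;
# a DataFrame object would pass through unchanged.
pb.Validate(data="data/orders.csv")      # hypothetical local CSV path
pb.Validate(data="data/orders.parquet")  # hypothetical local Parquet path
pb.Validate(
    data="https://github.com/user/repo/blob/main/orders.csv"  # hypothetical GitHub URL
)
```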
 
@@ -1659,7 +1694,7 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:
      return data


- def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
+ def _process_connection_string(data: Any) -> Any:
      """
      Process data parameter to handle database connection strings.

@@ -1686,7 +1721,7 @@ def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
      return connect_to_table(data)


- def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
+ def _process_csv_input(data: Any) -> Any:
      """
      Process data parameter to handle CSV file inputs.

@@ -1744,7 +1779,7 @@ def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
      )


- def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
+ def _process_parquet_input(data: Any) -> Any:
      """
      Process data parameter to handle Parquet file inputs.

@@ -1887,7 +1922,7 @@ def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:


  def preview(
-     data: FrameT | Any,
+     data: Any,
      columns_subset: str | list[str] | Column | None = None,
      n_head: int = 5,
      n_tail: int = 5,
@@ -1895,7 +1930,7 @@ def preview(
      show_row_numbers: bool = True,
      max_col_width: int = 250,
      min_tbl_width: int = 500,
-     incl_header: bool = None,
+     incl_header: bool | None = None,
  ) -> GT:
      """
      Display a table preview that shows some rows from the top, some from the bottom.
@@ -2153,7 +2188,7 @@ def preview(


  def _generate_display_table(
-     data: FrameT | Any,
+     data: Any,
      columns_subset: str | list[str] | Column | None = None,
      n_head: int = 5,
      n_tail: int = 5,
@@ -2161,7 +2196,7 @@ def _generate_display_table(
      show_row_numbers: bool = True,
      max_col_width: int = 250,
      min_tbl_width: int = 500,
-     incl_header: bool = None,
+     incl_header: bool | None = None,
      mark_missing_values: bool = True,
      row_number_list: list[int] | None = None,
  ) -> GT:
@@ -2258,7 +2293,8 @@ def _generate_display_table(
          tbl_schema = Schema(tbl=data)

          # Get the row count for the table
-         ibis_rows = data.count()
+         # Note: ibis tables have count(), to_polars(), to_pandas() methods
+         ibis_rows = data.count()  # type: ignore[union-attr]
          n_rows = ibis_rows.to_polars() if df_lib_name_gt == "polars" else int(ibis_rows.to_pandas())

          # If n_head + n_tail is greater than the row count, display the entire table
@@ -2267,11 +2303,11 @@ def _generate_display_table(
              data_subset = data

              if row_number_list is None:
-                 row_number_list = range(1, n_rows + 1)
+                 row_number_list = list(range(1, n_rows + 1))
          else:
              # Get the first n and last n rows of the table
-             data_head = data.head(n_head)
-             data_tail = data.filter(
+             data_head = data.head(n_head)  # type: ignore[union-attr]
+             data_tail = data.filter(  # type: ignore[union-attr]
                  [ibis.row_number() >= (n_rows - n_tail), ibis.row_number() <= n_rows]
              )
              data_subset = data_head.union(data_tail)
@@ -2283,9 +2319,9 @@ def _generate_display_table(

          # Convert either to Polars or Pandas depending on the available library
          if df_lib_name_gt == "polars":
-             data = data_subset.to_polars()
+             data = data_subset.to_polars()  # type: ignore[union-attr]
          else:
-             data = data_subset.to_pandas()
+             data = data_subset.to_pandas()  # type: ignore[union-attr]

      # From a DataFrame:
      # - get the row count
@@ -2296,17 +2332,18 @@ def _generate_display_table(
      tbl_schema = Schema(tbl=data)

      if tbl_type == "polars":
-         n_rows = int(data.height)
+         # Note: polars DataFrames have height, head(), tail() attributes
+         n_rows = int(data.height)  # type: ignore[union-attr]

          # If n_head + n_tail is greater than the row count, display the entire table
          if n_head + n_tail >= n_rows:
              full_dataset = True

              if row_number_list is None:
-                 row_number_list = range(1, n_rows + 1)
+                 row_number_list = list(range(1, n_rows + 1))

          else:
-             data = pl.concat([data.head(n=n_head), data.tail(n=n_tail)])
+             data = pl.concat([data.head(n=n_head), data.tail(n=n_tail)])  # type: ignore[union-attr]

              if row_number_list is None:
                  row_number_list = list(range(1, n_head + 1)) + list(
@@ -2314,40 +2351,42 @@ def _generate_display_table(
                  )

      if tbl_type == "pandas":
-         n_rows = data.shape[0]
+         # Note: pandas DataFrames have shape, head(), tail() attributes
+         n_rows = data.shape[0]  # type: ignore[union-attr]

          # If n_head + n_tail is greater than the row count, display the entire table
          if n_head + n_tail >= n_rows:
              full_dataset = True
              data_subset = data

-             row_number_list = range(1, n_rows + 1)
+             row_number_list = list(range(1, n_rows + 1))
          else:
-             data = pd.concat([data.head(n=n_head), data.tail(n=n_tail)])
+             data = pd.concat([data.head(n=n_head), data.tail(n=n_tail)])  # type: ignore[union-attr]

              row_number_list = list(range(1, n_head + 1)) + list(
                  range(n_rows - n_tail + 1, n_rows + 1)
              )

      if tbl_type == "pyspark":
-         n_rows = data.count()
+         # Note: pyspark DataFrames have count(), toPandas(), limit(), tail(), sparkSession
+         n_rows = data.count()  # type: ignore[union-attr]

          # If n_head + n_tail is greater than the row count, display the entire table
          if n_head + n_tail >= n_rows:
              full_dataset = True
              # Convert to pandas for Great Tables compatibility
-             data = data.toPandas()
+             data = data.toPandas()  # type: ignore[union-attr]

-             row_number_list = range(1, n_rows + 1)
+             row_number_list = list(range(1, n_rows + 1))
          else:
              # Get head and tail samples, then convert to pandas
-             head_data = data.limit(n_head).toPandas()
+             head_data = data.limit(n_head).toPandas()  # type: ignore[union-attr]

              # PySpark tail() returns a list of Row objects, need to convert to DataFrame
-             tail_rows = data.tail(n_tail)
+             tail_rows = data.tail(n_tail)  # type: ignore[union-attr]
              if tail_rows:
                  # Convert list of Row objects back to DataFrame, then to pandas
-                 tail_df = data.sparkSession.createDataFrame(tail_rows, data.schema)
+                 tail_df = data.sparkSession.createDataFrame(tail_rows, data.schema)  # type: ignore[union-attr]
                  tail_data = tail_df.toPandas()
              else:
                  # If no tail data, create empty DataFrame with same schema
@@ -2375,14 +2414,14 @@ def _generate_display_table(
      tbl_schema = Schema(tbl=data)

      # From the table schema, get a list of tuples containing column names and data types
-     col_dtype_dict = tbl_schema.columns
+     col_dtype_list = tbl_schema.columns or []

      # Extract the column names from the list of tuples (first element of each tuple)
-     col_names = [col[0] for col in col_dtype_dict]
+     col_names = [col[0] for col in col_dtype_list]

      # Iterate over the list of tuples and create a new dictionary with the
      # column names and data types
-     col_dtype_dict = {k: v for k, v in col_dtype_dict}
+     col_dtype_dict = {k: v for k, v in col_dtype_list}

      # Create short versions of the data types by omitting any text in parentheses
      col_dtype_dict_short = {
@@ -2481,21 +2520,21 @@ def _generate_display_table(
      # Prepend a column that contains the row numbers if `show_row_numbers=True`
      if show_row_numbers or has_leading_row_num_col:
          if has_leading_row_num_col:
-             row_number_list = data["_row_num_"].to_list()
+             row_number_list = data["_row_num_"].to_list()  # type: ignore[union-attr]

          else:
              if df_lib_name_gt == "polars":
                  import polars as pl

                  row_number_series = pl.Series("_row_num_", row_number_list)
-                 data = data.insert_column(0, row_number_series)
+                 data = data.insert_column(0, row_number_series)  # type: ignore[union-attr]

              if df_lib_name_gt == "pandas":
-                 data.insert(0, "_row_num_", row_number_list)
+                 data.insert(0, "_row_num_", row_number_list)  # type: ignore[union-attr]

              if df_lib_name_gt == "pyspark":
                  # For PySpark converted to pandas, use pandas method
-                 data.insert(0, "_row_num_", row_number_list)
+                 data.insert(0, "_row_num_", row_number_list)  # type: ignore[union-attr]

      # Get the highest number in the `row_number_list` and calculate a width that will
      # safely fit a number of that magnitude
@@ -2604,7 +2643,7 @@ def _generate_display_table(
      return gt_tbl


- def missing_vals_tbl(data: FrameT | Any) -> GT:
+ def missing_vals_tbl(data: Any) -> GT:
      """
      Display a table that shows the missing values in the input table.

@@ -3205,7 +3244,7 @@ def _get_column_names_safe(data: Any) -> list[str]:
          return list(data.columns)  # pragma: no cover


- def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
+ def _get_column_names(data: Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
      if ibis_tbl:
          return data.columns if df_lib_name_gt == "polars" else list(data.columns)

@@ -3229,12 +3268,10 @@ def _validate_columns_subset(
          )
          return columns_subset

-     return columns_subset.resolve(columns=col_names)
+     return columns_subset.resolve(columns=col_names)  # type: ignore[union-attr]


- def _select_columns(
-     data: FrameT | Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str
- ) -> FrameT | Any:
+ def _select_columns(data: Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str) -> Any:
      if ibis_tbl:
          return data[resolved_columns]
      if tbl_type == "polars":
@@ -3242,7 +3279,7 @@ def _select_columns(
          return data[resolved_columns]


- def get_column_count(data: FrameT | Any) -> int:
+ def get_column_count(data: Any) -> int:
      """
      Get the number of columns in a table.

@@ -3454,7 +3491,7 @@ def _extract_enum_values(set_values: Any) -> list[Any]:
      return [set_values]


- def get_row_count(data: FrameT | Any) -> int:
+ def get_row_count(data: Any) -> int:
      """
      Get the number of rows in a table.

@@ -3707,18 +3744,46 @@ class _ValidationInfo:
      insertion order, ensuring notes appear in a consistent sequence in reports and logs.
      """

+     @classmethod
+     def from_agg_validator(
+         cls,
+         assertion_type: str,
+         columns: _PBUnresolvedColumn,
+         value: float | Column | ReferenceColumn,
+         tol: Tolerance = 0,
+         thresholds: float | bool | tuple | dict | Thresholds | None = None,
+         brief: str | bool = False,
+         actions: Actions | None = None,
+         active: bool = True,
+     ) -> _ValidationInfo:
+         # This factory method creates a `_ValidationInfo` instance for aggregate
+         # methods. Because all agg methods share the same signature, routing them
+         # through this factory instead of instantiating the class directly each
+         # time reduces redundancy, boilerplate, and mistakes :)
+         _check_thresholds(thresholds=thresholds)
+
+         return cls(
+             assertion_type=assertion_type,
+             column=_resolve_columns(columns),
+             values={"value": value, "tol": tol},
+             thresholds=_normalize_thresholds_creation(thresholds),
+             brief=_transform_auto_brief(brief=brief),
+             actions=actions,
+             active=active,
+         )
+
      # Validation plan
      i: int | None = None
      i_o: int | None = None
      step_id: str | None = None
      sha1: str | None = None
      assertion_type: str | None = None
-     column: any | None = None
-     values: any | list[any] | tuple | None = None
+     column: Any | None = None
+     values: Any | list[Any] | tuple | None = None
      inclusive: tuple[bool, bool] | None = None
      na_pass: bool | None = None
      pre: Callable | None = None
-     segments: any | None = None
+     segments: Any | None = None
      thresholds: Thresholds | None = None
      actions: Actions | None = None
      label: str | None = None
@@ -3737,14 +3802,14 @@ class _ValidationInfo:
      error: bool | None = None
      critical: bool | None = None
      failure_text: str | None = None
-     tbl_checked: FrameT | None = None
-     extract: FrameT | None = None
-     val_info: dict[str, any] | None = None
+     tbl_checked: Any = None
+     extract: Any = None
+     val_info: dict[str, Any] | None = None
      time_processed: str | None = None
      proc_duration_s: float | None = None
      notes: dict[str, dict[str, str]] | None = None

-     def get_val_info(self) -> dict[str, any]:
+     def get_val_info(self) -> dict[str, Any] | None:
          return self.val_info

      def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
@@ -3920,7 +3985,7 @@ class _ValidationInfo:
          return self.notes is not None and len(self.notes) > 0


- def _handle_connection_errors(e: Exception, connection_string: str) -> None:
+ def _handle_connection_errors(e: Exception, connection_string: str) -> NoReturn:
      """
      Shared error handling for database connection failures.
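Taken together with `Validate._add_agg_validation()` defined later in this diff, the factory lets each public aggregate method collapse to a thin delegation. A hypothetical sketch of that shape (the real 0.18.0 methods also carry full docstrings, and presumably interact with the `pointblank._agg` helpers imported at the top of this diff):

```python
# Hypothetical sketch: an aggregate validation method delegating to the
# shared plumbing; parameter names mirror _add_agg_validation().
def col_sum_eq(self, columns, value=None, tol=0, thresholds=None,
               brief=False, actions=None, active=True):
    return self._add_agg_validation(
        assertion_type="col_sum_eq",
        columns=columns,
        value=value,  # None -> inferred ReferenceColumn when reference data is set
        tol=tol,
        thresholds=thresholds,
        brief=brief,
        actions=actions,
        active=active,
    )
```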
 
@@ -4761,7 +4826,8 @@ class Validate:
      when table specifications are missing or backend dependencies are not installed.
      """

-     data: FrameT | Any
+     data: IntoDataFrame
+     reference: IntoFrame | None = None
      tbl_name: str | None = None
      label: str | None = None
      thresholds: int | float | bool | tuple | dict | Thresholds | None = None
@@ -4775,6 +4841,10 @@ class Validate:
          # Process data through the centralized data processing pipeline
          self.data = _process_data(self.data)

+         # Process reference data if provided
+         if self.reference is not None:
+             self.reference = _process_data(self.reference)
+
          # Check input of the `thresholds=` argument
          _check_thresholds(thresholds=self.thresholds)

@@ -4819,9 +4889,107 @@ class Validate:

          self.validation_info = []

+     def _add_agg_validation(
+         self,
+         *,
+         assertion_type: str,
+         columns: str | Collection[str],
+         value,
+         tol=0,
+         thresholds=None,
+         brief=False,
+         actions=None,
+         active=True,
+     ):
+         """
+         Add an aggregation-based validation step to the validation plan.
+
+         This internal method is used by all aggregation-based column validation methods
+         (e.g., `col_sum_eq`, `col_avg_gt`, `col_sd_le`) to create and register validation
+         steps. It relies heavily on the `_ValidationInfo.from_agg_validator()` class method.
+
+         Automatic Reference Inference
+         -----------------------------
+         When `value` is None and reference data has been set on the Validate object,
+         this method automatically creates a `ReferenceColumn` pointing to the same
+         column name in the reference data. This enables a convenient shorthand:
+
+         .. code-block:: python
+
+             # Instead of writing:
+             Validate(data=df, reference=ref_df).col_sum_eq("a", ref("a"))
+
+             # You can simply write:
+             Validate(data=df, reference=ref_df).col_sum_eq("a")
+
+         If `value` is None and no reference data is set, a `ValueError` is raised
+         immediately to provide clear feedback to the user.
+
+         Parameters
+         ----------
+         assertion_type
+             The type of assertion (e.g., "col_sum_eq", "col_avg_gt").
+         columns
+             Column name or collection of column names to validate.
+         value
+             The target value to compare against. Can be:
+             - A numeric literal (int or float)
+             - A `Column` object for cross-column comparison
+             - A `ReferenceColumn` object for reference data comparison
+             - None to automatically use `ref(column)` when reference data is set
+         tol
+             Tolerance for the comparison. Defaults to 0.
+         thresholds
+             Custom thresholds for the validation step.
+         brief
+             Brief description or auto-generate flag.
+         actions
+             Actions to take based on validation results.
+         active
+             Whether this validation step is active.
+
+         Returns
+         -------
+         Validate
+             The Validate instance for method chaining.
+
+         Raises
+         ------
+         ValueError
+             If `value` is None and no reference data is set on the Validate object.
+         """
+         if isinstance(columns, str):
+             columns = [columns]
+         for column in columns:
+             # If value is None, default to referencing the same column from reference data
+             resolved_value = value
+             if value is None:
+                 if self.reference is None:
+                     raise ValueError(
+                         f"The 'value' parameter is required for {assertion_type}() "
+                         "when no reference data is set. Either provide a value, or "
+                         "set reference data on the Validate object using "
+                         "Validate(data=..., reference=...)."
+                     )
+                 resolved_value = ReferenceColumn(column_name=column)
+
+             val_info = _ValidationInfo.from_agg_validator(
+                 assertion_type=assertion_type,
+                 columns=column,
+                 value=resolved_value,
+                 tol=tol,
+                 thresholds=self.thresholds if thresholds is None else thresholds,
+                 actions=self.actions if actions is None else actions,
+                 brief=self.brief if brief is None else brief,
+                 active=active,
+             )
+             self._add_validation(validation_info=val_info)
+
+         return self
+
      def set_tbl(
          self,
-         tbl: FrameT | Any,
+         tbl: Any,
          tbl_name: str | None = None,
          label: str | None = None,
      ) -> Validate:
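A compact usage sketch of the reference shorthand documented above (small Polars frames assumed; `col_sum_eq()` is one of the aggregate methods named in the docstring):

```python
import polars as pl
import pointblank as pb

df = pl.DataFrame({"a": [1, 2, 3]})
ref_df = pl.DataFrame({"a": [3, 2, 1]})

# With reference data set, omitting value= makes _add_agg_validation()
# infer ReferenceColumn("a"): sum of `a` in df vs. sum of `a` in ref_df.
validation = (
    pb.Validate(data=df, reference=ref_df)
    .col_sum_eq("a")  # shorthand for .col_sum_eq("a", ref("a"))
    .interrogate()
)
```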
@@ -4964,7 +5132,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -5198,7 +5366,6 @@ class Validate:
          - Row 1: `c` is `1` and `b` is `2`.
          - Row 3: `c` is `2` and `b` is `2`.
          """
-
          assertion_type = _get_fn_name()

          _check_column(column=columns)
@@ -5218,14 +5385,7 @@ class Validate:
              self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
          )

-         # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
-         # resolve the columns
-         if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
-             columns = col(columns)
-
-         # If `columns` is Column value or a string, place it in a list for iteration
-         if isinstance(columns, (Column, str)):
-             columns = [columns]
+         columns = _resolve_columns(columns)

          # Determine brief to use (global or local) and transform any shorthands of `brief=`
          brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
@@ -5256,7 +5416,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -5547,7 +5707,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -5838,7 +5998,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -6127,7 +6287,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -6419,7 +6579,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -6713,7 +6873,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -7033,7 +7193,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -7350,7 +7510,7 @@ class Validate:
          set: Collection[Any],
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -7667,7 +7827,7 @@ class Validate:
          set: Collection[Any],
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -7958,7 +8118,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -8146,7 +8306,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -8331,7 +8491,7 @@ class Validate:
          columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -8574,7 +8734,7 @@ class Validate:
          columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -8820,7 +8980,7 @@ class Validate:
          inverse: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -9083,7 +9243,7 @@ class Validate:
          na_pass: bool = False,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -9363,10 +9523,10 @@ class Validate:

      def col_vals_expr(
          self,
-         expr: any,
+         expr: Any,
          pre: Callable | None = None,
          segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -9584,7 +9744,7 @@ class Validate:
      def col_exists(
          self,
          columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
@@ -9755,40 +9915,41 @@ class Validate:

          return self

-     def rows_distinct(
+     def col_pct_null(
          self,
-         columns_subset: str | list[str] | None = None,
-         pre: Callable | None = None,
-         segments: SegmentSpec | None = None,
-         thresholds: int | float | bool | tuple | dict | Thresholds = None,
+         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+         p: float,
+         tol: Tolerance = 0,
+         thresholds: int | float | None | bool | tuple | dict | Thresholds = None,
          actions: Actions | None = None,
          brief: str | bool | None = None,
          active: bool = True,
      ) -> Validate:
          """
-         Validate whether rows in the table are distinct.
+         Validate whether a column has a specific percentage of Null values.

-         The `rows_distinct()` method checks whether rows in the table are distinct. This validation
-         will operate over the number of test units that is equal to the number of rows in the table
-         (determined after any `pre=` mutation has been applied).
+         The `col_pct_null()` validation method checks whether the percentage of Null values in a
+         column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
+         validation operates at the column level, generating a single validation step per column that
+         passes or fails based on whether the actual percentage of Null values falls within the
+         acceptable range defined by `p ± tol`.

          Parameters
          ----------
-         columns_subset
-             A single column or a list of columns to use as a subset for the distinct comparison.
-             If `None`, then all columns in the table will be used for the comparison. If multiple
-             columns are supplied, the distinct comparison will be made over the combination of
-             values in those columns.
-         pre
-             An optional preprocessing function or lambda to apply to the data table during
-             interrogation. This function should take a table as input and return a modified table.
-             Have a look at the *Preprocessing* section for more information on how to use this
-             argument.
-         segments
-             An optional directive on segmentation, which serves to split a validation step into
-             multiple (one step per segment). Can be a single column name, a tuple that specifies a
-             column name and its corresponding values to segment on, or a combination of both
-             (provided as a list). Read the *Segmentation* section for usage information.
+         columns
+             A single column or a list of columns to validate. Can also use
+             [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+             multiple columns are supplied or resolved, there will be a separate validation step
+             generated for each column.
+         p
+             The expected percentage of Null values in the column, expressed as a decimal between
+             `0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
+         tol
+             The tolerance allowed when comparing the actual percentage of Null values to the
+             expected percentage `p=`. The validation passes if the actual percentage falls within
+             the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
+             the *Tolerance* section for details on all supported formats (absolute, relative,
+             symmetric, and asymmetric bounds).
          thresholds
              Set threshold failure levels for reporting and reacting to exceedences of the levels.
              The thresholds are set at the step level and will override any global thresholds set in
@@ -9796,7 +9957,7 @@ class Validate:
              be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
              section for information on how to set threshold levels.
          actions
-             Optional actions to take when the validation step meets or exceeds any set threshold
+             Optional actions to take when the validation step(s) meets or exceeds any set threshold
              levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
              define the actions.
          brief
@@ -9815,60 +9976,30 @@ class Validate:
          Validate
              The `Validate` object with the added validation step.

-         Preprocessing
-         -------------
-         The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
-         table during interrogation. This function should take a table as input and return a modified
-         table. This is useful for performing any necessary transformations or filtering on the data
-         before the validation step is applied.
-
-         The preprocessing function can be any callable that takes a table as input and returns a
-         modified table. For example, you could use a lambda function to filter the table based on
-         certain criteria or to apply a transformation to the data. Note that you can refer to
-         columns via `columns_subset=` that are expected to be present in the transformed table, but
-         may not exist in the table before preprocessing. Regarding the lifetime of the transformed
-         table, it only exists during the validation step and is not stored in the `Validate` object
-         or used in subsequent validation steps.
-
-         Segmentation
-         ------------
-         The `segments=` argument allows for the segmentation of a validation step into multiple
-         segments. This is useful for applying the same validation step to different subsets of the
-         data. The segmentation can be done based on a single column or specific fields within a
-         column.
-
-         Providing a single column name will result in a separate validation step for each unique
-         value in that column. For example, if you have a column called `"region"` with values
-         `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
-         region.
-
-         Alternatively, you can provide a tuple that specifies a column name and its corresponding
-         values to segment on. For example, if you have a column called `"date"` and you want to
-         segment on only specific dates, you can provide a tuple like
-         `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
-         (i.e., no validation steps will be created for them).
+         Tolerance
+         ---------
+         The `tol=` parameter accepts several different formats to specify the acceptable deviation
+         from the expected percentage `p=`. The tolerance can be expressed as:

-         A list with a combination of column names and tuples can be provided as well. This allows
-         for more complex segmentation scenarios. The following inputs are both valid:
+         1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
+            For example, `tol=2` means the actual count can differ from the expected count by up to 2
+            units in either direction.

-         ```
-         # Segments from all unique values in the `region` column
-         # and specific dates in the `date` column
-         segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+         2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
+            count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
+            45 to 55 (50 ± 10% of 50 = 50 ± 5).

-         # Segments from all unique values in the `region` and `date` columns
-         segments=["region", "date"]
-         ```
+         3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
+            bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
+            1 unit below or 3 units above the expected count.

-         The segmentation is performed during interrogation, and the resulting validation steps will
-         be numbered sequentially. Each segment will have its own validation step, and the results
-         will be reported separately. This allows for a more granular analysis of the data and helps
-         identify issues within specific segments.
+         4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
+            and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
+            lower bound is 5% below and the upper bound is 15% above the expected count.

-         Importantly, the segmentation process will be performed after any preprocessing of the data
-         table. Because of this, one can conceivably use the `pre=` argument to generate a column
-         that can be used for segmentation. For example, you could create a new column called
-         `"segment"` through use of `pre=` and then use that column for segmentation.
+         When using a single value (integer or float), the tolerance is applied symmetrically in both
+         directions. When using a tuple, you can specify asymmetric tolerances where the lower and
+         upper bounds differ.

          Thresholds
          ----------
@@ -9906,8 +10037,8 @@ class Validate:
          import pointblank as pb
          pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
          ```
-         For the examples here, we'll use a simple Polars DataFrame with three string columns
-         (`col_1`, `col_2`, and `col_3`). The table is shown below:
+         For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
+         and `c`) that have different percentages of Null values. The table is shown below:

          ```{python}
          import pointblank as pb
@@ -9915,56 +10046,133 @@ class Validate:

          tbl = pl.DataFrame(
              {
-                 "col_1": ["a", "b", "c", "d"],
-                 "col_2": ["a", "a", "c", "d"],
-                 "col_3": ["a", "a", "d", "e"],
+                 "a": [1, 2, 3, 4, 5, 6, 7, 8],
+                 "b": [1, None, 3, None, 5, None, 7, None],
+                 "c": [None, None, None, None, None, None, 1, 2],
              }
          )

          pb.preview(tbl)
          ```

-         Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll
-         determine if this validation had any failing test units (there are four test units, one for
-         each row). A failing test units means that a given row is not distinct from every other row.
+         Let's validate that column `a` has 0% Null values (i.e., no Null values at all).

          ```{python}
          validation = (
              pb.Validate(data=tbl)
-             .rows_distinct()
+             .col_pct_null(columns="a", p=0.0)
              .interrogate()
          )

          validation
          ```

-         From this validation table we see that there are no failing test units. All rows in the
-         table are distinct from one another.
+         Printing the `validation` object shows the validation table in an HTML viewing environment.
+         The validation table shows the single entry that corresponds to the validation step created
+         by using `col_pct_null()`. The validation passed since column `a` has no Null values.

-         We can also use a subset of columns to determine distinctness. Let's specify the subset
-         using columns `col_2` and `col_3` for the next validation.
+         Now, let's check that column `b` has exactly 50% Null values.

          ```{python}
          validation = (
              pb.Validate(data=tbl)
-             .rows_distinct(columns_subset=["col_2", "col_3"])
+             .col_pct_null(columns="b", p=0.5)
              .interrogate()
          )

          validation
          ```

-         The validation table reports two failing test units. The first and second rows are
-         duplicated when considering only the values in columns `col_2` and `col_3`. There's only
-         one set of duplicates but there are two failing test units since each row is compared to all
-         others.
-         """
+         This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
+
+         Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
+         we'll check if it's approximately 70% Null with a tolerance of 10%.
+
+         ```{python}
+         validation = (
+             pb.Validate(data=tbl)
+             .col_pct_null(columns="c", p=0.70, tol=0.10)
+             .interrogate()
+         )
+
+         validation
+         ```
+
+         This validation passes because the actual percentage (75%) falls within the acceptable
+         range of 60% to 80% (70% ± 10%).
+
+         The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
+         different ways to specify tolerance using column `b`, which has exactly 50% Null values
+         (4 out of 8 values).
+
+         *Using an absolute tolerance (integer)*: Specify the exact number of rows that can
+         deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
+
+         ```{python}
+         validation = (
+             pb.Validate(data=tbl)
+             .col_pct_null(columns="b", p=0.375, tol=1)  # Expect 3 nulls, allow ±1 (range: 2-4)
+             .interrogate()
+         )
+
+         validation
+         ```
+
+         This passes because column `b` has 4 Null values, which falls within the acceptable range
+         of 2 to 4 (3 ± 1).
+
+         *Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
+         expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
+
+         ```{python}
+         validation = (
+             pb.Validate(data=tbl)
+             .col_pct_null(columns="b", p=0.375, tol=0.25)  # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
+             .interrogate()
+         )
+
+         validation
+         ```
+
+         This passes because 4 Null values fall within the acceptable range (3 ± 0.75 calculates
+         to 2.25 to 3.75, which rounds down to 2 to 3 rows).
+
+         *Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
+         upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
+         to 2 rows above the expected count.
+
+         ```{python}
+         validation = (
+             pb.Validate(data=tbl)
+             .col_pct_null(columns="b", p=0.25, tol=(0, 2))  # Expect 2 Nulls, allow -0/+2 (range: 2-4)
+             .interrogate()
+         )
+
+         validation
+         ```
+
+         This passes because 4 Null values fall within the acceptable range of 2 to 4.
+
+         *Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
+         bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
+         expected count.
+
+         ```{python}
+         validation = (
+             pb.Validate(data=tbl)
+             .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3))  # Expect 3 Nulls, allow -10%/+30%
+             .interrogate()
+         )
+
+         validation
+         ```

+         This passes because 4 Null values fall within the acceptable range (3 - 0.3 to 3 + 0.9
+         calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
+         """
          assertion_type = _get_fn_name()

-         _check_pre(pre=pre)
-         # TODO: add check for segments
-         # _check_segments(segments=segments)
+         _check_column(column=columns)
          _check_thresholds(thresholds=thresholds)
          _check_boolean_input(param=active, param_name="active")

@@ -9973,26 +10181,274 @@ class Validate:
              self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
          )

-         if columns_subset is not None and isinstance(columns_subset, str):
-             columns_subset = [columns_subset]
+         # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+         # resolve the columns
+         if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+             columns = col(columns)

-         # TODO: incorporate Column object
+         # If `columns` is Column value or a string, place it in a list for iteration
+         if isinstance(columns, (Column, str)):
+             columns = [columns]

          # Determine brief to use (global or local) and transform any shorthands of `brief=`
          brief = self.brief if brief is None else _transform_auto_brief(brief=brief)

-         val_info = _ValidationInfo(
-             assertion_type=assertion_type,
-             column=columns_subset,
-             pre=pre,
-             segments=segments,
-             thresholds=thresholds,
-             actions=actions,
-             brief=brief,
-             active=active,
-         )
-
-         self._add_validation(validation_info=val_info)
+         bound_finder: Callable[[int], AbsoluteBounds] = partial(_derive_bounds, tol=tol)
+
+         # Iterate over the columns and create a validation step for each
+         for column in columns:
+             val_info = _ValidationInfo(
+                 assertion_type=assertion_type,
+                 column=column,
+                 values={"p": p, "bound_finder": bound_finder},
+                 thresholds=thresholds,
+                 actions=actions,
+                 brief=brief,
+                 active=active,
+             )
+
+             self._add_validation(validation_info=val_info)
+
+         return self
+
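The `partial(_derive_bounds, tol=tol)` capture above is what makes the tolerance lazy: the expected Null count isn't known until interrogation, so the step stores a one-argument callable (`Callable[[int], AbsoluteBounds]`) rather than concrete bounds. A rough sketch of the idea; `_derive_bounds()` itself is not shown in this diff, so its body below is an assumption:

```python
from functools import partial

# Assumed shape of _derive_bounds(); the real implementation and its exact
# rounding behavior live outside this diff.
def _derive_bounds(expected: int, tol) -> tuple[int, int]:
    lo, hi = tol if isinstance(tol, tuple) else (tol, tol)
    # Floats act as relative deviations, ints as absolute unit counts
    lo = lo * expected if isinstance(lo, float) else lo
    hi = hi * expected if isinstance(hi, float) else hi
    return int(expected - lo), int(expected + hi)

bound_finder = partial(_derive_bounds, tol=1)  # tolerance frozen at plan time
print(bound_finder(3))  # expected count supplied at interrogation -> (2, 4)
```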
+     def rows_distinct(
+         self,
+         columns_subset: str | list[str] | None = None,
+         pre: Callable | None = None,
+         segments: SegmentSpec | None = None,
+         thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
+         actions: Actions | None = None,
+         brief: str | bool | None = None,
+         active: bool = True,
+     ) -> Validate:
+         """
+         Validate whether rows in the table are distinct.
+
+         The `rows_distinct()` method checks whether rows in the table are distinct. This validation
+         will operate over the number of test units that is equal to the number of rows in the table
+         (determined after any `pre=` mutation has been applied).
+
+         Parameters
+         ----------
+         columns_subset
+             A single column or a list of columns to use as a subset for the distinct comparison.
+             If `None`, then all columns in the table will be used for the comparison. If multiple
+             columns are supplied, the distinct comparison will be made over the combination of
+             values in those columns.
+         pre
+             An optional preprocessing function or lambda to apply to the data table during
+             interrogation. This function should take a table as input and return a modified table.
+             Have a look at the *Preprocessing* section for more information on how to use this
+             argument.
+         segments
+             An optional directive on segmentation, which serves to split a validation step into
+             multiple (one step per segment). Can be a single column name, a tuple that specifies a
+             column name and its corresponding values to segment on, or a combination of both
+             (provided as a list). Read the *Segmentation* section for usage information.
+         thresholds
+             Set threshold failure levels for reporting and reacting to exceedences of the levels.
+             The thresholds are set at the step level and will override any global thresholds set in
+             `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+             be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+             section for information on how to set threshold levels.
+         actions
+             Optional actions to take when the validation step meets or exceeds any set threshold
+             levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+             define the actions.
+         brief
+             An optional brief description of the validation step that will be displayed in the
+             reporting table. You can use the templating elements like `"{step}"` to insert
+             the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+             the entire brief will be automatically generated. If `None` (the default) then there
+             won't be a brief.
+         active
+             A boolean value indicating whether the validation step should be active. Using `False`
+             will make the validation step inactive (still reporting its presence and keeping indexes
+             for the steps unchanged).
+
+         Returns
+         -------
+         Validate
+             The `Validate` object with the added validation step.
+
+         Preprocessing
+         -------------
+         The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+         table during interrogation. This function should take a table as input and return a modified
+         table. This is useful for performing any necessary transformations or filtering on the data
+         before the validation step is applied.
+
+         The preprocessing function can be any callable that takes a table as input and returns a
+         modified table. For example, you could use a lambda function to filter the table based on
+         certain criteria or to apply a transformation to the data. Note that you can refer to
+         columns via `columns_subset=` that are expected to be present in the transformed table, but
+         may not exist in the table before preprocessing. Regarding the lifetime of the transformed
+         table, it only exists during the validation step and is not stored in the `Validate` object
+         or used in subsequent validation steps.
+
+         Segmentation
+         ------------
+         The `segments=` argument allows for the segmentation of a validation step into multiple
+         segments. This is useful for applying the same validation step to different subsets of the
+         data. The segmentation can be done based on a single column or specific fields within a
+         column.
+
+         Providing a single column name will result in a separate validation step for each unique
+         value in that column. For example, if you have a column called `"region"` with values
+         `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+         region.
+
+         Alternatively, you can provide a tuple that specifies a column name and its corresponding
+         values to segment on. For example, if you have a column called `"date"` and you want to
+         segment on only specific dates, you can provide a tuple like
+         `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+         (i.e., no validation steps will be created for them).
+
+         A list with a combination of column names and tuples can be provided as well. This allows
+         for more complex segmentation scenarios. The following inputs are both valid:
+
+         ```
+         # Segments from all unique values in the `region` column
+         # and specific dates in the `date` column
+         segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+         # Segments from all unique values in the `region` and `date` columns
10316
+ segments=["region", "date"]
10317
+ ```
10318
+
10319
+ The segmentation is performed during interrogation, and the resulting validation steps will
10320
+ be numbered sequentially. Each segment will have its own validation step, and the results
10321
+ will be reported separately. This allows for a more granular analysis of the data and helps
10322
+ identify issues within specific segments.
10323
+
10324
+ Importantly, the segmentation process will be performed after any preprocessing of the data
10325
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
10326
+ that can be used for segmentation. For example, you could create a new column called
10327
+ `"segment"` through use of `pre=` and then use that column for segmentation.
10328
+
10329
+ Thresholds
10330
+ ----------
10331
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
10332
+ step. If they are set here at the step level, these thresholds will override any thresholds
10333
+ set at the global level in `Validate(thresholds=...)`.
10334
+
10335
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
10336
+ can either be set as a proportion of all test units that fail (a value between `0` and `1`),
10337
+ or as the absolute number of failing test units (an integer that's `1` or greater).
10338
+
10339
+ Thresholds can be defined using one of these input schemes:
10340
+
10341
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
10342
+ thresholds)
10343
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
10344
+ the 'error' level, and position `2` is the 'critical' level
10345
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
10346
+ 'critical'
10347
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
10348
+ for the 'warning' level only
10349
+
10350
+ If the number of failing test units exceeds set thresholds, the validation step will be
10351
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
10352
+ set; you're free to set any combination of them.
10353
+
10354
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
10355
+ take for each level of failure (using the `actions=` parameter).
10356
+
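For reference, the four input schemes listed above correspond to calls like the following sketch (threshold values are illustrative, and the `Thresholds` keyword names are assumed to match the level names used here):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"x": [1, 2, 3]})

pb.Validate(data=tbl, thresholds=pb.Thresholds(warning=0.1, error=0.25, critical=0.35))  # 1
pb.Validate(data=tbl, thresholds=(0.1, 0.25, 0.35))                                      # 2
pb.Validate(data=tbl, thresholds={"warning": 0.1, "critical": 0.35})                     # 3
pb.Validate(data=tbl, thresholds=0.1)                                                    # 4
```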
10357
+ Examples
10358
+ --------
10359
+ ```{python}
10360
+ #| echo: false
10361
+ #| output: false
10362
+ import pointblank as pb
10363
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
10364
+ ```
10365
+ For the examples here, we'll use a simple Polars DataFrame with three string columns
10366
+ (`col_1`, `col_2`, and `col_3`). The table is shown below:
10367
+
10368
+ ```{python}
10369
+ import pointblank as pb
10370
+ import polars as pl
10371
+
10372
+ tbl = pl.DataFrame(
10373
+ {
10374
+ "col_1": ["a", "b", "c", "d"],
10375
+ "col_2": ["a", "a", "c", "d"],
10376
+ "col_3": ["a", "a", "d", "e"],
10377
+ }
10378
+ )
10379
+
10380
+ pb.preview(tbl)
10381
+ ```
10382
+
10383
+ Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll
10384
+ determine if this validation had any failing test units (there are four test units, one for
10385
+ each row). A failing test unit means that a given row is not distinct from every other row.
10386
+
10387
+ ```{python}
10388
+ validation = (
10389
+ pb.Validate(data=tbl)
10390
+ .rows_distinct()
10391
+ .interrogate()
10392
+ )
10393
+
10394
+ validation
10395
+ ```
10396
+
10397
+ From this validation table we see that there are no failing test units. All rows in the
10398
+ table are distinct from one another.
10399
+
10400
+ We can also use a subset of columns to determine distinctness. Let's specify the subset
10401
+ using columns `col_2` and `col_3` for the next validation.
10402
+
10403
+ ```{python}
10404
+ validation = (
10405
+ pb.Validate(data=tbl)
10406
+ .rows_distinct(columns_subset=["col_2", "col_3"])
10407
+ .interrogate()
10408
+ )
10409
+
10410
+ validation
10411
+ ```
10412
+
10413
+ The validation table reports two failing test units. The first and second rows are
10414
+ duplicated when considering only the values in columns `col_2` and `col_3`. There's only
10415
+ one set of duplicates, but there are two failing test units since each row is compared to all
10416
+ others.
10417
+ """
10418
+
10419
+ assertion_type = _get_fn_name()
10420
+
10421
+ _check_pre(pre=pre)
10422
+ # TODO: add check for segments
10423
+ # _check_segments(segments=segments)
10424
+ _check_thresholds(thresholds=thresholds)
10425
+ _check_boolean_input(param=active, param_name="active")
10426
+
10427
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10428
+ thresholds = (
10429
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10430
+ )
10431
+
10432
+ if columns_subset is not None and isinstance(columns_subset, str):
10433
+ columns_subset = [columns_subset]
10434
+
10435
+ # TODO: incorporate Column object
10436
+
10437
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
10438
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
10439
+
10440
+ val_info = _ValidationInfo(
10441
+ assertion_type=assertion_type,
10442
+ column=columns_subset,
10443
+ pre=pre,
10444
+ segments=segments,
10445
+ thresholds=thresholds,
10446
+ actions=actions,
10447
+ brief=brief,
10448
+ active=active,
10449
+ )
10450
+
10451
+ self._add_validation(validation_info=val_info)
9996
10452
 
9997
10453
  return self
9998
10454
 
@@ -10001,7 +10457,7 @@ class Validate:
10001
10457
  columns_subset: str | list[str] | None = None,
10002
10458
  pre: Callable | None = None,
10003
10459
  segments: SegmentSpec | None = None,
10004
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
10460
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
10005
10461
  actions: Actions | None = None,
10006
10462
  brief: str | bool | None = None,
10007
10463
  active: bool = True,
@@ -10246,7 +10702,7 @@ class Validate:
10246
10702
  max_concurrent: int = 3,
10247
10703
  pre: Callable | None = None,
10248
10704
  segments: SegmentSpec | None = None,
10249
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
10705
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
10250
10706
  actions: Actions | None = None,
10251
10707
  brief: str | bool | None = None,
10252
10708
  active: bool = True,
@@ -10641,7 +11097,7 @@ class Validate:
10641
11097
  case_sensitive_dtypes: bool = True,
10642
11098
  full_match_dtypes: bool = True,
10643
11099
  pre: Callable | None = None,
10644
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
11100
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
10645
11101
  actions: Actions | None = None,
10646
11102
  brief: str | bool | None = None,
10647
11103
  active: bool = True,
@@ -10857,11 +11313,11 @@ class Validate:
10857
11313
 
10858
11314
  def row_count_match(
10859
11315
  self,
10860
- count: int | FrameT | Any,
11316
+ count: int | Any,
10861
11317
  tol: Tolerance = 0,
10862
11318
  inverse: bool = False,
10863
11319
  pre: Callable | None = None,
10864
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
11320
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
10865
11321
  actions: Actions | None = None,
10866
11322
  brief: str | bool | None = None,
10867
11323
  active: bool = True,
@@ -11076,10 +11532,10 @@ class Validate:
11076
11532
 
11077
11533
  def col_count_match(
11078
11534
  self,
11079
- count: int | FrameT | Any,
11535
+ count: int | Any,
11080
11536
  inverse: bool = False,
11081
11537
  pre: Callable | None = None,
11082
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
11538
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11083
11539
  actions: Actions | None = None,
11084
11540
  brief: str | bool | None = None,
11085
11541
  active: bool = True,
@@ -11252,9 +11708,9 @@ class Validate:
11252
11708
 
11253
11709
  def tbl_match(
11254
11710
  self,
11255
- tbl_compare: FrameT | Any,
11711
+ tbl_compare: Any,
11256
11712
  pre: Callable | None = None,
11257
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
11713
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11258
11714
  actions: Actions | None = None,
11259
11715
  brief: str | bool | None = None,
11260
11716
  active: bool = True,
@@ -11523,7 +11979,7 @@ class Validate:
11523
11979
  self,
11524
11980
  *exprs: Callable,
11525
11981
  pre: Callable | None = None,
11526
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
11982
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11527
11983
  actions: Actions | None = None,
11528
11984
  brief: str | bool | None = None,
11529
11985
  active: bool = True,
@@ -11771,7 +12227,7 @@ class Validate:
11771
12227
  self,
11772
12228
  expr: Callable,
11773
12229
  pre: Callable | None = None,
11774
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
12230
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
11775
12231
  actions: Actions | None = None,
11776
12232
  brief: str | bool | None = None,
11777
12233
  active: bool = True,
@@ -12265,7 +12721,7 @@ class Validate:
12265
12721
  segment = validation.segments
12266
12722
 
12267
12723
  # Get compatible data types for this assertion type
12268
- assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
12724
+ assertion_method = ASSERTION_TYPE_METHOD_MAP.get(assertion_type, assertion_type)
12269
12725
  compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
12270
12726
 
12271
12727
  # Process the `brief` text for the validation step by including template variables to
@@ -12282,12 +12738,19 @@ class Validate:
12282
12738
  # Generate the autobrief description for the validation step; it's important to perform
12283
12739
  # that here since text components like the column and the value(s) have been resolved
12284
12740
  # at this point
12741
+ # Get row count for col_pct_null to properly calculate absolute tolerance percentages
12742
+ n_rows = None
12743
+ if assertion_type == "col_pct_null":
12744
+ n_rows = get_row_count(data_tbl)
12745
+
12285
12746
  autobrief = _create_autobrief_or_failure_text(
12286
12747
  assertion_type=assertion_type,
12287
12748
  lang=self.lang,
12288
12749
  column=column,
12289
12750
  values=value,
12290
12751
  for_failure=False,
12752
+ locale=self.locale,
12753
+ n_rows=n_rows,
12291
12754
  )
12292
12755
 
12293
12756
  validation.autobrief = autobrief
@@ -12313,7 +12776,17 @@ class Validate:
12313
12776
 
12314
12777
  # Make a deep copy of the table for this step to ensure proper isolation
12315
12778
  # This prevents modifications from one validation step affecting others
12316
- data_tbl_step = _copy_dataframe(data_tbl)
12779
+ try:
12780
+ # TODO: This copying should be scrutinized further
12781
+ data_tbl_step: IntoDataFrame = _copy_dataframe(data_tbl)
12782
+ except Exception as e: # pragma: no cover
12783
+ data_tbl_step: IntoDataFrame = data_tbl # pragma: no cover
12784
+
12785
+ # Capture original table dimensions and columns before preprocessing
12786
+ # (only if preprocessing is present - we'll set these inside the preprocessing block)
12787
+ original_rows = None
12788
+ original_cols = None
12789
+ original_column_names = None
12317
12790
 
12318
12791
  # ------------------------------------------------
12319
12792
  # Preprocessing stage
@@ -12322,6 +12795,16 @@ class Validate:
12322
12795
  # Determine whether any preprocessing functions are to be applied to the table
12323
12796
  if validation.pre is not None:
12324
12797
  try:
12798
+ # Capture original table dimensions before preprocessing
12799
+ # Use get_row_count() instead of len() for compatibility with PySpark, etc.
12800
+ original_rows = get_row_count(data_tbl_step)
12801
+ original_cols = get_column_count(data_tbl_step)
12802
+ original_column_names = set(
12803
+ data_tbl_step.columns
12804
+ if hasattr(data_tbl_step, "columns")
12805
+ else list(data_tbl_step.columns)
12806
+ )
12807
+
12325
12808
  # Read the text of the preprocessing function
12326
12809
  pre_text = _pre_processing_funcs_to_str(validation.pre)
12327
12810
 
@@ -12354,6 +12837,62 @@ class Validate:
12354
12837
  elif isinstance(validation.pre, Callable):
12355
12838
  data_tbl_step = validation.pre(data_tbl_step)
12356
12839
 
12840
+ # After successful preprocessing, check dimensions and create notes
12841
+ # Use get_row_count() and get_column_count() for compatibility
12842
+ processed_rows = get_row_count(data_tbl_step)
12843
+ processed_cols = get_column_count(data_tbl_step)
12844
+
12845
+ # Always add a note when preprocessing is applied
12846
+ if original_rows != processed_rows or original_cols != processed_cols:
12847
+ # Dimensions changed - show the change
12848
+ note_html = _create_preprocessing_note_html(
12849
+ original_rows=original_rows,
12850
+ original_cols=original_cols,
12851
+ processed_rows=processed_rows,
12852
+ processed_cols=processed_cols,
12853
+ locale=self.locale,
12854
+ )
12855
+ note_text = _create_preprocessing_note_text(
12856
+ original_rows=original_rows,
12857
+ original_cols=original_cols,
12858
+ processed_rows=processed_rows,
12859
+ processed_cols=processed_cols,
12860
+ )
12861
+ else:
12862
+ # No dimension change - just indicate preprocessing was applied
12863
+ note_html = _create_preprocessing_no_change_note_html(locale=self.locale)
12864
+ note_text = _create_preprocessing_no_change_note_text()
12865
+
12866
+ validation._add_note(
12867
+ key="pre_applied",
12868
+ markdown=note_html,
12869
+ text=note_text,
12870
+ )
12871
+
12872
+ # Check if target column is synthetic (exists in processed but not original)
12873
+ # Only check for single column names (not lists used in rows_distinct, etc.)
12874
+ if column is not None and isinstance(column, str):
12875
+ processed_column_names = set(
12876
+ data_tbl_step.columns
12877
+ if hasattr(data_tbl_step, "columns")
12878
+ else list(data_tbl_step.columns)
12879
+ )
12880
+
12881
+ # Check if the target column is in the processed table but not in original
12882
+ if column in processed_column_names and column not in original_column_names:
12883
+ note_html = _create_synthetic_target_column_note_html(
12884
+ column_name=column,
12885
+ locale=self.locale,
12886
+ )
12887
+ note_text = _create_synthetic_target_column_note_text(
12888
+ column_name=column,
12889
+ )
12890
+ validation._add_note(
12891
+ key="syn_target_col",
12892
+ markdown=note_html,
12893
+ text=note_text,
12894
+ )
12895
+
12357
12896
  except Exception:
12358
12897
  # If preprocessing fails, mark the validation as having an eval_error
12359
12898
  validation.eval_error = True
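The new bookkeeping above records row and column counts before and after `pre=` runs and attaches a footer note either way. A minimal sketch of a step whose preprocessing changes the row count, assuming Polars is available:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"x": [1, 2, 3, 4], "grp": ["a", "a", "b", "b"]})

validation = (
    pb.Validate(data=tbl)
    # pre= filters the table from 4 rows to 2, so this step's footer note
    # should record the dimension change
    .col_vals_gt(columns="x", value=0, pre=lambda df: df.filter(pl.col("grp") == "a"))
    .interrogate()
)
```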
@@ -12543,6 +13082,21 @@ class Validate:
12543
13082
  tbl=tbl, column=column, values=value, na_pass=na_pass
12544
13083
  )
12545
13084
 
13085
+ elif assertion_type == "col_pct_null":
13086
+ result_bool = col_pct_null(
13087
+ data_tbl=data_tbl_step,
13088
+ column=column,
13089
+ p=value["p"],
13090
+ bound_finder=value["bound_finder"],
13091
+ )
13092
+
13093
+ validation.all_passed = result_bool
13094
+ validation.n = 1
13095
+ validation.n_passed = int(result_bool)
13096
+ validation.n_failed = 1 - int(result_bool)
13097
+
13098
+ results_tbl = None
13099
+
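`col_pct_null()` is new in 0.18.0 and, as the branch above shows, registers a single test unit per step. Going by the step-creation code earlier in this diff, its parameters appear to be `columns=`, `p=`, and `tol=`; a usage sketch under that assumption:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [1, None, 3, None]})

# Passes when the null proportion of `a` equals p (50% here), within tol
validation = (
    pb.Validate(data=tbl)
    .col_pct_null(columns="a", p=0.5)
    .interrogate()
)
```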
12546
13100
  elif assertion_type == "col_vals_expr":
12547
13101
  results_tbl = col_vals_expr(
12548
13102
  data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
@@ -12602,10 +13156,21 @@ class Validate:
12602
13156
  # Add the schema validation info to the validation object
12603
13157
  validation.val_info = schema_validation_info
12604
13158
 
13159
+ # Add a note with the schema expectation and results
13160
+ schema_note_html = _create_col_schema_match_note_html(
13161
+ schema_info=schema_validation_info, locale=self.locale
13162
+ )
13163
+ schema_note_text = _create_col_schema_match_note_text(
13164
+ schema_info=schema_validation_info
13165
+ )
13166
+ validation._add_note(
13167
+ key="schema_check", markdown=schema_note_html, text=schema_note_text
13168
+ )
13169
+
12605
13170
  validation.all_passed = result_bool
12606
13171
  validation.n = 1
12607
13172
  validation.n_passed = int(result_bool)
12608
- validation.n_failed = 1 - result_bool
13173
+ validation.n_failed = 1 - int(result_bool)
12609
13174
 
12610
13175
  results_tbl = None
12611
13176
 
@@ -12620,7 +13185,7 @@ class Validate:
12620
13185
  validation.all_passed = result_bool
12621
13186
  validation.n = 1
12622
13187
  validation.n_passed = int(result_bool)
12623
- validation.n_failed = 1 - result_bool
13188
+ validation.n_failed = 1 - int(result_bool)
12624
13189
 
12625
13190
  results_tbl = None
12626
13191
 
@@ -12632,7 +13197,7 @@ class Validate:
12632
13197
  validation.all_passed = result_bool
12633
13198
  validation.n = 1
12634
13199
  validation.n_passed = int(result_bool)
12635
- validation.n_failed = 1 - result_bool
13200
+ validation.n_failed = 1 - int(result_bool)
12636
13201
 
12637
13202
  results_tbl = None
12638
13203
 
@@ -12651,7 +13216,7 @@ class Validate:
12651
13216
  validation.all_passed = result_bool
12652
13217
  validation.n = 1
12653
13218
  validation.n_passed = int(result_bool)
12654
- validation.n_failed = 1 - result_bool
13219
+ validation.n_failed = 1 - int(result_bool)
12655
13220
 
12656
13221
  results_tbl = None
12657
13222
 
@@ -12663,14 +13228,53 @@ class Validate:
12663
13228
  tbl_type=tbl_type,
12664
13229
  )
12665
13230
 
13231
+ elif is_valid_agg(assertion_type):
13232
+ agg, comp = resolve_agg_registries(assertion_type)
13233
+
13234
+ # Produce a 1-column Narwhals DataFrame
13235
+ # TODO: Should be able to take lazy too
13236
+ vec: nw.DataFrame = nw.from_native(data_tbl_step).select(column)
13237
+ real = agg(vec)
13238
+
13239
+ raw_value = value["value"]
13240
+ tol = value["tol"]
13241
+
13242
+ # Handle ReferenceColumn: compute target from reference data
13243
+ if isinstance(raw_value, ReferenceColumn):
13244
+ if self.reference is None:
13245
+ raise ValueError(
13246
+ f"Cannot use ref('{raw_value.column_name}') without "
13247
+ "setting reference data on the Validate object. "
13248
+ "Use Validate(data=..., reference=...) to set reference data."
13249
+ )
13250
+ ref_vec: nw.DataFrame = nw.from_native(self.reference).select(
13251
+ raw_value.column_name
13252
+ )
13253
+ target: float | int = agg(ref_vec)
13254
+ else:
13255
+ target = raw_value
13256
+
13257
+ lower_diff, upper_diff = _derive_bounds(target, tol)
13258
+
13259
+ lower_bound = target - lower_diff
13260
+ upper_bound = target + upper_diff
13261
+ result_bool: bool = comp(real, lower_bound, upper_bound)
13262
+
13263
+ validation.all_passed = result_bool
13264
+ validation.n = 1
13265
+ validation.n_passed = int(result_bool)
13266
+ validation.n_failed = 1 - int(result_bool)
13267
+
13268
+ results_tbl = None
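The `ReferenceColumn` branch above computes the same aggregate on a reference table and compares the two within tolerance. Assuming `ref()` is exported as `pb.ref()` (suggested by the error message above) and that `col_sum_gt` is one of the registered aggregation methods (named in a later comment in this diff), a sketch:

```python
import polars as pl
import pointblank as pb

current = pl.DataFrame({"revenue": [120.0, 80.0, 100.0]})
baseline = pl.DataFrame({"revenue": [110.0, 90.0, 95.0]})

# The sum of `revenue` in the target table must exceed the sum computed
# from the reference table's `revenue` column (method and parameter names
# are inferred from this diff, not confirmed)
validation = (
    pb.Validate(data=current, reference=baseline)
    .col_sum_gt(columns="revenue", value=pb.ref("revenue"))
    .interrogate()
)
```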
12666
13269
  else:
12667
13270
  raise ValueError(
12668
13271
  f"Unknown assertion type: {assertion_type}"
12669
13272
  ) # pragma: no cover
12670
13273
 
12671
13274
  except Exception as e:
12672
- # Only catch specific data quality comparison errors, not programming errors
13275
+ # Catch data quality errors and column not found errors
12673
13276
  error_msg = str(e).lower()
13277
+
12674
13278
  is_comparison_error = (
12675
13279
  "boolean value of na is ambiguous" in error_msg
12676
13280
  or "cannot compare" in error_msg
@@ -12681,20 +13285,101 @@ class Validate:
12681
13285
  or ("dtype" in error_msg and "compare" in error_msg)
12682
13286
  )
12683
13287
 
12684
- if is_comparison_error: # pragma: no cover
12685
- # If data quality comparison fails, mark the validation as having an eval_error
13288
+ is_column_not_found = "column" in error_msg and "not found" in error_msg
13289
+
13290
+ is_comparison_column_not_found = (
13291
+ "unable to find column" in error_msg and "valid columns" in error_msg
13292
+ )
13293
+
13294
+ if (
13295
+ is_comparison_error or is_column_not_found or is_comparison_column_not_found
13296
+ ): # pragma: no cover
13297
+ # If data quality comparison fails or column not found, mark as eval_error
12686
13298
  validation.eval_error = True # pragma: no cover
13299
+
13300
+ # Add a note for column not found errors (target column)
13301
+ if is_column_not_found:
13302
+ note_html = _create_column_not_found_note_html(
13303
+ column_name=column,
13304
+ available_columns=list(data_tbl_step.columns)
13305
+ if hasattr(data_tbl_step, "columns")
13306
+ else [],
13307
+ locale=self.locale,
13308
+ )
13309
+ note_text = _create_column_not_found_note_text(
13310
+ column_name=column,
13311
+ available_columns=list(data_tbl_step.columns)
13312
+ if hasattr(data_tbl_step, "columns")
13313
+ else [],
13314
+ )
13315
+ validation._add_note(
13316
+ key="column_not_found",
13317
+ markdown=note_html,
13318
+ text=note_text,
13319
+ )
13320
+
13321
+ # Add a note for comparison column not found errors
13322
+ elif is_comparison_column_not_found:
13323
+ # Extract column name from error message
13324
+ # Error format: 'unable to find column "col_name"; valid columns: ...'
13325
+ match = re.search(r'unable to find column "([^"]+)"', str(e))
13326
+
13327
+ if match:
13328
+ missing_col_name = match.group(1)
13329
+
13330
+ # Determine position for between/outside validations
13331
+ position = None
13332
+ if assertion_type in ["col_vals_between", "col_vals_outside"]:
13333
+ # Check if missing column is in left or right position
13334
+ from pointblank.column import Column
13335
+
13336
+ if (
13337
+ isinstance(value[0], Column)
13338
+ and value[0].exprs == missing_col_name
13339
+ ):
13340
+ position = "left"
13341
+ elif (
13342
+ isinstance(value[1], Column)
13343
+ and value[1].exprs == missing_col_name
13344
+ ):
13345
+ position = "right"
13346
+
13347
+ note_html = _create_comparison_column_not_found_note_html(
13348
+ column_name=missing_col_name,
13349
+ position=position,
13350
+ available_columns=list(data_tbl_step.columns)
13351
+ if hasattr(data_tbl_step, "columns")
13352
+ else [],
13353
+ locale=self.locale,
13354
+ )
13355
+ note_text = _create_comparison_column_not_found_note_text(
13356
+ column_name=missing_col_name,
13357
+ position=position,
13358
+ available_columns=list(data_tbl_step.columns)
13359
+ if hasattr(data_tbl_step, "columns")
13360
+ else [],
13361
+ )
13362
+ validation._add_note(
13363
+ key="comparison_column_not_found",
13364
+ markdown=note_html,
13365
+ text=note_text,
13366
+ )
13367
+
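The column-name extraction above leans on the quoted-name convention in backend error messages; the pattern in isolation:

```python
import re

msg = 'unable to find column "col_x"; valid columns: "a", "b"'
match = re.search(r'unable to find column "([^"]+)"', msg)
if match:
    print(match.group(1))  # col_x
```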
12687
13368
  end_time = datetime.datetime.now(datetime.timezone.utc) # pragma: no cover
13369
+
12688
13370
  validation.proc_duration_s = (
12689
13371
  end_time - start_time
12690
13372
  ).total_seconds() # pragma: no cover
13373
+
12691
13374
  validation.time_processed = end_time.isoformat(
12692
13375
  timespec="milliseconds"
12693
13376
  ) # pragma: no cover
13377
+
12694
13378
  validation.active = False # pragma: no cover
13379
+
12695
13380
  continue # pragma: no cover
12696
13381
  else:
12697
- # For other errors (like missing columns), let them propagate
13382
+ # For other unexpected errors, let them propagate
12698
13383
  raise
12699
13384
 
12700
13385
  else:
@@ -12792,6 +13477,7 @@ class Validate:
12792
13477
  markdown=threshold_note_html,
12793
13478
  text=threshold_note_text,
12794
13479
  )
13480
+
12795
13481
  elif self.thresholds != Thresholds():
12796
13482
  # Thresholds explicitly reset to empty when global thresholds exist
12797
13483
  reset_note_html = _create_threshold_reset_note_html(locale=self.locale)
@@ -12814,6 +13500,8 @@ class Validate:
12814
13500
  column=column,
12815
13501
  values=value,
12816
13502
  for_failure=True,
13503
+ locale=self.locale,
13504
+ n_rows=n_rows,
12817
13505
  )
12818
13506
 
12819
13507
  # Set the failure text in the validation step
@@ -13320,12 +14008,14 @@ class Validate:
13320
14008
  )
13321
14009
 
13322
14010
  # Get the threshold status using the appropriate method
14011
+ # Note: scalar=False (default) always returns a dict
14012
+ status: dict[int, bool]
13323
14013
  if level == "warning":
13324
- status = self.warning(i=i)
14014
+ status = self.warning(i=i) # type: ignore[assignment]
13325
14015
  elif level == "error":
13326
- status = self.error(i=i)
13327
- elif level == "critical":
13328
- status = self.critical(i=i)
14016
+ status = self.error(i=i) # type: ignore[assignment]
14017
+ else: # level == "critical"
14018
+ status = self.critical(i=i) # type: ignore[assignment]
13329
14019
 
13330
14020
  # Find any steps that exceeded the threshold
13331
14021
  failures = []
@@ -13479,12 +14169,14 @@ class Validate:
13479
14169
  )
13480
14170
 
13481
14171
  # Get the threshold status using the appropriate method
14172
+ # Note: scalar=False (default) always returns a dict
14173
+ status: dict[int, bool]
13482
14174
  if level == "warning":
13483
- status = self.warning(i=i)
14175
+ status = self.warning(i=i) # type: ignore[assignment]
13484
14176
  elif level == "error":
13485
- status = self.error(i=i)
13486
- elif level == "critical":
13487
- status = self.critical(i=i)
14177
+ status = self.error(i=i) # type: ignore[assignment]
14178
+ else: # level == "critical"
14179
+ status = self.critical(i=i) # type: ignore[assignment]
13488
14180
 
13489
14181
  # Return True if any steps exceeded the threshold
13490
14182
  return any(status.values())
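The annotations above rely on `warning()`/`error()`/`critical()` returning a `{step: bool}` dict when `scalar=` is left at its default. A usage sketch (threshold values illustrative):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"x": [1, -2, 3, -4]})

validation = (
    pb.Validate(data=tbl, thresholds=(0.25, 0.5))
    .col_vals_gt(columns="x", value=0)
    .interrogate()
)

# With scalar= left at its default, these return {step: bool} dicts
status = validation.warning(i=1)  # e.g. {1: True} if the level was reached
exceeded = any(validation.error(i=1).values())
```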
@@ -14257,7 +14949,7 @@ class Validate:
14257
14949
 
14258
14950
  def get_data_extracts(
14259
14951
  self, i: int | list[int] | None = None, frame: bool = False
14260
- ) -> dict[int, FrameT | None] | FrameT | None:
14952
+ ) -> dict[int, Any] | Any:
14261
14953
  """
14262
14954
  Get the rows that failed for each validation step.
14263
14955
 
@@ -14280,7 +14972,7 @@ class Validate:
14280
14972
 
14281
14973
  Returns
14282
14974
  -------
14283
- dict[int, FrameT | None] | FrameT | None
14975
+ dict[int, Any] | Any
14284
14976
  A dictionary of tables containing the rows that failed in every compatible validation
14285
14977
  step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
14286
14978
 
@@ -14570,7 +15262,7 @@ class Validate:
14570
15262
 
14571
15263
  return json.dumps(report, indent=4, default=str)
14572
15264
 
14573
- def get_sundered_data(self, type="pass") -> FrameT:
15265
+ def get_sundered_data(self, type="pass") -> Any:
14574
15266
  """
14575
15267
  Get the data that passed or failed the validation steps.
14576
15268
 
@@ -14606,7 +15298,7 @@ class Validate:
14606
15298
 
14607
15299
  Returns
14608
15300
  -------
14609
- FrameT
15301
+ Any
14610
15302
  A table containing the data that passed or failed the validation steps.
14611
15303
 
14612
15304
  Examples
@@ -14698,6 +15390,7 @@ class Validate:
14698
15390
  # Get all validation step result tables and join together the `pb_is_good_` columns
14699
15391
  # ensuring that the columns are named uniquely (e.g., `pb_is_good_1`, `pb_is_good_2`, ...)
14700
15392
  # and that the index is reset
15393
+ labeled_tbl_nw: nw.DataFrame | nw.LazyFrame | None = None
14701
15394
  for i, validation in enumerate(validation_info):
14702
15395
  results_tbl = nw.from_native(validation.tbl_checked)
14703
15396
 
@@ -14718,7 +15411,7 @@ class Validate:
14718
15411
  )
14719
15412
 
14720
15413
  # Add the results table to the list of tables
14721
- if i == 0:
15414
+ if labeled_tbl_nw is None:
14722
15415
  labeled_tbl_nw = results_tbl
14723
15416
  else:
14724
15417
  labeled_tbl_nw = labeled_tbl_nw.join(results_tbl, on=index_name, how="left")
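Seeding on `labeled_tbl_nw is None` rather than `i == 0` keeps the accumulator correct even if an early step contributes no results table. The join pattern in isolation, using Polars:

```python
import polars as pl

parts = [
    pl.DataFrame({"idx": [0, 1], "pb_is_good_1": [True, False]}),
    pl.DataFrame({"idx": [0, 1], "pb_is_good_2": [True, True]}),
]

acc = None
for part in parts:
    # Seed on the first usable table, then left-join the rest on the index
    acc = part if acc is None else acc.join(part, on="idx", how="left")
```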
@@ -14892,7 +15585,12 @@ class Validate:
14892
15585
  return None
14893
15586
 
14894
15587
  def get_tabular_report(
14895
- self, title: str | None = ":default:", incl_header: bool = None, incl_footer: bool = None
15588
+ self,
15589
+ title: str | None = ":default:",
15590
+ incl_header: bool | None = None,
15591
+ incl_footer: bool | None = None,
15592
+ incl_footer_timings: bool | None = None,
15593
+ incl_footer_notes: bool | None = None,
14896
15594
  ) -> GT:
14897
15595
  """
14898
15596
  Validation report as a GT table.
@@ -14915,6 +15613,20 @@ class Validate:
14915
15613
  name of the table as the title for the report. If no title is wanted, then `":none:"`
14916
15614
  can be used. Aside from keyword options, text can be provided for the title. This will
14917
15615
  be interpreted as Markdown text and transformed internally to HTML.
15616
+ incl_header
15617
+ Controls whether the header section should be displayed. If `None`, uses the global
15618
+ configuration setting. The header contains the table name, label, and threshold
15619
+ information.
15620
+ incl_footer
15621
+ Controls whether the footer section should be displayed. If `None`, uses the global
15622
+ configuration setting. The footer can contain validation timing information and notes.
15623
+ incl_footer_timings
15624
+ Controls whether validation timing information (start time, duration, end time) should
15625
+ be displayed in the footer. If `None`, uses the global configuration setting. Only
15626
+ applies when `incl_footer=True`.
15627
+ incl_footer_notes
15628
+ Controls whether notes from validation steps should be displayed in the footer. If
15629
+ `None`, uses the global configuration setting. Only applies when `incl_footer=True`.
14918
15630
 
14919
15631
  Returns
14920
15632
  -------
@@ -14974,6 +15686,10 @@ class Validate:
14974
15686
  incl_header = global_config.report_incl_header
14975
15687
  if incl_footer is None:
14976
15688
  incl_footer = global_config.report_incl_footer
15689
+ if incl_footer_timings is None:
15690
+ incl_footer_timings = global_config.report_incl_footer_timings
15691
+ if incl_footer_notes is None:
15692
+ incl_footer_notes = global_config.report_incl_footer_notes
14977
15693
 
14978
15694
  # Do we have a DataFrame library to work with?
14979
15695
  _check_any_df_lib(method_used="get_tabular_report")
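Both new toggles fall back to the global configuration when left as `None`. Assuming `pb.config()` accepts the new `report_incl_footer_*` fields added to `PointblankConfig` in this release, the two levels look like:

```python
import polars as pl
import pointblank as pb

# Global default (field name assumed from the PointblankConfig additions)
pb.config(report_incl_footer_timings=False)

validation = (
    pb.Validate(data=pl.DataFrame({"x": [1]}))
    .col_vals_gt(columns="x", value=0)
    .interrogate()
)

# Per-report override: keep the footer but omit the step notes
report = validation.get_tabular_report(incl_footer_notes=False)
```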
@@ -15212,30 +15928,59 @@ class Validate:
15212
15928
  columns_upd = []
15213
15929
 
15214
15930
  columns = validation_info_dict["column"]
15931
+ notes = validation_info_dict["notes"]
15215
15932
 
15216
15933
  assertion_type = validation_info_dict["assertion_type"]
15217
15934
 
15218
15935
  # Iterate over the values in the `column` entry
15219
15936
  for i, column in enumerate(columns):
15937
+ # Check if this validation has a synthetic target column note
15938
+ has_synthetic_column = (
15939
+ notes[i] is not None and isinstance(notes[i], dict) and "syn_target_col" in notes[i]
15940
+ )
15941
+
15942
+ column_text = None
15943
+
15220
15944
  if assertion_type[i] in [
15221
15945
  "col_schema_match",
15222
15946
  "row_count_match",
15223
15947
  "col_count_match",
15224
15948
  "col_vals_expr",
15225
15949
  ]:
15226
- columns_upd.append("&mdash;")
15950
+ column_text = "&mdash;"
15227
15951
  elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
15228
15952
  if not column:
15229
15953
  # If there is no column subset, then all columns are used
15230
- columns_upd.append("ALL COLUMNS")
15954
+ column_text = "ALL COLUMNS"
15231
15955
  else:
15232
15956
  # With a column subset list, format with commas between the column names
15233
- columns_upd.append(", ".join(column))
15234
-
15957
+ column_text = ", ".join(column)
15235
15958
  elif assertion_type[i] in ["conjointly", "specially"]:
15236
- columns_upd.append("")
15959
+ column_text = ""
15237
15960
  else:
15238
- columns_upd.append(str(column))
15961
+ # Handle both string columns and list columns
15962
+ # For single-element lists like ['a'], display as 'a'
15963
+ # For multi-element lists, display as comma-separated values
15964
+ if isinstance(column, list):
15965
+ column_text = ", ".join(str(c) for c in column)
15966
+ else:
15967
+ column_text = str(column)
15968
+
15969
+ # Apply underline styling for synthetic columns; only apply styling if column_text is
15970
+ # not empty and not a special marker
15971
+ if (
15972
+ has_synthetic_column
15973
+ and column_text
15974
+ and column_text not in ["&mdash;", "ALL COLUMNS", ""]
15975
+ ):
15976
+ column_text = (
15977
+ f'<span style="text-decoration: underline; '
15978
+ f"text-decoration-color: #9A7CB4; text-decoration-thickness: 1px; "
15979
+ f'text-underline-offset: 3px;">'
15980
+ f"{column_text}</span>"
15981
+ )
15982
+
15983
+ columns_upd.append(column_text)
15239
15984
 
15240
15985
  # Add the `columns_upd` entry to the dictionary
15241
15986
  validation_info_dict["columns_upd"] = columns_upd
@@ -15291,6 +16036,15 @@ class Validate:
15291
16036
  ]:
15292
16037
  values_upd.append("&mdash;")
15293
16038
 
16039
+ elif assertion_type[i] in ["col_pct_null"]:
16040
+ # Extract p and tol from the values dict for nice formatting
16041
+ p_value = value["p"]
16042
+
16043
+ # Extract tol from the bound_finder partial function
16044
+ bound_finder = value.get("bound_finder")
16045
+ tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
16046
+ values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
16047
+
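Because `tol` was bound with `functools.partial` at step creation, the renderer can recover it from the partial's `keywords` mapping, as above; in isolation:

```python
from functools import partial

def derive_bounds(n, tol=0):  # hypothetical stand-in for _derive_bounds
    return tol, tol

bound_finder = partial(derive_bounds, tol=(2, 0.1))
# partial objects expose their bound keyword arguments, as used above
print(bound_finder.keywords.get("tol", 0))  # (2, 0.1)
```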
15294
16048
  elif assertion_type[i] in ["col_schema_match"]:
15295
16049
  values_upd.append("SCHEMA")
15296
16050
 
@@ -15332,6 +16086,32 @@ class Validate:
15332
16086
  else: # pragma: no cover
15333
16087
  values_upd.append(str(value)) # pragma: no cover
15334
16088
 
16089
+ # Handle aggregation methods (col_sum_gt, col_avg_eq, etc.)
16090
+ elif is_valid_agg(assertion_type[i]):
16091
+ # Extract the value and tolerance from the values dict
16092
+ agg_value = value.get("value")
16093
+ tol_value = value.get("tol", 0)
16094
+
16095
+ # Format the value (could be a number, Column, or ReferenceColumn)
16096
+ if hasattr(agg_value, "__repr__"):
16097
+ # For Column or ReferenceColumn objects, use their repr
16098
+ value_str = repr(agg_value)
16099
+ else:
16100
+ value_str = str(agg_value)
16101
+
16102
+ # Format tolerance - only show on second line if non-zero
16103
+ if tol_value != 0:
16104
+ # Format tolerance based on its type
16105
+ if isinstance(tol_value, tuple):
16106
+ # Asymmetric bounds: (lower, upper)
16107
+ tol_str = f"tol=({tol_value[0]}, {tol_value[1]})"
16108
+ else:
16109
+ # Symmetric tolerance
16110
+ tol_str = f"tol={tol_value}"
16111
+ values_upd.append(f"{value_str}<br/>{tol_str}")
16112
+ else:
16113
+ values_upd.append(value_str)
16114
+
15335
16115
  # If the assertion type is not recognized, add the value as a string
15336
16116
  else: # pragma: no cover
15337
16117
  values_upd.append(str(value)) # pragma: no cover
@@ -15766,13 +16546,15 @@ class Validate:
15766
16546
  gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
15767
16547
 
15768
16548
  if incl_footer:
15769
- # Add table time as HTML source note
15770
- gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
16549
+ # Add table time as HTML source note if enabled
16550
+ if incl_footer_timings:
16551
+ gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
15771
16552
 
15772
- # Create notes markdown from validation steps and add as separate source note
15773
- notes_markdown = _create_notes_html(self.validation_info)
15774
- if notes_markdown:
15775
- gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
16553
+ # Create notes markdown from validation steps and add as separate source note if enabled
16554
+ if incl_footer_notes:
16555
+ notes_markdown = _create_notes_html(self.validation_info)
16556
+ if notes_markdown:
16557
+ gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
15776
16558
 
15777
16559
  # If the interrogation has not been performed, then style the table columns dealing with
15778
16560
  # interrogation data as grayed out
@@ -16179,7 +16961,7 @@ class Validate:
16179
16961
  table = validation.pre(self.data)
16180
16962
 
16181
16963
  # Get the columns from the table as a list
16182
- columns = list(table.columns)
16964
+ columns = list(table.columns) # type: ignore[union-attr]
16183
16965
 
16184
16966
  # Evaluate the column expression
16185
16967
  if isinstance(column_expr, ColumnSelectorNarwhals):
@@ -16189,6 +16971,12 @@ class Validate:
16189
16971
 
16190
16972
  except Exception: # pragma: no cover
16191
16973
  validation.eval_error = True
16974
+ columns_resolved = []
16975
+ # Store columns list for note generation
16976
+ try:
16977
+ columns = list(table.columns) if "table" in locals() else []
16978
+ except Exception:
16979
+ columns = []
16192
16980
 
16193
16981
  # If no columns were resolved, then create a patched validation step with the
16194
16982
  # `eval_error` and `column` attributes set
@@ -16196,6 +16984,22 @@ class Validate:
16196
16984
  validation.eval_error = True
16197
16985
  validation.column = str(column_expr)
16198
16986
 
16987
+ # Add a helpful note explaining that no columns were resolved
16988
+ note_html = _create_no_columns_resolved_note_html(
16989
+ column_expr=str(column_expr),
16990
+ available_columns=columns,
16991
+ locale=self.locale,
16992
+ )
16993
+ note_text = _create_no_columns_resolved_note_text(
16994
+ column_expr=str(column_expr),
16995
+ available_columns=columns,
16996
+ )
16997
+ validation._add_note(
16998
+ key="no_columns_resolved",
16999
+ markdown=note_html,
17000
+ text=note_text,
17001
+ )
17002
+
16199
17003
  expanded_validation_info.append(validation)
16200
17004
  continue
16201
17005
 
@@ -16535,7 +17339,7 @@ def _convert_string_to_datetime(value: str) -> datetime.datetime:
16535
17339
  return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
16536
17340
 
16537
17341
 
16538
- def _string_date_dttm_conversion(value: any) -> any:
17342
+ def _string_date_dttm_conversion(value: Any) -> Any:
16539
17343
  """
16540
17344
  Convert a string to a date or datetime object if it is in the correct format.
16541
17345
  If the value is not a string, it is returned as is.
@@ -16570,8 +17374,8 @@ def _string_date_dttm_conversion(value: any) -> any:
16570
17374
 
16571
17375
 
16572
17376
  def _conditional_string_date_dttm_conversion(
16573
- value: any, allow_regular_strings: bool = False
16574
- ) -> any:
17377
+ value: Any, allow_regular_strings: bool = False
17378
+ ) -> Any:
16575
17379
  """
16576
17380
  Conditionally convert a string to a date or datetime object if it is in the correct format. If
16577
17381
  `allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
@@ -16615,9 +17419,9 @@ def _process_brief(
16615
17419
  brief: str | None,
16616
17420
  step: int,
16617
17421
  col: str | list[str] | None,
16618
- values: any | None,
16619
- thresholds: any | None,
16620
- segment: any | None,
17422
+ values: Any | None,
17423
+ thresholds: Any | None,
17424
+ segment: Any | None,
16621
17425
  ) -> str:
16622
17426
  # If there is no brief, return `None`
16623
17427
  if brief is None:
@@ -16704,7 +17508,7 @@ def _process_action_str(
16704
17508
  action_str: str,
16705
17509
  step: int,
16706
17510
  col: str | None,
16707
- value: any,
17511
+ value: Any,
16708
17512
  type: str,
16709
17513
  level: str,
16710
17514
  time: str,
@@ -16754,7 +17558,13 @@ def _process_action_str(
16754
17558
 
16755
17559
 
16756
17560
  def _create_autobrief_or_failure_text(
16757
- assertion_type: str, lang: str, column: str | None, values: str | None, for_failure: bool
17561
+ assertion_type: str,
17562
+ lang: str,
17563
+ column: str,
17564
+ values: Any,
17565
+ for_failure: bool,
17566
+ locale: str | None = None,
17567
+ n_rows: int | None = None,
16758
17568
  ) -> str:
16759
17569
  if assertion_type in [
16760
17570
  "col_vals_gt",
@@ -16878,6 +17688,16 @@ def _create_autobrief_or_failure_text(
16878
17688
  for_failure=for_failure,
16879
17689
  )
16880
17690
 
17691
+ if assertion_type == "col_pct_null":
17692
+ return _create_text_col_pct_null(
17693
+ lang=lang,
17694
+ column=column,
17695
+ value=values,
17696
+ for_failure=for_failure,
17697
+ locale=locale if locale else lang,
17698
+ n_rows=n_rows,
17699
+ )
17700
+
16881
17701
  if assertion_type == "conjointly":
16882
17702
  return _create_text_conjointly(lang=lang, for_failure=for_failure)
16883
17703
 
@@ -16893,7 +17713,7 @@ def _create_autobrief_or_failure_text(
16893
17713
  for_failure=for_failure,
16894
17714
  )
16895
17715
 
16896
- return None # pragma: no cover
17716
+ return None
16897
17717
 
16898
17718
 
16899
17719
  def _expect_failure_type(for_failure: bool) -> str:
@@ -16903,7 +17723,7 @@ def _expect_failure_type(for_failure: bool) -> str:
16903
17723
  def _create_text_comparison(
16904
17724
  assertion_type: str,
16905
17725
  lang: str,
16906
- column: str | list[str] | None,
17726
+ column: str | list[str],
16907
17727
  values: str | None,
16908
17728
  for_failure: bool = False,
16909
17729
  ) -> str:
@@ -16929,7 +17749,7 @@ def _create_text_comparison(
16929
17749
 
16930
17750
  def _create_text_between(
16931
17751
  lang: str,
16932
- column: str | None,
17752
+ column: str,
16933
17753
  value_1: str,
16934
17754
  value_2: str,
16935
17755
  not_: bool = False,
@@ -16959,7 +17779,7 @@ def _create_text_between(
16959
17779
 
16960
17780
 
16961
17781
  def _create_text_set(
16962
- lang: str, column: str | None, values: list[any], not_: bool = False, for_failure: bool = False
17782
+ lang: str, column: str, values: list[Any], not_: bool = False, for_failure: bool = False
16963
17783
  ) -> str:
16964
17784
  type_ = _expect_failure_type(for_failure=for_failure)
16965
17785
 
@@ -16981,9 +17801,7 @@ def _create_text_set(
16981
17801
  return text
16982
17802
 
16983
17803
 
16984
- def _create_text_null(
16985
- lang: str, column: str | None, not_: bool = False, for_failure: bool = False
16986
- ) -> str:
17804
+ def _create_text_null(lang: str, column: str, not_: bool = False, for_failure: bool = False) -> str:
16987
17805
  type_ = _expect_failure_type(for_failure=for_failure)
16988
17806
 
16989
17807
  column_text = _prep_column_text(column=column)
@@ -17000,9 +17818,7 @@ def _create_text_null(
17000
17818
  return text
17001
17819
 
17002
17820
 
17003
- def _create_text_regex(
17004
- lang: str, column: str | None, pattern: str | dict, for_failure: bool = False
17005
- ) -> str:
17821
+ def _create_text_regex(lang: str, column: str, pattern: str, for_failure: bool = False) -> str:
17006
17822
  type_ = _expect_failure_type(for_failure=for_failure)
17007
17823
 
17008
17824
  column_text = _prep_column_text(column=column)
@@ -17034,7 +17850,7 @@ def _create_text_expr(lang: str, for_failure: bool) -> str:
17034
17850
  return EXPECT_FAIL_TEXT[f"col_vals_expr_{type_}_text"][lang]
17035
17851
 
17036
17852
 
17037
- def _create_text_col_exists(lang: str, column: str | None, for_failure: bool = False) -> str:
17853
+ def _create_text_col_exists(lang: str, column: str, for_failure: bool = False) -> str:
17038
17854
  type_ = _expect_failure_type(for_failure=for_failure)
17039
17855
 
17040
17856
  column_text = _prep_column_text(column=column)
@@ -17084,7 +17900,7 @@ def _create_text_rows_complete(
17084
17900
  return text
17085
17901
 
17086
17902
 
17087
- def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str:
17903
+ def _create_text_row_count_match(lang: str, value: dict, for_failure: bool = False) -> str:
17088
17904
  type_ = _expect_failure_type(for_failure=for_failure)
17089
17905
 
17090
17906
  values_text = _prep_values_text(value["count"], lang=lang)
@@ -17092,7 +17908,7 @@ def _create_text_row_count_match(lang: str, value: int, for_failure: bool = Fals
17092
17908
  return EXPECT_FAIL_TEXT[f"row_count_match_n_{type_}_text"][lang].format(values_text=values_text)
17093
17909
 
17094
17910
 
17095
- def _create_text_col_count_match(lang: str, value: int, for_failure: bool = False) -> str:
17911
+ def _create_text_col_count_match(lang: str, value: dict, for_failure: bool = False) -> str:
17096
17912
  type_ = _expect_failure_type(for_failure=for_failure)
17097
17913
 
17098
17914
  values_text = _prep_values_text(value["count"], lang=lang)
@@ -17100,6 +17916,115 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
17100
17916
  return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
17101
17917
 
17102
17918
 
17919
+ def _create_text_col_pct_null(
17920
+ lang: str,
17921
+ column: str | None,
17922
+ value: dict,
17923
+ for_failure: bool = False,
17924
+ locale: str | None = None,
17925
+ n_rows: int | None = None,
17926
+ ) -> str:
17927
+ """Create text for col_pct_null validation with tolerance handling."""
17928
+ type_ = _expect_failure_type(for_failure=for_failure)
17929
+
17930
+ column_text = _prep_column_text(column=column)
17931
+
17932
+ # Use locale for number formatting, defaulting to lang if not provided
17933
+ fmt_locale = locale if locale else lang
17934
+
17935
+ # Extract p and tol from the values dict
17936
+ p_value = value.get("p", 0) * 100 # Convert to percentage
17937
+ p_value_original = value.get("p", 0) # Keep original value for deviation format
17938
+
17939
+ # Extract tol from the bound_finder partial function
17940
+ bound_finder = value.get("bound_finder")
17941
+ tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
17942
+
17943
+ # Handle different tolerance types
17944
+ has_tolerance = False
17945
+ is_asymmetric = False
17946
+
17947
+ if isinstance(tol_value, tuple):
17948
+ # Tuple tolerance: can be (lower, upper) in absolute or relative terms
17949
+ tol_lower, tol_upper = tol_value
17950
+
17951
+ # Check if we have any non-zero tolerance
17952
+ has_tolerance = tol_lower != 0 or tol_upper != 0
17953
+ is_asymmetric = tol_lower != tol_upper
17954
+
17955
+ # For relative tolerances (floats < 1), we can compute exact percentage bounds
17956
+ # For absolute tolerances (ints >= 1), calculate based on actual row count if available
17957
+ if tol_lower < 1:
17958
+ # Relative tolerance (float)
17959
+ lower_pct_delta = tol_lower * 100
17960
+ else:
17961
+ # Absolute tolerance (int); uses actual row count if available
17962
+ if n_rows is not None and n_rows > 0:
17963
+ lower_pct_delta = (tol_lower / n_rows) * 100
17964
+ else:
17965
+ lower_pct_delta = tol_lower # Fallback approximation
17966
+
17967
+ if tol_upper < 1:
17968
+ # Relative tolerance (float)
17969
+ upper_pct_delta = tol_upper * 100
17970
+ else:
17971
+ # Absolute tolerance (int); uses actual row count if available
17972
+ if n_rows is not None and n_rows > 0:
17973
+ upper_pct_delta = (tol_upper / n_rows) * 100
17974
+ else:
17975
+ upper_pct_delta = tol_upper # Fallback approximation
17976
+ else:
17977
+ # Single value tolerance: symmetric
17978
+ has_tolerance = tol_value != 0
17979
+
17980
+ if tol_value < 1:
17981
+ # Relative tolerance (float)
17982
+ tol_pct = tol_value * 100
17983
+ else:
17984
+ # Absolute tolerance (int) - use actual row count if available
17985
+ if n_rows is not None and n_rows > 0:
17986
+ tol_pct = (tol_value / n_rows) * 100
17987
+ else:
17988
+ tol_pct = tol_value # Fallback approximation
17989
+
17990
+ lower_pct_delta = tol_pct
17991
+ upper_pct_delta = tol_pct
17992
+
17993
+ # Format numbers with locale-aware formatting
17994
+ p_formatted = _format_number_safe(p_value, decimals=1, locale=fmt_locale)
17995
+ p_original_formatted = _format_number_safe(p_value_original, decimals=2, locale=fmt_locale)
17996
+
17997
+ # Choose the appropriate translation key based on tolerance
17998
+ if not has_tolerance:
17999
+ # No tolerance - use simple text
18000
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text"][lang].format(
18001
+ column_text=column_text,
18002
+ p=p_formatted,
18003
+ )
18004
+ elif is_asymmetric or isinstance(tol_value, tuple):
18005
+ # Use deviation format for tuple tolerances (including symmetric ones)
18006
+ # Format the deviation values with signs (using proper minus sign U+2212)
18007
+ lower_dev = f"−{_format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)}%"
18008
+ upper_dev = f"+{_format_number_safe(upper_pct_delta, decimals=1, locale=fmt_locale)}%"
18009
+
18010
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol_deviation"][lang].format(
18011
+ column_text=column_text,
18012
+ lower_dev=lower_dev,
18013
+ upper_dev=upper_dev,
18014
+ p=p_original_formatted,
18015
+ )
18016
+ else:
18017
+ # Single value tolerance - use the symmetric ± format
18018
+ tol_formatted = _format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)
18019
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol"][lang].format(
18020
+ column_text=column_text,
18021
+ p=p_formatted,
18022
+ tol=tol_formatted,
18023
+ )
18024
+
18025
+ return text
18026
+
18027
+
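A worked example of the conversion above, for `p=0.10` with an absolute tolerance of 5 rows and `n_rows=200`:

```python
p_pct = 0.10 * 100         # 10.0 -> target percentage of null values
tol_pct = (5 / 200) * 100  # 2.5  -> 5 rows of tolerance expressed as a %
print(f"expected {p_pct:.1f}% null (±{tol_pct:.1f}%)")
```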
17103
18028
  def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
17104
18029
  type_ = _expect_failure_type(for_failure=for_failure)
17105
18030
 
@@ -17120,19 +18045,13 @@ def _create_text_prompt(lang: str, prompt: str, for_failure: bool = False) -> st
17120
18045
  def _prep_column_text(column: str | list[str]) -> str:
17121
18046
  if isinstance(column, list):
17122
18047
  return "`" + str(column[0]) + "`"
17123
- elif isinstance(column, str):
18048
+ if isinstance(column, str):
17124
18049
  return "`" + column + "`"
17125
- else:
17126
- return ""
18050
+ raise AssertionError
17127
18051
 
17128
18052
 
17129
18053
  def _prep_values_text(
17130
- values: str
17131
- | int
17132
- | float
17133
- | datetime.datetime
17134
- | datetime.date
17135
- | list[str | int | float | datetime.datetime | datetime.date],
18054
+ values: _CompliantValue | _CompliantValues,
17136
18055
  lang: str,
17137
18056
  limit: int = 3,
17138
18057
  ) -> str:
@@ -17180,7 +18099,7 @@ def _prep_values_text(
17180
18099
  return values_str
17181
18100
 
17182
18101
 
17183
- def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> list[tuple[str, str]]:
18102
+ def _seg_expr_from_string(data_tbl: Any, segments_expr: str) -> tuple[str, str]:
17184
18103
  """
17185
18104
  Obtain the segmentation categories from a table column.
17186
18105
 
@@ -17283,7 +18202,7 @@ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, Any]]:
17283
18202
  return seg_tuples
17284
18203
 
17285
18204
 
17286
- def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
18205
+ def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any:
17287
18206
  """
17288
18207
  Apply the segments expression to the data table.
17289
18208
 
@@ -17347,8 +18266,26 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
17347
18266
  except ValueError: # pragma: no cover
17348
18267
  pass # pragma: no cover
17349
18268
 
17350
- # Format 2: Datetime strings with UTC timezone like
17351
- # "2016-01-04 00:00:01 UTC.strict_cast(...)"
18269
+ # Format 2: Direct datetime strings like "2016-01-04 00:00:01" (Polars 1.36+)
18270
+ # These don't have UTC suffix anymore
18271
+ elif (
18272
+ " " in segment_str
18273
+ and "UTC" not in segment_str
18274
+ and "[" not in segment_str
18275
+ and ".alias" not in segment_str
18276
+ ):
18277
+ try:
18278
+ parsed_dt = datetime.datetime.fromisoformat(segment_str)
18279
+ # Convert midnight datetimes to dates for consistency
18280
+ if parsed_dt.time() == datetime.datetime.min.time():
18281
+ parsed_value = parsed_dt.date() # pragma: no cover
18282
+ else:
18283
+ parsed_value = parsed_dt
18284
+ except ValueError: # pragma: no cover
18285
+ pass # pragma: no cover
18286
+
18287
+ # Format 3: Datetime strings with UTC timezone like
18288
+ # "2016-01-04 00:00:01 UTC.strict_cast(...)" (Polars < 1.36)
17352
18289
  elif " UTC" in segment_str:
17353
18290
  try:
17354
18291
  # Extract just the datetime part before "UTC"
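The Format 2 branch above handles the plain datetime strings emitted by newer Polars versions; the parse-and-normalize logic in isolation (using the module-style `import datetime` seen elsewhere in this file):

```python
import datetime

segment_str = "2016-01-04 00:00:00"
parsed_dt = datetime.datetime.fromisoformat(segment_str)
# Midnight datetimes are normalized to plain dates, as in the branch above
value = parsed_dt.date() if parsed_dt.time() == datetime.datetime.min.time() else parsed_dt
print(value)  # 2016-01-04
```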
@@ -17363,7 +18300,7 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
17363
18300
  except (ValueError, IndexError): # pragma: no cover
17364
18301
  pass # pragma: no cover
17365
18302
 
17366
- # Format 3: Bracketed expressions like ['2016-01-04']
18303
+ # Format 4: Bracketed expressions like ['2016-01-04']
17367
18304
  elif segment_str.startswith("[") and segment_str.endswith("]"):
17368
18305
  try: # pragma: no cover
17369
18306
  # Remove [' and ']
@@ -17498,7 +18435,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
17498
18435
 
17499
18436
  def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
17500
18437
  # For each icon, get the assertion icon SVG test from SVG_ICONS_FOR_ASSERTION_TYPES dictionary
17501
- icon_svg = [SVG_ICONS_FOR_ASSERTION_TYPES.get(icon) for icon in icon]
18438
+ icon_svg: list[str] = [SVG_ICONS_FOR_ASSERTION_TYPES[icon] for icon in icon]
17502
18439
 
17503
18440
  # Replace the width and height in the SVG string
17504
18441
  for i in range(len(icon_svg)):
@@ -17507,11 +18444,9 @@ def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
17507
18444
  return icon_svg
17508
18445
 
17509
18446
 
17510
- def _replace_svg_dimensions(svg: list[str], height_width: int | float) -> list[str]:
18447
+ def _replace_svg_dimensions(svg: str, height_width: int | float) -> str:
17511
18448
  svg = re.sub(r'width="[0-9]*?px', f'width="{height_width}px', svg)
17512
- svg = re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)
17513
-
17514
- return svg
18449
+ return re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)
17515
18450
 
17516
18451
 
17517
18452
  def _get_title_text(
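The two `re.sub()` calls in `_replace_svg_dimensions()` only rewrite the pixel dimensions; a quick standalone check of the same substitutions against a hypothetical SVG fragment:

```python
import re

def replace_svg_dimensions(svg: str, height_width: int | float) -> str:
    # Same two chained substitutions as in the function above
    svg = re.sub(r'width="[0-9]*?px', f'width="{height_width}px', svg)
    return re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)

print(replace_svg_dimensions('<svg width="30px" height="30px">', 20))
# -> <svg width="20px" height="20px">
```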
@@ -17575,7 +18510,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
17575
18510
  return title_text
17576
18511
 
17577
18512
 
17578
- def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
18513
+ def _transform_tbl_preprocessed(pre: Any, seg: Any, interrogation_performed: bool) -> list[str]:
17579
18514
  # If no interrogation was performed, return a list of empty strings
17580
18515
  if not interrogation_performed:
17581
18516
  return ["" for _ in range(len(pre))]
@@ -17597,9 +18532,7 @@ def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: boo
17597
18532
 
17598
18533
  def _get_preprocessed_table_icon(icon: list[str]) -> list[str]:
17599
18534
  # For each icon, get the SVG icon from the SVG_ICONS_FOR_TBL_STATUS dictionary
17600
- icon_svg = [SVG_ICONS_FOR_TBL_STATUS.get(icon) for icon in icon]
17601
-
17602
- return icon_svg
18535
+ return [SVG_ICONS_FOR_TBL_STATUS[icon] for icon in icon]
17603
18536
 
17604
18537
 
17605
18538
  def _transform_eval(
@@ -17677,9 +18610,9 @@ def _transform_test_units(
17677
18610
  return _format_single_number_with_gt(
17678
18611
  value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
17679
18612
  )
17680
- else:
17681
- # Fallback to the original behavior
17682
- return str(vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0])
18613
+ formatted = vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)
18614
+ assert isinstance(formatted, list)
18615
+ return formatted[0]
17683
18616
 
17684
18617
  return [
17685
18618
  (
@@ -17883,22 +18816,21 @@ def _transform_assertion_str(
17883
18816
  return type_upd
17884
18817
 
17885
18818
 
17886
- def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str]:
18819
+ def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str] | None:
17887
18820
  if isinstance(pre, Callable):
17888
18821
  return _get_callable_source(fn=pre)
18822
+ return None
17889
18823
 
17890
18824
 
17891
18825
  def _get_callable_source(fn: Callable) -> str:
17892
- if isinstance(fn, Callable):
17893
- try:
17894
- source_lines, _ = inspect.getsourcelines(fn)
17895
- source = "".join(source_lines).strip()
17896
- # Extract the `pre` argument from the source code
17897
- pre_arg = _extract_pre_argument(source)
17898
- return pre_arg
17899
- except (OSError, TypeError): # pragma: no cover
17900
- return fn.__name__
17901
- return fn # pragma: no cover
18826
+ try:
18827
+ source_lines, _ = inspect.getsourcelines(fn)
18828
+ source = "".join(source_lines).strip()
18829
+ # Extract the `pre` argument from the source code
18830
+ pre_arg = _extract_pre_argument(source)
18831
+ return pre_arg
18832
+ except (OSError, TypeError): # pragma: no cover
18833
+ return fn.__name__ # ty: ignore
17902
18834
 
17903
18835
 
17904
18836
  def _extract_pre_argument(source: str) -> str:
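`_get_callable_source()` depends on `inspect.getsourcelines()`, which requires the callable's source file to be available; the `except (OSError, TypeError)` fallback to `fn.__name__` covers REPL-defined or built-in callables. A small illustration with a hypothetical `pre=` callable:

```python
import inspect

def keep_first_100(df):
    # Hypothetical preprocessing function passed as `pre=`
    return df.head(100)

source_lines, _ = inspect.getsourcelines(keep_first_100)
print("".join(source_lines).strip())  # prints the function's own source text
```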
@@ -17924,6 +18856,7 @@ def _create_table_time_html(
17924
18856
  if time_start is None:
17925
18857
  return ""
17926
18858
 
18859
+ assert time_end is not None # typing
17927
18860
  # Get the time duration (difference between `time_end` and `time_start`) in seconds
17928
18861
  time_duration = (time_end - time_start).total_seconds()
17929
18862
 
@@ -18138,11 +19071,11 @@ def _format_number_safe(
18138
19071
  locale=locale,
18139
19072
  df_lib=df_lib,
18140
19073
  )
18141
- else:
18142
- # Fallback to the original behavior
18143
- return fmt_number(
18144
- value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
18145
- )[0] # pragma: no cover
19074
+ ints = fmt_number(
19075
+ value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
19076
+ )
19077
+ assert isinstance(ints, list)
19078
+ return ints[0]
18146
19079
 
18147
19080
 
18148
19081
  def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
@@ -18155,9 +19088,10 @@ def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
18155
19088
  if df_lib is not None and value is not None:
18156
19089
  # Use GT-based formatting to avoid Pandas dependency completely
18157
19090
  return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
18158
- else:
18159
- # Fallback to the original behavior
18160
- return fmt_integer(value, locale=locale)[0]
19091
+
19092
+ ints = fmt_integer(value, locale=locale)
19093
+ assert isinstance(ints, list)
19094
+ return ints[0]
18161
19095
 
18162
19096
 
18163
19097
  def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
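The refactors in `_format_number_safe()` and `_format_integer_safe()` swap the old `else` fallbacks for `assert isinstance(...)` checks before indexing: the great_tables `vals.fmt_*` helpers are annotated more broadly than `list`, so the assert narrows the type for static checkers while preserving the old `[0]` indexing at runtime. A minimal sketch of the pattern:

```python
from great_tables import vals

def first_formatted(value: float) -> str:
    formatted = vals.fmt_number(value, n_sigfig=3, compact=True)
    assert isinstance(formatted, list)  # narrows the union type for the checker
    return formatted[0]
```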
@@ -18273,7 +19207,7 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
18273
19207
  HTML string containing the formatted threshold information.
18274
19208
  """
18275
19209
  if thresholds == Thresholds():
18276
- return ""
19210
+ return "" # pragma: no cover
18277
19211
 
18278
19212
  # Get df_lib for formatting
18279
19213
  df_lib = None
@@ -18281,10 +19215,10 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
18281
19215
  import polars as pl
18282
19216
 
18283
19217
  df_lib = pl
18284
- elif _is_lib_present("pandas"):
18285
- import pandas as pd
19218
+ elif _is_lib_present("pandas"): # pragma: no cover
19219
+ import pandas as pd # pragma: no cover
18286
19220
 
18287
- df_lib = pd
19221
+ df_lib = pd # pragma: no cover
18288
19222
 
18289
19223
  # Helper function to format threshold values using the shared formatting functions
18290
19224
  def _format_threshold_value(fraction: float | None, count: int | None) -> str:
@@ -18292,10 +19226,12 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
18292
19226
  # Format as fraction/percentage with locale formatting
18293
19227
  if fraction == 0:
18294
19228
  return "0"
18295
- elif fraction < 0.01:
19229
+ elif fraction < 0.01: # pragma: no cover
18296
19230
  # For very small fractions, show "<0.01" with locale formatting
18297
- formatted = _format_number_safe(0.01, decimals=2, locale=locale, df_lib=df_lib)
18298
- return f"&lt;{formatted}"
19231
+ formatted = _format_number_safe(
19232
+ 0.01, decimals=2, locale=locale, df_lib=df_lib
19233
+ ) # pragma: no cover
19234
+ return f"&lt;{formatted}" # pragma: no cover
18299
19235
  else:
18300
19236
  # Use shared formatting function with drop_trailing_zeros
18301
19237
  formatted = _format_number_safe(
@@ -18372,14 +19308,14 @@ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
18372
19308
  if fraction is not None:
18373
19309
  if fraction == 0:
18374
19310
  return "0"
18375
- elif fraction < 0.01:
18376
- return "<0.01"
19311
+ elif fraction < 0.01: # pragma: no cover
19312
+ return "<0.01" # pragma: no cover
18377
19313
  else:
18378
19314
  return f"{fraction:.2f}".rstrip("0").rstrip(".")
18379
19315
  elif count is not None:
18380
19316
  return str(count)
18381
19317
  else:
18382
- return "—"
19318
+ return "—" # pragma: no cover
18383
19319
 
18384
19320
  parts = []
18385
19321
 
@@ -18398,7 +19334,7 @@ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
18398
19334
  if parts:
18399
19335
  return "Step-specific thresholds set: " + ", ".join(parts)
18400
19336
  else:
18401
- return ""
19337
+ return "" # pragma: no cover
18402
19338
 
18403
19339
 
18404
19340
  def _create_threshold_reset_note_html(locale: str = "en") -> str:
@@ -18433,79 +19369,678 @@ def _create_threshold_reset_note_text() -> str:
18433
19369
  return "Global thresholds explicitly not used for this step."
18434
19370
 
18435
19371
 
18436
- def _step_report_row_based(
18437
- assertion_type: str,
18438
- i: int,
18439
- column: str,
18440
- column_position: int,
18441
- columns_subset: list[str] | None,
18442
- values: any,
18443
- inclusive: tuple[bool, bool] | None,
18444
- n: int,
18445
- n_failed: int,
18446
- all_passed: bool,
18447
- extract: any,
18448
- tbl_preview: GT,
18449
- header: str,
18450
- limit: int | None,
18451
- lang: str,
18452
- ) -> GT:
18453
- # Get the length of the extracted data for the step
18454
- extract_length = get_row_count(extract)
18455
-
18456
- # Determine whether the `lang` value represents a right-to-left language
18457
- is_rtl_lang = lang in RTL_LANGUAGES
18458
- direction_rtl = " direction: rtl;" if is_rtl_lang else ""
19372
+ def _create_no_columns_resolved_note_html(
19373
+ column_expr: str, available_columns: list[str], locale: str = "en"
19374
+ ) -> str:
19375
+ """
19376
+ Create an HTML note explaining that a column expression resolved to no columns.
18459
19377
 
18460
- # Generate text that indicates the assertion for the validation step
18461
- if assertion_type == "col_vals_gt":
18462
- text = f"{column} > {values}"
18463
- elif assertion_type == "col_vals_lt":
18464
- text = f"{column} < {values}"
18465
- elif assertion_type == "col_vals_eq":
18466
- text = f"{column} = {values}"
18467
- elif assertion_type == "col_vals_ne":
18468
- text = f"{column} &ne; {values}"
18469
- elif assertion_type == "col_vals_ge":
18470
- text = f"{column} &ge; {values}"
18471
- elif assertion_type == "col_vals_le":
18472
- text = f"{column} &le; {values}"
18473
- elif assertion_type == "col_vals_between":
18474
- symbol_left = "&le;" if inclusive[0] else "&lt;"
18475
- symbol_right = "&le;" if inclusive[1] else "&lt;"
18476
- text = f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}"
18477
- elif assertion_type == "col_vals_outside":
18478
- symbol_left = "&lt;" if inclusive[0] else "&le;"
18479
- symbol_right = "&gt;" if inclusive[1] else "&ge;"
18480
- text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}"
18481
- elif assertion_type == "col_vals_in_set":
18482
- elements = ", ".join(map(str, values))
18483
- text = f"{column} &isinv; {{{elements}}}"
18484
- elif assertion_type == "col_vals_not_in_set":
18485
- elements = ", ".join(values)
18486
- text = f"{column} &NotElement; {{{elements}}}"
18487
- elif assertion_type == "col_vals_regex":
18488
- pattern = values["pattern"]
18489
- text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=pattern)
18490
- elif assertion_type == "col_vals_null":
18491
- text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
18492
- elif assertion_type == "col_vals_not_null":
18493
- text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
18494
- elif assertion_type == "col_vals_expr":
18495
- text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
18496
- elif assertion_type == "rows_complete":
18497
- if column is None:
18498
- text = STEP_REPORT_TEXT["rows_complete_all"][lang]
18499
- else:
18500
- text = STEP_REPORT_TEXT["rows_complete_subset"][lang]
19378
+ Parameters
19379
+ ----------
19380
+ column_expr
19381
+ The column expression that failed to resolve columns (as a string).
19382
+ available_columns
19383
+ List of available column names in the table.
19384
+ locale
19385
+ The locale string (e.g., 'en', 'fr').
18501
19386
 
18502
- # Wrap assertion text in a <code> tag
18503
- text = (
18504
- f"<code style='color: #303030; font-family: monospace; font-size: smaller;'>{text}</code>"
19387
+ Returns
19388
+ -------
19389
+ str
19390
+ HTML-formatted note text.
19391
+ """
19392
+ # Get translated strings
19393
+ intro = NOTES_TEXT.get("column_not_found_intro", {}).get(
19394
+ locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression")
19395
+ )
19396
+ no_resolve = NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
19397
+ locale,
19398
+ NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
19399
+ "en", "does not resolve to any columns"
19400
+ ),
18505
19401
  )
18506
19402
 
18507
- if all_passed:
18508
- # Style the target column in green and add borders but only if that column is present
19403
+ # Format the column expression with monospace font
19404
+ col_expr_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_expr}</code>"
19405
+
19406
+ # Build the HTML note
19407
+ html = f"{intro} {col_expr_html} {no_resolve}."
19408
+
19409
+ return html
19410
+
19411
+
19412
+ def _create_no_columns_resolved_note_text(column_expr: str, available_columns: list[str]) -> str:
19413
+ """
19414
+ Create a plain text note explaining that a column expression resolved to no columns.
19415
+
19416
+ Parameters
19417
+ ----------
19418
+ column_expr
19419
+ The column expression that failed to resolve columns (as a string).
19420
+ available_columns
19421
+ List of available column names in the table.
19422
+
19423
+ Returns
19424
+ -------
19425
+ str
19426
+ Plain text note.
19427
+ """
19428
+ return f"The column expression `{column_expr}` does not resolve to any columns."
19429
+
19430
+
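All of these note builders share the same three-step translation fallback: requested locale, then English, then a hard-coded default. A standalone sketch of the lookup (with a stand-in `NOTES_TEXT`; the real table lives in the package's constants):

```python
NOTES_TEXT = {
    "column_not_found_intro": {
        "en": "The column expression",
        "fr": "L'expression de colonne",
    }
}

locale = "de"  # no translation available for this locale
intro = NOTES_TEXT.get("column_not_found_intro", {}).get(
    locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression")
)
print(intro)  # -> The column expression  (falls back to English)
```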
19431
+ def _create_column_not_found_note_html(
19432
+ column_name: str, available_columns: list[str], locale: str = "en"
19433
+ ) -> str:
19434
+ """
19435
+ Create an HTML note explaining that a specific column was not found.
19436
+
19437
+ Parameters
19438
+ ----------
19439
+ column_name
19440
+ The column name that was not found.
19441
+ available_columns
19442
+ List of available column names in the table.
19443
+ locale
19444
+ The locale string (e.g., 'en', 'fr').
19445
+
19446
+ Returns
19447
+ -------
19448
+ str
19449
+ HTML-formatted note text.
19450
+ """
19451
+ # Get translated strings
19452
+ intro = NOTES_TEXT.get("target_column_provided", {}).get(
19453
+ locale, NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided")
19454
+ )
19455
+ not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19456
+ locale,
19457
+ NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19458
+ "en", "does not match any columns in the table"
19459
+ ),
19460
+ )
19461
+
19462
+ # Format the column name with monospace font
19463
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19464
+
19465
+ # Build the HTML note
19466
+ html = f"{intro} ({col_name_html}) {not_found}."
19467
+
19468
+ return html
19469
+
19470
+
19471
+ def _create_column_not_found_note_text(column_name: str, available_columns: list[str]) -> str:
19472
+ """
19473
+ Create a plain text note explaining that a specific column was not found.
19474
+
19475
+ Parameters
19476
+ ----------
19477
+ column_name
19478
+ The column name that was not found.
19479
+ available_columns
19480
+ List of available column names in the table.
19481
+
19482
+ Returns
19483
+ -------
19484
+ str
19485
+ Plain text note.
19486
+ """
19487
+ return f"The target column provided ({column_name}) does not match any columns in the table."
19488
+
19489
+
19490
+ def _create_comparison_column_not_found_note_html(
19491
+ column_name: str, position: str | None, available_columns: list[str], locale: str = "en"
19492
+ ) -> str:
19493
+ """
19494
+ Create an HTML note explaining that a comparison column was not found.
19495
+
19496
+ Parameters
19497
+ ----------
19498
+ column_name
19499
+ The comparison column name that was not found.
19500
+ position
19501
+ Optional position indicator ("left", "right") for between/outside validations.
19502
+ available_columns
19503
+ List of available column names in the table.
19504
+ locale
19505
+ The locale string (e.g., 'en', 'fr').
19506
+
19507
+ Returns
19508
+ -------
19509
+ str
19510
+ HTML-formatted note text.
19511
+ """
19512
+ # Get translated strings
19513
+ intro = NOTES_TEXT.get("comparison_column_provided", {}).get(
19514
+ locale,
19515
+ NOTES_TEXT.get("comparison_column_provided", {}).get(
19516
+ "en", "The comparison column provided"
19517
+ ),
19518
+ )
19519
+ intro_with_for = NOTES_TEXT.get("comparison_column_for", {}).get(
19520
+ locale,
19521
+ NOTES_TEXT.get("comparison_column_for", {}).get("en", "The comparison column provided for"),
19522
+ )
19523
+ not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19524
+ locale,
19525
+ NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19526
+ "en", "does not match any columns in the table"
19527
+ ),
19528
+ )
19529
+
19530
+ # Format the column name with monospace font
19531
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19532
+
19533
+ # Add position if provided (for between/outside validations)
19534
+ if position:
19535
+ # Format position parameter with monospace font (e.g., "left=", "right=")
19536
+ position_param = (
19537
+ f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{position}=</code>"
19538
+ )
19539
+ # Use the "for" version of the intro text
19540
+ html = f"{intro_with_for} {position_param} ({col_name_html}) {not_found}."
19541
+ else:
19542
+ # Use the standard intro text without "for"
19543
+ html = f"{intro} ({col_name_html}) {not_found}."
19544
+
19545
+ return html
19546
+
19547
+
19548
+ def _create_comparison_column_not_found_note_text(
19549
+ column_name: str, position: str | None, available_columns: list[str]
19550
+ ) -> str:
19551
+ """
19552
+ Create a plain text note explaining that a comparison column was not found.
19553
+
19554
+ Parameters
19555
+ ----------
19556
+ column_name
19557
+ The comparison column name that was not found.
19558
+ position
19559
+ Optional position indicator ("left", "right") for between/outside validations.
19560
+ available_columns
19561
+ List of available column names in the table.
19562
+
19563
+ Returns
19564
+ -------
19565
+ str
19566
+ Plain text note.
19567
+ """
19568
+ if position:
19569
+ position_text = f" for {position}="
19570
+ else:
19571
+ position_text = ""
19572
+
19573
+ return (
19574
+ f"The comparison column provided{position_text} ({column_name}) "
19575
+ f"does not match any columns in the table."
19576
+ )
19577
+
19578
+
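For concreteness, the plain-text builders above yield strings like the following (column names hypothetical; assumes the helpers defined above are in scope):

```python
print(_create_column_not_found_note_text("revenue", ["a", "b"]))
# -> The target column provided (revenue) does not match any columns in the table.

print(_create_comparison_column_not_found_note_text("floor_col", "left", ["a", "b"]))
# -> The comparison column provided for left= (floor_col) does not match any
#    columns in the table.
```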
19579
+ def _create_preprocessing_note_html(
19580
+ original_rows: int,
19581
+ original_cols: int,
19582
+ processed_rows: int,
19583
+ processed_cols: int,
19584
+ locale: str = "en",
19585
+ ) -> str:
19586
+ """
19587
+ Create an HTML note showing table dimension changes from preprocessing.
19588
+
19589
+ Parameters
19590
+ ----------
19591
+ original_rows
19592
+ Number of rows in the original table.
19593
+ original_cols
19594
+ Number of columns in the original table.
19595
+ processed_rows
19596
+ Number of rows after preprocessing.
19597
+ processed_cols
19598
+ Number of columns after preprocessing.
19599
+ locale
19600
+ The locale string (e.g., 'en', 'fr').
19601
+
19602
+ Returns
19603
+ -------
19604
+ str
19605
+ HTML-formatted note text.
19606
+ """
19607
+ # Get translated strings
19608
+ precondition_text = NOTES_TEXT.get("precondition_applied", {}).get(
19609
+ locale, NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied")
19610
+ )
19611
+ table_dims_text = NOTES_TEXT.get("table_dimensions", {}).get(
19612
+ locale, NOTES_TEXT.get("table_dimensions", {}).get("en", "table dimensions")
19613
+ )
19614
+
19615
+ # Helper function to get singular or plural form
19616
+ def get_row_text(count: int) -> str:
19617
+ if count == 1:
19618
+ return NOTES_TEXT.get("row", {}).get(locale, NOTES_TEXT.get("row", {}).get("en", "row"))
19619
+ return NOTES_TEXT.get("rows", {}).get(locale, NOTES_TEXT.get("rows", {}).get("en", "rows"))
19620
+
19621
+ def get_col_text(count: int) -> str:
19622
+ if count == 1:
19623
+ return NOTES_TEXT.get("column", {}).get(
19624
+ locale, NOTES_TEXT.get("column", {}).get("en", "column")
19625
+ )
19626
+ return NOTES_TEXT.get("columns", {}).get(
19627
+ locale, NOTES_TEXT.get("columns", {}).get("en", "columns")
19628
+ )
19629
+
19630
+ # Determine which dimensions changed
19631
+ rows_changed = original_rows != processed_rows
19632
+ cols_changed = original_cols != processed_cols
19633
+
19634
+ # Format original dimensions
19635
+ original_rows_text = get_row_text(original_rows)
19636
+ original_cols_text = get_col_text(original_cols)
19637
+ original_dim = (
19638
+ f'<span style="font-family: monospace;">'
19639
+ f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}]"
19640
+ f"</span>"
19641
+ )
19642
+
19643
+ # Format processed dimensions with bold for changed values
19644
+ processed_rows_text = get_row_text(processed_rows)
19645
+ processed_cols_text = get_col_text(processed_cols)
19646
+
19647
+ if rows_changed:
19648
+ rows_display = f"<strong>{processed_rows:,}</strong> {processed_rows_text}"
19649
+ else:
19650
+ rows_display = f"{processed_rows:,} {processed_rows_text}"
19651
+
19652
+ if cols_changed:
19653
+ cols_display = f"<strong>{processed_cols}</strong> {processed_cols_text}"
19654
+ else:
19655
+ cols_display = f"{processed_cols} {processed_cols_text}"
19656
+
19657
+ processed_dim = f'<span style="font-family: monospace;">[{rows_display}, {cols_display}]</span>'
19658
+
19659
+ # Build the HTML note
19660
+ html = f"{precondition_text}: {table_dims_text} {original_dim} → {processed_dim}."
19661
+
19662
+ return html
19663
+
19664
+
19665
+ def _create_preprocessing_note_text(
19666
+ original_rows: int,
19667
+ original_cols: int,
19668
+ processed_rows: int,
19669
+ processed_cols: int,
19670
+ ) -> str:
19671
+ """
19672
+ Create a plain text note showing table dimension changes from preprocessing.
19673
+
19674
+ Parameters
19675
+ ----------
19676
+ original_rows
19677
+ Number of rows in the original table.
19678
+ original_cols
19679
+ Number of columns in the original table.
19680
+ processed_rows
19681
+ Number of rows after preprocessing.
19682
+ processed_cols
19683
+ Number of columns after preprocessing.
19684
+
19685
+ Returns
19686
+ -------
19687
+ str
19688
+ Plain text note.
19689
+ """
19690
+ # Get singular or plural forms
19691
+ original_rows_text = "row" if original_rows == 1 else "rows"
19692
+ original_cols_text = "column" if original_cols == 1 else "columns"
19693
+ processed_rows_text = "row" if processed_rows == 1 else "rows"
19694
+ processed_cols_text = "column" if processed_cols == 1 else "columns"
19695
+
19696
+ return (
19697
+ f"Precondition applied: table dimensions "
19698
+ f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}] → "
19699
+ f"[{processed_rows:,} {processed_rows_text}, {processed_cols} {processed_cols_text}]."
19700
+ )
19701
+
19702
+
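A worked example of the dimension note produced by `_create_preprocessing_note_text()` (values hypothetical): a precondition that filters rows and drops a column gives

```python
print(_create_preprocessing_note_text(1000, 5, 950, 4))
# -> Precondition applied: table dimensions [1,000 rows, 5 columns] → [950 rows, 4 columns].
```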
19703
+ def _create_preprocessing_no_change_note_html(locale: str = "en") -> str:
19704
+ """
19705
+ Create an HTML note indicating preprocessing was applied with no dimension change.
19706
+
19707
+ Parameters
19708
+ ----------
19709
+ locale
19710
+ The locale string (e.g., 'en', 'fr').
19711
+
19712
+ Returns
19713
+ -------
19714
+ str
19715
+ HTML-formatted note text.
19716
+ """
19717
+ # Get translated string
19718
+ note_text = NOTES_TEXT.get("precondition_applied_no_change", {}).get(
19719
+ locale,
19720
+ NOTES_TEXT.get("precondition_applied_no_change", {}).get(
19721
+ "en", "Precondition applied: no table dimension change"
19722
+ ),
19723
+ )
19724
+
19725
+ return f"{note_text}."
19726
+
19727
+
19728
+ def _create_preprocessing_no_change_note_text() -> str:
19729
+ """
19730
+ Create a plain text note indicating preprocessing was applied with no dimension change.
19731
+
19732
+ Returns
19733
+ -------
19734
+ str
19735
+ Plain text note.
19736
+ """
19737
+ return "Precondition applied: no table dimension change."
19738
+
19739
+
19740
+ def _create_synthetic_target_column_note_html(column_name: str, locale: str = "en") -> str:
19741
+ """
19742
+ Create an HTML note indicating that the target column was created via preprocessing.
19743
+
19744
+ Parameters
19745
+ ----------
19746
+ column_name
19747
+ The name of the synthetic target column.
19748
+ locale
19749
+ The locale string (e.g., 'en', 'fr').
19750
+
19751
+ Returns
19752
+ -------
19753
+ str
19754
+ HTML-formatted note text.
19755
+ """
19756
+ # Get translated strings
19757
+ synthetic_text = NOTES_TEXT.get("synthetic_target_column", {}).get(
19758
+ locale, NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column")
19759
+ )
19760
+ created_via_text = NOTES_TEXT.get("created_via_preprocessing", {}).get(
19761
+ locale,
19762
+ NOTES_TEXT.get("created_via_preprocessing", {}).get("en", "created via preprocessing"),
19763
+ )
19764
+
19765
+ # Format the column name with monospace font
19766
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19767
+
19768
+ # Build the HTML note
19769
+ html = f"{synthetic_text} {col_name_html} {created_via_text}."
19770
+
19771
+ return html
19772
+
19773
+
19774
+ def _create_synthetic_target_column_note_text(column_name: str) -> str:
19775
+ """
19776
+ Create a plain text note indicating that the target column was created via preprocessing.
19777
+
19778
+ Parameters
19779
+ ----------
19780
+ column_name
19781
+ The name of the synthetic target column.
19782
+
19783
+ Returns
19784
+ -------
19785
+ str
19786
+ Plain text note.
19787
+ """
19788
+ return f"Synthetic target column ({column_name}) created via preprocessing."
19789
+
19790
+
19791
+ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> str:
19792
+ """
19793
+ Create an HTML note with collapsible schema expectation and results.
19794
+
19795
+ This generates a disclosure-style note showing:
19796
+ 1. A summary of what failed (if anything)
19797
+ 2. The full step report table (collapsible)
19798
+
19799
+ Parameters
19800
+ ----------
19801
+ schema_info
19802
+ The schema validation information dictionary from interrogation.
19803
+ locale
19804
+ The locale string (e.g., 'en', 'fr').
19805
+
19806
+ Returns
19807
+ -------
19808
+ str
19809
+ HTML-formatted note with collapsible schema details.
19810
+ """
19811
+ passed = schema_info["passed"]
19812
+ expect_schema = schema_info["expect_schema"]
19813
+ target_schema = schema_info["target_schema"]
19814
+ params = schema_info["params"]
19815
+ columns_dict = schema_info["columns"]
19816
+ in_order = params["in_order"]
19817
+
19818
+ # Get translations for the locale
19819
+ passed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_passed"].get(
19820
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_passed"]["en"]
19821
+ )
19822
+ failed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_failed"].get(
19823
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_failed"]["en"]
19824
+ )
19825
+ disclosure_text = VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"].get(
19826
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"]["en"]
19827
+ )
19828
+ settings_title_text = VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"].get(
19829
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"]
19830
+ )
19831
+
19832
+ # Build summary message
19833
+ if passed:
19834
+ summary = f'<span style="color:#4CA64C;">✓</span> {passed_text}.'
19835
+ else:
19836
+ # Analyze what failed
19837
+ failures = []
19838
+
19839
+ # Check column count mismatch
19840
+ n_expect = len(expect_schema)
19841
+ n_target = len(target_schema)
19842
+ if n_expect != n_target:
19843
+ count_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"].get(
19844
+ locale, VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"]
19845
+ )
19846
+ failures.append(count_mismatch_text.format(n_expect=n_expect, n_target=n_target))
19847
+
19848
+ # Check for unmatched columns
19849
+ unmatched_cols = [col for col, info in columns_dict.items() if not info["colname_matched"]]
19850
+ if unmatched_cols:
19851
+ unmatched_text = VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"].get(
19852
+ locale, VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"]["en"]
19853
+ )
19854
+ failures.append(unmatched_text.format(n=len(unmatched_cols)))
19855
+
19856
+ # Check for wrong order (if in_order=True)
19857
+ if params["in_order"]:
19858
+ wrong_order = [
19859
+ col
19860
+ for col, info in columns_dict.items()
19861
+ if info["colname_matched"] and not info["index_matched"]
19862
+ ]
19863
+ if wrong_order:
19864
+ wrong_order_text = VALIDATION_REPORT_TEXT["note_schema_wrong_order"].get(
19865
+ locale, VALIDATION_REPORT_TEXT["note_schema_wrong_order"]["en"]
19866
+ )
19867
+ failures.append(wrong_order_text.format(n=len(wrong_order)))
19868
+
19869
+ # Check for dtype mismatches
19870
+ dtype_mismatches = [
19871
+ col
19872
+ for col, info in columns_dict.items()
19873
+ if info["colname_matched"] and info["dtype_present"] and not info["dtype_matched"]
19874
+ ]
19875
+ if dtype_mismatches:
19876
+ dtype_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"].get(
19877
+ locale, VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"]["en"]
19878
+ )
19879
+ failures.append(dtype_mismatch_text.format(n=len(dtype_mismatches)))
19880
+
19881
+ if failures:
19882
+ summary = (
19883
+ f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
19884
+ )
19885
+ else:
19886
+ summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.' # pragma: no cover
19887
+
19888
+ # Generate the step report table using the existing function
19889
+ # We'll call either _step_report_schema_in_order or _step_report_schema_any_order
19890
+ # depending on the in_order parameter
19891
+ if in_order: # pragma: no cover
19892
+ step_report_gt = _step_report_schema_in_order( # pragma: no cover
19893
+ step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19894
+ )
19895
+ else:
19896
+ step_report_gt = _step_report_schema_any_order(
19897
+ step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19898
+ )
19899
+
19900
+ # Generate the settings HTML using the existing function
19901
+ settings_html = _create_col_schema_match_params_html(
19902
+ lang=locale,
19903
+ complete=params["complete"],
19904
+ in_order=params["in_order"],
19905
+ case_sensitive_colnames=params["case_sensitive_colnames"],
19906
+ case_sensitive_dtypes=params["case_sensitive_dtypes"],
19907
+ full_match_dtypes=params["full_match_dtypes"],
19908
+ )
19909
+
19910
+ # Remove the inner div containing column_schema_match_str
19911
+ settings_html = re.sub(r'<div style="margin-right: 5px;">.*?</div>', "", settings_html, count=1)
19912
+
19913
+ # Change padding-top from 7px to 2px
19914
+ settings_html = settings_html.replace("padding-top: 7px;", "padding-top: 2px;")
19915
+
19916
+ # Create new source note HTML that includes both settings and schema
19917
+ source_note_html = f"""
19918
+ <div style='padding-bottom: 2px;'>{settings_title_text}</div>
19919
+ <div style='padding-bottom: 4px;'>{settings_html}</div>
19920
+ """
19921
+
19922
+ # Add the settings as an additional source note to the step report
19923
+ step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html)) # type: ignore[union-attr]
19924
+
19925
+ # Extract the HTML from the GT object
19926
+ step_report_html = step_report_gt._repr_html_()
19927
+
19928
+ # Create collapsible section with the step report
19929
+ note_html = f"""
19930
+ {summary}
19931
+
19932
+ <details style="margin-top: 2px; margin-bottom: 8px; font-size: 12px; text-indent: 12px;">
19933
+ <summary style="cursor: pointer; font-weight: bold; color: #555; margin-bottom: -5px;">{disclosure_text}</summary>
19934
+ <div style="margin-top: 6px; padding-left: 15px; padding-right: 15px;">
19935
+
19936
+ {step_report_html}
19937
+
19938
+ </div>
19939
+ </details>
19940
+ """
19941
+
19942
+ return note_html.strip()
19943
+
19944
+
19945
+ def _create_col_schema_match_note_text(schema_info: dict) -> str:
19946
+ """
19947
+ Create a plain text note for schema validation.
19948
+
19949
+ Parameters
19950
+ ----------
19951
+ schema_info
19952
+ The schema validation information dictionary from interrogation.
19953
+
19954
+ Returns
19955
+ -------
19956
+ str
19957
+ Plain text note.
19958
+ """
19959
+ passed = schema_info["passed"]
19960
+ expect_schema = schema_info["expect_schema"]
19961
+ target_schema = schema_info["target_schema"]
19962
+
19963
+ if passed:
19964
+ return f"Schema validation passed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
19965
+ else:
19966
+ return f"Schema validation failed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
19967
+
19968
+
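Since only `passed` and the two schema lengths feed into the text, the note reduces to a count comparison. A hypothetical `schema_info` (schemas as lists of `(name, dtype)` pairs, matching their use elsewhere in this module):

```python
schema_info = {
    "passed": False,
    "expect_schema": [("a", "Int64"), ("b", "Int64")],
    "target_schema": [("a", "Int64"), ("b", "Int64"), ("c", "String")],
}
print(_create_col_schema_match_note_text(schema_info))
# -> Schema validation failed. Expected 2 column(s), found 3.
```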
19969
+ def _step_report_row_based(
19970
+ assertion_type: str,
19971
+ i: int,
19972
+ column: str,
19973
+ column_position: int,
19974
+ columns_subset: list[str] | None,
19975
+ values: Any,
19976
+ inclusive: tuple[bool, bool] | None,
19977
+ n: int,
19978
+ n_failed: int,
19979
+ all_passed: bool,
19980
+ extract: Any,
19981
+ tbl_preview: GT,
19982
+ header: str,
19983
+ limit: int | None,
19984
+ lang: str,
19985
+ ) -> GT:
19986
+ # Get the length of the extracted data for the step
19987
+ extract_length = get_row_count(extract)
19988
+
19989
+ # Determine whether the `lang` value represents a right-to-left language
19990
+ is_rtl_lang = lang in RTL_LANGUAGES
19991
+ direction_rtl = " direction: rtl;" if is_rtl_lang else ""
19992
+
19993
+ # Generate text that indicates the assertion for the validation step
19994
+ if assertion_type == "col_vals_gt":
19995
+ text = f"{column} > {values}"
19996
+ elif assertion_type == "col_vals_lt":
19997
+ text = f"{column} < {values}"
19998
+ elif assertion_type == "col_vals_eq":
19999
+ text = f"{column} = {values}"
20000
+ elif assertion_type == "col_vals_ne":
20001
+ text = f"{column} &ne; {values}"
20002
+ elif assertion_type == "col_vals_ge":
20003
+ text = f"{column} &ge; {values}"
20004
+ elif assertion_type == "col_vals_le":
20005
+ text = f"{column} &le; {values}"
20006
+ elif assertion_type == "col_vals_between":
20007
+ assert inclusive is not None
20008
+ symbol_left = "&le;" if inclusive[0] else "&lt;"
20009
+ symbol_right = "&le;" if inclusive[1] else "&lt;"
20010
+ text = f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}"
20011
+ elif assertion_type == "col_vals_outside":
20012
+ assert inclusive is not None
20013
+ symbol_left = "&lt;" if inclusive[0] else "&le;"
20014
+ symbol_right = "&gt;" if inclusive[1] else "&ge;"
20015
+ text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}"
20016
+ elif assertion_type == "col_vals_in_set":
20017
+ elements = ", ".join(map(str, values))
20018
+ text = f"{column} &isinv; {{{elements}}}"
20019
+ elif assertion_type == "col_vals_not_in_set":
20020
+ elements = ", ".join(values)
20021
+ text = f"{column} &NotElement; {{{elements}}}"
20022
+ elif assertion_type == "col_vals_regex":
20023
+ pattern = values["pattern"]
20024
+ text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=pattern)
20025
+ elif assertion_type == "col_vals_null":
20026
+ text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
20027
+ elif assertion_type == "col_vals_not_null":
20028
+ text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
20029
+ elif assertion_type == "col_vals_expr":
20030
+ text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
20031
+ elif assertion_type == "rows_complete":
20032
+ if column is None:
20033
+ text = STEP_REPORT_TEXT["rows_complete_all"][lang]
20034
+ else:
20035
+ text = STEP_REPORT_TEXT["rows_complete_subset"][lang]
20036
+
20037
+ # Wrap assertion text in a <code> tag
20038
+ text = (
20039
+ f"<code style='color: #303030; font-family: monospace; font-size: smaller;'>{text}</code>"
20040
+ )
20041
+
20042
+ if all_passed:
20043
+ # Style the target column in green and add borders but only if that column is present
18509
20044
  # in the `tbl_preview` (i.e., it may not be present if `columns_subset=` didn't include it)
18510
20045
  preview_tbl_columns = tbl_preview._boxhead._get_columns()
18511
20046
  preview_tbl_has_target_column = column in preview_tbl_columns
@@ -18695,7 +20230,7 @@ def _step_report_rows_distinct(
18695
20230
  n: int,
18696
20231
  n_failed: int,
18697
20232
  all_passed: bool,
18698
- extract: any,
20233
+ extract: Any,
18699
20234
  tbl_preview: GT,
18700
20235
  header: str,
18701
20236
  limit: int | None,
@@ -18822,8 +20357,8 @@ def _step_report_rows_distinct(
18822
20357
 
18823
20358
 
18824
20359
  def _step_report_schema_in_order(
18825
- step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
18826
- ) -> GT | any:
20360
+ step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
20361
+ ) -> GT | Any:
18827
20362
  """
18828
20363
  This is the case for schema validation where the schema is supposed to have the same column
18829
20364
  order as the target table.
@@ -18880,16 +20415,33 @@ def _step_report_schema_in_order(
18880
20415
  dtype_exp = []
18881
20416
  dtype_exp_correct = []
18882
20417
 
18883
- for i in range(len(exp_columns_dict)):
20418
+ for i in range(len(expect_schema)):
18884
20419
  #
18885
20420
  # `col_name_exp` values
18886
20421
  #
18887
20422
 
18888
- # The column name is the key in the dictionary, get the column name and
18889
- # append it to the `col_name_exp` list
18890
- col_name_exp.append(list(exp_columns_dict.keys())[i])
20423
+ # Get the column name from expect_schema (which can have duplicates)
20424
+ column_name_exp_i = expect_schema[i][0]
20425
+ col_name_exp.append(column_name_exp_i)
18891
20426
 
18892
- column_name_exp_i = col_name_exp[i]
20427
+ # Check if this column exists in exp_columns_dict (it might not if it's a duplicate)
20428
+ # For duplicates, we need to handle them specially
20429
+ if column_name_exp_i not in exp_columns_dict: # pragma: no cover
20430
+ # This is a duplicate or invalid column, mark it as incorrect
20431
+ col_exp_correct.append(CROSS_MARK_SPAN) # pragma: no cover
20432
+
20433
+ # For dtype, check if there's a dtype specified in the schema
20434
+ if len(expect_schema[i]) > 1: # pragma: no cover
20435
+ dtype_value = expect_schema[i][1] # pragma: no cover
20436
+ if isinstance(dtype_value, list): # pragma: no cover
20437
+ dtype_exp.append(" | ".join(dtype_value)) # pragma: no cover
20438
+ else: # pragma: no cover
20439
+ dtype_exp.append(str(dtype_value)) # pragma: no cover
20440
+ else: # pragma: no cover
20441
+ dtype_exp.append("&mdash;") # pragma: no cover
20442
+
20443
+ dtype_exp_correct.append("&mdash;") # pragma: no cover
20444
+ continue # pragma: no cover
18893
20445
 
18894
20446
  #
18895
20447
  # `col_exp_correct` values
@@ -19112,7 +20664,9 @@ def _step_report_schema_in_order(
19112
20664
  # Add a border below the row that terminates the target table schema
19113
20665
  step_report = step_report.tab_style(
19114
20666
  style=style.borders(sides="bottom", color="#6699CC80", style="solid", weight="1px"),
19115
- locations=loc.body(rows=len(colnames_tgt) - 1),
20667
+ locations=loc.body(
20668
+ rows=len(colnames_tgt) - 1 # ty: ignore (bug in GT, should allow an int)
20669
+ ),
19116
20670
  )
19117
20671
 
19118
20672
  # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing
@@ -19161,8 +20715,8 @@ def _step_report_schema_in_order(
19161
20715
 
19162
20716
 
19163
20717
  def _step_report_schema_any_order(
19164
- step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
19165
- ) -> GT | any:
20718
+ step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
20719
+ ) -> GT | pl.DataFrame:
19166
20720
  """
19167
20721
  This is the case for schema validation where the schema's columns are not required to be in
19168
20722
  the same order as the target table.
@@ -19581,9 +21135,7 @@ def _step_report_schema_any_order(
19581
21135
  header = header.format(title=title, details=details)
19582
21136
 
19583
21137
  # Create the header with `header` string
19584
- step_report = step_report.tab_header(title=md(header))
19585
-
19586
- return step_report
21138
+ return step_report.tab_header(title=md(header))
19587
21139
 
19588
21140
 
19589
21141
  def _create_label_text_html(
@@ -19672,3 +21224,321 @@ def _create_col_schema_match_params_html(
19672
21224
  f"{full_match_dtypes_text}"
19673
21225
  "</div>"
19674
21226
  )
21227
+
21228
+
21229
+ def _generate_agg_docstring(name: str) -> str:
21230
+ """Generate a comprehensive docstring for an aggregation validation method.
21231
+
21232
+ This function creates detailed documentation for dynamically generated methods like
21233
+ `col_sum_eq()`, `col_avg_gt()`, `col_sd_le()`, etc. The docstrings follow the same
21234
+ structure and quality as manually written validation methods like `col_vals_gt()`.
21235
+
21236
+ Parameters
21237
+ ----------
21238
+ name
21239
+ The method name (e.g., "col_sum_eq", "col_avg_gt", "col_sd_le").
21240
+
21241
+ Returns
21242
+ -------
21243
+ str
21244
+ A complete docstring for the method.
21245
+ """
21246
+ # Parse the method name to extract aggregation type and comparison operator
21247
+ # Format: col_{agg}_{comp} (e.g., col_sum_eq, col_avg_gt, col_sd_le)
21248
+ parts = name.split("_")
21249
+ agg_type = parts[1] # sum, avg, sd
21250
+ comp_type = parts[2] # eq, gt, ge, lt, le
21251
+
21252
+ # Human-readable names for aggregation types
21253
+ agg_names = {
21254
+ "sum": ("sum", "summed"),
21255
+ "avg": ("average", "averaged"),
21256
+ "sd": ("standard deviation", "computed for standard deviation"),
21257
+ }
21258
+
21259
+ # Human-readable descriptions for comparison operators (with article for title)
21260
+ comp_descriptions = {
21261
+ "eq": ("equal to", "equals", "an"),
21262
+ "gt": ("greater than", "is greater than", "a"),
21263
+ "ge": ("greater than or equal to", "is at least", "a"),
21264
+ "lt": ("less than", "is less than", "a"),
21265
+ "le": ("less than or equal to", "is at most", "a"),
21266
+ }
21267
+
21268
+ # Mathematical symbols for comparison operators
21269
+ comp_symbols = {
21270
+ "eq": "==",
21271
+ "gt": ">",
21272
+ "ge": ">=",
21273
+ "lt": "<",
21274
+ "le": "<=",
21275
+ }
21276
+
21277
+ agg_name, agg_verb = agg_names[agg_type]
21278
+ comp_desc, comp_phrase, comp_article = comp_descriptions[comp_type]
21279
+ comp_symbol = comp_symbols[comp_type]
21280
+
21281
+ # Determine the appropriate example values based on the aggregation and comparison
21282
+ if agg_type == "sum":
21283
+ example_value = "15"
21284
+ example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
21285
+ example_sum = "15" # sum of a
21286
+ example_ref_sum = "10" # sum of b
21287
+ elif agg_type == "avg":
21288
+ example_value = "3"
21289
+ example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
21290
+ example_sum = "3.0" # avg of a
21291
+ example_ref_sum = "2.0" # avg of b
21292
+ else: # sd
21293
+ example_value = "2"
21294
+ example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
21295
+ example_sum = "~1.58" # sd of a
21296
+ example_ref_sum = "0.0" # sd of b
21297
+
21298
+ # Build appropriate tolerance explanation based on comparison type
21299
+ if comp_type == "eq":
21300
+ tol_explanation = f"""The `tol=` parameter is particularly useful with `{name}()` since exact equality
21301
+ comparisons on floating-point aggregations can be problematic due to numerical precision.
21302
+ Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
21303
+ floating-point arithmetic."""
21304
+ else:
21305
+ tol_explanation = f"""The `tol=` parameter expands the acceptable range for the comparison. For
21306
+ `{name}()`, a tolerance of `tol=0.5` would mean the {agg_name} can be within `0.5` of the
21307
+ target value and still pass validation."""
21308
+
21309
+ docstring = f"""
21310
+ Does the column {agg_name} satisfy {comp_article} {comp_desc} comparison?
21311
+
21312
+ The `{name}()` validation method checks whether the {agg_name} of values in a column
21313
+ {comp_phrase} a specified `value=`. This is an aggregation-based validation where the entire
21314
+ column is reduced to a single {agg_name} value that is then compared against the target. The
21315
+ comparison used in this function is `{agg_name}(column) {comp_symbol} value`.
21316
+
21317
+ Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
21318
+ a single test unit. The validation either passes completely (if the aggregated value satisfies
21319
+ the comparison) or fails completely.
21320
+
21321
+ Parameters
21322
+ ----------
21323
+ columns
21324
+ A single column or a list of columns to validate. If multiple columns are supplied,
21325
+ there will be a separate validation step generated for each column. The columns must
21326
+ contain numeric data for the {agg_name} to be computed.
21327
+ value
21328
+ The value to compare the column {agg_name} against. This can be: (1) a numeric literal
21329
+ (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
21330
+ whose {agg_name} will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
21331
+ referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
21332
+ `None` to automatically compare against the same column in reference data (shorthand for
21333
+ `ref(column_name)` when reference data is set).
21334
+ tol
21335
+ A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
21336
+ set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
21337
+ a {agg_name} that differs from the target by up to `0.5` will still pass. {tol_explanation}
21338
+ thresholds
21339
+ Failure threshold levels so that the validation step can react accordingly when
21340
+ failing test units exceed the set levels. Since this is an aggregation-based validation with only
21341
+ one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to
21342
+ indicate pass/fail, or as proportions where any value less than `1.0` means failure is
21343
+ acceptable.
21344
+ brief
21345
+ An optional brief description of the validation step that will be displayed in the
21346
+ reporting table. You can use the templating elements like `"{{step}}"` to insert
21347
+ the step number, or `"{{auto}}"` to include an automatically generated brief. If `True`
21348
+ the entire brief will be automatically generated. If `None` (the default) then there
21349
+ won't be a brief.
21350
+ actions
21351
+ Optional actions to take when the validation step meets or exceeds any set threshold
21352
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
21353
+ define the actions.
21354
+ active
21355
+ A boolean value indicating whether the validation step should be active. Using `False`
21356
+ will make the validation step inactive (still reporting its presence and keeping indexes
21357
+ for the steps unchanged).
21358
+
21359
+ Returns
21360
+ -------
21361
+ Validate
21362
+ The `Validate` object with the added validation step.
21363
+
21364
+ Using Reference Data
21365
+ --------------------
21366
+ The `{name}()` method supports comparing column aggregations against reference data. This
21367
+ is useful for validating that statistical properties remain consistent across different
21368
+ versions of a dataset, or for comparing current data against historical baselines.
21369
+
21370
+ To use reference data, set the `reference=` parameter when creating the `Validate` object:
21371
+
21372
+ ```python
21373
+ validation = (
21374
+ pb.Validate(data=current_data, reference=baseline_data)
21375
+ .{name}(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue)
21376
+ .interrogate()
21377
+ )
21378
+ ```
21379
+
21380
+ When `value=None` and reference data is set, the method automatically compares against the
21381
+ same column in the reference data. You can also explicitly specify reference columns using
21382
+ the `ref()` helper:
21383
+
21384
+ ```python
21385
+ .{name}(columns="revenue", value=pb.ref("baseline_revenue"))
21386
+ ```
21387
+
21388
+ Understanding Tolerance
21389
+ -----------------------
21390
+ The `tol=` parameter allows for fuzzy comparisons, which is especially important for
21391
+ floating-point aggregations where exact equality is often unreliable.
21392
+
21393
+ {tol_explanation}
21394
+
21395
+ For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
21396
+ within which the aggregation is considered valid. For inequality comparisons, the tolerance
21397
+ shifts the comparison boundary.
21398
+
21399
+ Thresholds
21400
+ ----------
21401
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
21402
+ step. If they are set here at the step level, these thresholds will override any thresholds
21403
+ set at the global level in `Validate(thresholds=...)`.
21404
+
21405
+ There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
21406
+ validations operate on a single test unit (the aggregated value), threshold values are
21407
+ typically set as absolute counts:
21408
+
21409
+ - `thresholds=1` means any failure triggers a 'warning'
21410
+ - `thresholds=(1, 1, 1)` means any failure triggers all three levels
21411
+
21412
+ Thresholds can be defined using one of these input schemes:
21413
+
21414
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
21415
+ thresholds)
21416
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
21417
+ the 'error' level, and position `2` is the 'critical' level
21418
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
21419
+ 'critical'
21420
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
21421
+ for the 'warning' level only
21422
+
21423
+ Examples
21424
+ --------
21425
+ ```{{python}}
21426
+ #| echo: false
21427
+ #| output: false
21428
+ import pointblank as pb
21429
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
21430
+ ```
21431
+ For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
21432
+ shown below:
21433
+
21434
+ ```{{python}}
21435
+ import pointblank as pb
21436
+ import polars as pl
21437
+
21438
+ tbl = pl.DataFrame(
21439
+ {{
21440
+ "a": [1, 2, 3, 4, 5],
21441
+ "b": [2, 2, 2, 2, 2],
21442
+ }}
21443
+ )
21444
+
21445
+ pb.preview(tbl)
21446
+ ```
21447
+
21448
+ Let's validate that the {agg_name} of column `a` {comp_phrase} `{example_value}`:
21449
+
21450
+ ```{{python}}
21451
+ validation = (
21452
+ pb.Validate(data=tbl)
21453
+ .{name}(columns="a", value={example_value})
21454
+ .interrogate()
21455
+ )
21456
+
21457
+ validation
21458
+ ```
21459
+
21460
+ The validation result shows whether the {agg_name} comparison passed or failed. Since this
21461
+ is an aggregation-based validation, there is exactly one test unit per column.
21462
+
21463
+ When validating multiple columns, each column gets its own validation step:
21464
+
21465
+ ```{{python}}
21466
+ validation = (
21467
+ pb.Validate(data=tbl)
21468
+ .{name}(columns=["a", "b"], value={example_value})
21469
+ .interrogate()
21470
+ )
21471
+
21472
+ validation
21473
+ ```
21474
+
21475
+ Using tolerance for flexible comparisons:
21476
+
21477
+ ```{{python}}
21478
+ validation = (
21479
+ pb.Validate(data=tbl)
21480
+ .{name}(columns="a", value={example_value}, tol=1.0)
21481
+ .interrogate()
21482
+ )
21483
+
21484
+ validation
21485
+ ```
21486
+ """
21487
+
21488
+ return docstring.strip()
21489
+
21490
+
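The `name.split("_")` parsing drives all of the generated wording. For example, tracing `"col_avg_ge"` through the lookup tables above:

```python
name = "col_avg_ge"
parts = name.split("_")  # ["col", "avg", "ge"]
# agg_names["avg"]         -> ("average", "averaged")
# comp_descriptions["ge"]  -> ("greater than or equal to", "is at least", "a")
# comp_symbols["ge"]       -> ">="
# ...so the generated docstring opens with:
#   "Does the column average satisfy a greater than or equal to comparison?"
```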
21491
+ def make_agg_validator(name: str):
21492
+ """Factory for dynamically generated aggregate validation methods.
21493
+
21494
+ Why this exists:
21495
+ Aggregate validators all share identical behavior. The only thing that differs
21496
+ between them is the semantic assertion type (their name). The implementation
21497
+ of each aggregate validator is fetched from `from_agg_validator`.
21498
+
21499
+ Instead of copy/pasting dozens of identical methods, we generate
21500
+ them dynamically and attach them to the Validate class. The types are generated
21501
+ at build time with `make pyi` to allow the methods to be visible to the type checker,
21502
+ documentation builders, and IDEs/LSPs.
21503
+
21504
+ The returned function is a thin adapter that forwards all arguments to
21505
+ `_add_agg_validation`, supplying the assertion type explicitly.
21506
+ """
21507
+
21508
+ def agg_validator(
21509
+ self: Validate,
21510
+ columns: str | Collection[str],
21511
+ value: float | int | Column | ReferenceColumn | None = None,
21512
+ tol: float = 0,
21513
+ thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
21514
+ brief: str | bool | None = None,
21515
+ actions: Actions | None = None,
21516
+ active: bool = True,
21517
+ ) -> Validate:
21518
+ # Dynamically generated aggregate validator.
21519
+ # This method is generated per assertion type and forwards all arguments
21520
+ # to the shared aggregate validation implementation.
21521
+ return self._add_agg_validation(
21522
+ assertion_type=name,
21523
+ columns=columns,
21524
+ value=value,
21525
+ tol=tol,
21526
+ thresholds=thresholds,
21527
+ brief=brief,
21528
+ actions=actions,
21529
+ active=active,
21530
+ )
21531
+
21532
+ # Manually set function identity so this behaves like a real method.
21533
+ # These must be set before attaching the function to the class.
21534
+ agg_validator.__name__ = name
21535
+ agg_validator.__qualname__ = f"Validate.{name}"
21536
+ agg_validator.__doc__ = _generate_agg_docstring(name)
21537
+
21538
+ return agg_validator
21539
+
21540
+
21541
+ # Finally, we grab all the valid aggregation method names and attach them to
21542
+ # the Validate class, registering each one appropriately.
21543
+ for method in load_validation_method_grid(): # -> `col_sum_*`, `col_mean_*`, etc.
21544
+ setattr(Validate, method, make_agg_validator(method))
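Once this loop has run at import time, the generated validators behave like hand-written methods on `Validate`. A short usage sketch (assuming `col_sum_eq` is among the names returned by `load_validation_method_grid()`):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [1, 2, 3, 4, 5]})

validation = (
    pb.Validate(data=tbl)
    .col_sum_eq(columns="a", value=15)  # sum(a) == 15, so the single test unit passes
    .interrogate()
)

# The dynamically attached method carries the generated docstring
print(pb.Validate.col_sum_eq.__doc__.splitlines()[0])
```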