pointblank 0.15.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -12,6 +12,7 @@ import tempfile
12
12
  import threading
13
13
  from dataclasses import dataclass
14
14
  from enum import Enum
15
+ from functools import partial
15
16
  from importlib.metadata import version
16
17
  from pathlib import Path
17
18
  from typing import TYPE_CHECKING, Any, Callable, Literal
@@ -45,6 +46,7 @@ from pointblank._constants import (
45
46
  )
46
47
  from pointblank._constants_translations import (
47
48
  EXPECT_FAIL_TEXT,
49
+ NOTES_TEXT,
48
50
  STEP_REPORT_TEXT,
49
51
  VALIDATION_REPORT_TEXT,
50
52
  )
@@ -53,6 +55,7 @@ from pointblank._interrogation import (
53
55
  SpeciallyValidation,
54
56
  col_count_match,
55
57
  col_exists,
58
+ col_pct_null,
56
59
  col_schema_match,
57
60
  col_vals_expr,
58
61
  conjointly_validation,
@@ -122,6 +125,7 @@ __all__ = [
122
125
  "write_file",
123
126
  "config",
124
127
  "connect_to_table",
128
+ "print_database_tables",
125
129
  "preview",
126
130
  "missing_vals_tbl",
127
131
  "get_action_metadata",
@@ -361,12 +365,16 @@ class PointblankConfig:
361
365
 
362
366
  report_incl_header: bool = True
363
367
  report_incl_footer: bool = True
368
+ report_incl_footer_timings: bool = True
369
+ report_incl_footer_notes: bool = True
364
370
  preview_incl_header: bool = True
365
371
 
366
372
  def __repr__(self):
367
373
  return (
368
374
  f"PointblankConfig(report_incl_header={self.report_incl_header}, "
369
375
  f"report_incl_footer={self.report_incl_footer}, "
376
+ f"report_incl_footer_timings={self.report_incl_footer_timings}, "
377
+ f"report_incl_footer_notes={self.report_incl_footer_notes}, "
370
378
  f"preview_incl_header={self.preview_incl_header})"
371
379
  )
372
380
 
@@ -378,6 +386,8 @@ global_config = PointblankConfig()
378
386
  def config(
379
387
  report_incl_header: bool = True,
380
388
  report_incl_footer: bool = True,
389
+ report_incl_footer_timings: bool = True,
390
+ report_incl_footer_notes: bool = True,
381
391
  preview_incl_header: bool = True,
382
392
  ) -> PointblankConfig:
383
393
  """
@@ -391,7 +401,13 @@ def config(
391
401
  threshold levels (if set).
392
402
  report_incl_footer
393
403
  Should the footer of the validation table report be displayed? The footer contains the
394
- starting and ending times of the interrogation.
404
+ starting and ending times of the interrogation and any notes added to validation steps.
405
+ report_incl_footer_timings
406
+ Controls whether the validation timing information (start time, duration, and end time)
407
+ should be displayed in the footer. Only applies when `report_incl_footer=True`.
408
+ report_incl_footer_notes
409
+ Controls whether the notes from validation steps should be displayed in the footer. Only
410
+ applies when `report_incl_footer=True`.
395
411
  preview_incl_header
396
412
  Whether the header should be present in any preview table (generated via the
397
413
  [`preview()`](`pointblank.preview`) function).
@@ -405,6 +421,8 @@ def config(
405
421
  global global_config
406
422
  global_config.report_incl_header = report_incl_header # pragma: no cover
407
423
  global_config.report_incl_footer = report_incl_footer # pragma: no cover
424
+ global_config.report_incl_footer_timings = report_incl_footer_timings # pragma: no cover
425
+ global_config.report_incl_footer_notes = report_incl_footer_notes # pragma: no cover
408
426
  global_config.preview_incl_header = preview_incl_header # pragma: no cover
409
427
 
410
428
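For orientation, here is a minimal usage sketch of the two new footer flags added to `config()` above; it assumes only the parameters shown in this diff and keeps all other options at their defaults.

```python
import pointblank as pb

# Keep the report footer, but hide the interrogation timings while still
# showing any notes attached to validation steps (sketch based on the new
# report_incl_footer_timings / report_incl_footer_notes parameters above).
pb.config(
    report_incl_footer=True,
    report_incl_footer_timings=False,
    report_incl_footer_notes=True,
)
```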
 
@@ -3918,6 +3936,47 @@ class _ValidationInfo:
3918
3936
  return self.notes is not None and len(self.notes) > 0
3919
3937
 
3920
3938
 
3939
+ def _handle_connection_errors(e: Exception, connection_string: str) -> None:
3940
+ """
3941
+ Shared error handling for database connection failures.
3942
+
3943
+ Raises appropriate ConnectionError with helpful messages based on the exception.
3944
+ """
3945
+
3946
+ error_str = str(e).lower()
3947
+ backend_install_map = {
3948
+ "duckdb": "pip install 'ibis-framework[duckdb]'",
3949
+ "postgresql": "pip install 'ibis-framework[postgres]'",
3950
+ "postgres": "pip install 'ibis-framework[postgres]'",
3951
+ "mysql": "pip install 'ibis-framework[mysql]'",
3952
+ "sqlite": "pip install 'ibis-framework[sqlite]'",
3953
+ "bigquery": "pip install 'ibis-framework[bigquery]'",
3954
+ "snowflake": "pip install 'ibis-framework[snowflake]'",
3955
+ }
3956
+
3957
+ # Check if this is a missing backend dependency
3958
+ for backend, install_cmd in backend_install_map.items():
3959
+ if backend in error_str and ("not found" in error_str or "no module" in error_str):
3960
+ raise ConnectionError(
3961
+ f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
3962
+ f" {install_cmd}\n\n"
3963
+ f"Original error: {e}"
3964
+ ) from e
3965
+
3966
+ # Generic connection error
3967
+ raise ConnectionError( # pragma: no cover
3968
+ f"Failed to connect using: {connection_string}\n"
3969
+ f"Error: {e}\n\n"
3970
+ f"Supported connection string formats:\n"
3971
+ f"- DuckDB: 'duckdb:///path/to/file.ddb'\n"
3972
+ f"- SQLite: 'sqlite:///path/to/file.db'\n"
3973
+ f"- PostgreSQL: 'postgresql://user:pass@host:port/db'\n"
3974
+ f"- MySQL: 'mysql://user:pass@host:port/db'\n"
3975
+ f"- BigQuery: 'bigquery://project/dataset'\n"
3976
+ f"- Snowflake: 'snowflake://user:pass@account/db/schema'"
3977
+ ) from e
3978
+
3979
+
3921
3980
  def connect_to_table(connection_string: str) -> Any:
3922
3981
  """
3923
3982
  Connect to a database table using a connection string.
@@ -3997,7 +4056,11 @@ def connect_to_table(connection_string: str) -> Any:
3997
4056
  pip install 'ibis-framework[duckdb]' # for DuckDB
3998
4057
  pip install 'ibis-framework[postgres]' # for PostgreSQL
3999
4058
  ```
4059
+ See Also
4060
+ --------
4061
+ print_database_tables : List all available tables in a database for discovery
4000
4062
  """
4063
+
4001
4064
  # Check if Ibis is available
4002
4065
  if not _is_lib_present(lib_name="ibis"):
4003
4066
  raise ImportError(
@@ -4011,14 +4074,10 @@ def connect_to_table(connection_string: str) -> Any:
4011
4074
  if "::" not in connection_string:
4012
4075
  # Try to connect to get available tables for helpful error message
4013
4076
  try:
4014
- # Extract the base connection string (without table name)
4015
4077
  base_connection = connection_string
4016
-
4017
- # Connect to the database
4018
4078
  conn = ibis.connect(base_connection)
4019
4079
 
4020
- # Get list of available tables
4021
- try:
4080
+ try: # pragma: no cover
4022
4081
  available_tables = conn.list_tables()
4023
4082
  except Exception: # pragma: no cover
4024
4083
  available_tables = []
@@ -4035,7 +4094,6 @@ def connect_to_table(connection_string: str) -> Any:
4035
4094
  f" {connection_string}::TABLE_NAME\n\n"
4036
4095
  f"Examples:\n"
4037
4096
  )
4038
- # Add examples with first few table names
4039
4097
  for table in available_tables[:3]:
4040
4098
  error_msg += f" {connection_string}::{table}\n"
4041
4099
  else:
@@ -4050,43 +4108,8 @@ def connect_to_table(connection_string: str) -> Any:
4050
4108
 
4051
4109
  except Exception as e:
4052
4110
  if isinstance(e, ValueError):
4053
- raise # Re-raise our custom ValueError
4054
-
4055
- # Check for backend-specific errors and provide installation guidance
4056
- error_str = str(e).lower()
4057
- backend_install_map = {
4058
- "duckdb": "pip install 'ibis-framework[duckdb]'",
4059
- "postgresql": "pip install 'ibis-framework[postgres]'",
4060
- "postgres": "pip install 'ibis-framework[postgres]'",
4061
- "mysql": "pip install 'ibis-framework[mysql]'",
4062
- "sqlite": "pip install 'ibis-framework[sqlite]'",
4063
- "bigquery": "pip install 'ibis-framework[bigquery]'",
4064
- "snowflake": "pip install 'ibis-framework[snowflake]'",
4065
- }
4066
-
4067
- # Check if this is a missing backend dependency
4068
- for backend, install_cmd in backend_install_map.items(): # pragma: no cover
4069
- if backend in error_str and ("not found" in error_str or "no module" in error_str):
4070
- raise ConnectionError(
4071
- f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
4072
- f" {install_cmd}\n\n"
4073
- f"Original error: {e}\n\n"
4074
- f"Supported connection string formats:\n"
4075
- f"- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
4076
- f"- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
4077
- f"- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
4078
- f"- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
4079
- f"- BigQuery: 'bigquery://project/dataset::table_name'\n"
4080
- f"- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
4081
- f"\nNote: Use '::table_name' to specify the table within the database."
4082
- ) from e
4083
-
4084
- # Generic connection error
4085
- raise ConnectionError( # pragma: no cover
4086
- f"Failed to connect to database using connection string: {connection_string}\n"
4087
- f"Error: {e}\n\n"
4088
- f"No table specified. Use the format: {connection_string}::TABLE_NAME"
4089
- ) from e
4111
+ raise
4112
+ _handle_connection_errors(e, connection_string)
4090
4113
 
4091
4114
  # Split connection string and table name
4092
4115
  try:
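As context for the refactor above (the consolidated `_handle_connection_errors()` helper), here is a minimal sketch of how the errors surface to a caller; the DuckDB file path and table name are hypothetical.

```python
import pointblank as pb

try:
    # The '::table_name' suffix selects the table within the database,
    # per the connection string format documented for connect_to_table().
    tbl = pb.connect_to_table("duckdb:///analytics.ddb::sales")
except ValueError as err:
    # Raised when no table is specified or the named table is not found;
    # the message lists available tables when they can be read.
    print(err)
except ConnectionError as err:
    # Raised by the shared handler for a missing Ibis backend or a
    # connection failure, with an install hint where applicable.
    print(err)
```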
@@ -4099,32 +4122,14 @@ def connect_to_table(connection_string: str) -> Any:
4099
4122
  conn = ibis.connect(base_connection)
4100
4123
  table = conn.table(table_name)
4101
4124
  return table
4102
-
4103
4125
  except Exception as e:
4104
- # Check for backend-specific errors and provide installation guidance
4105
4126
  error_str = str(e).lower()
4106
- backend_install_map = {
4107
- "duckdb": "pip install 'ibis-framework[duckdb]'",
4108
- "postgresql": "pip install 'ibis-framework[postgres]'",
4109
- "postgres": "pip install 'ibis-framework[postgres]'",
4110
- "mysql": "pip install 'ibis-framework[mysql]'",
4111
- "sqlite": "pip install 'ibis-framework[sqlite]'",
4112
- "bigquery": "pip install 'ibis-framework[bigquery]'",
4113
- "snowflake": "pip install 'ibis-framework[snowflake]'",
4114
- }
4115
-
4116
- # Check if this is a missing backend dependency
4117
- for backend, install_cmd in backend_install_map.items():
4118
- if backend in error_str and ("not found" in error_str or "no module" in error_str):
4119
- raise ConnectionError(
4120
- f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
4121
- f" {install_cmd}\n\n"
4122
- f"Original error: {e}"
4123
- ) from e
4124
4127
 
4125
- # Check if table doesn't exist
4126
- if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
4127
- # Try to get available tables for helpful message
4128
+ # Check if this is a "table not found" error
4129
+ if "table" in error_str and (
4130
+ "not found" in error_str or "does not exist" in error_str or "not exist" in error_str
4131
+ ):
4132
+ # Try to get available tables for a helpful error message
4128
4133
  try: # pragma: no cover
4129
4134
  available_tables = conn.list_tables()
4130
4135
  if available_tables:
@@ -4132,23 +4137,79 @@ def connect_to_table(connection_string: str) -> Any:
4132
4137
  raise ValueError(
4133
4138
  f"Table '{table_name}' not found in database.\n\n"
4134
4139
  f"Available tables:\n{table_list}\n\n"
4135
- f"Check the table name and try again with:\n"
4136
- f" {base_connection}::CORRECT_TABLE_NAME"
4137
- ) from e
4138
- else:
4139
- raise ValueError(
4140
- f"Table '{table_name}' not found and no tables available in database."
4140
+ f"Connection: {base_connection}"
4141
4141
  ) from e
4142
+ except ValueError:
4143
+ # Re-raise the table-specific ValueError
4144
+ raise
4142
4145
  except Exception:
4143
- raise ValueError(
4144
- f"Table '{table_name}' not found in database. "
4145
- f"Check the table name and connection string."
4146
- ) from e
4146
+ # If we can't list tables, just raise a simple error
4147
+ pass
4148
+
4149
+ raise ValueError(
4150
+ f"Table '{table_name}' not found in database.\n"
4151
+ f"Connection: {base_connection}\n\n"
4152
+ f"Original error: {e}"
4153
+ ) from e
4154
+
4155
+ # For other errors, use the generic connection error handler
4156
+ _handle_connection_errors(e, base_connection)
4157
+
4158
+
4159
+ def print_database_tables(connection_string: str) -> list[str]:
4160
+ """
4161
+ List all tables in a database from a connection string.
4162
+
4163
+ The `print_database_tables()` function connects to a database and returns a list of all
4164
+ available tables. This is particularly useful for discovering what tables exist in a database
4165
+ before connecting to a specific table with `connect_to_table()`. The function automatically
4166
+ filters out temporary Ibis tables (memtables) to show only user tables. It supports all database
4167
+ backends available through Ibis, including DuckDB, SQLite, PostgreSQL, MySQL, BigQuery, and
4168
+ Snowflake.
4169
+
4170
+ Parameters
4171
+ ----------
4172
+ connection_string
4173
+ A database connection string *without* the `::table_name` suffix. Example:
4174
+ `"duckdb:///path/to/database.ddb"`.
4175
+
4176
+ Returns
4177
+ -------
4178
+ list[str]
4179
+ List of table names, excluding temporary Ibis tables.
4180
+
4181
+ See Also
4182
+ --------
4183
+ connect_to_table : Connect to a database table with full connection string documentation
4184
+ """
4185
+ # Check if connection string includes table specification (which is not allowed)
4186
+ if "::" in connection_string:
4187
+ raise ValueError(
4188
+ "Connection string should not include table specification (::table_name).\n"
4189
+ f"You've supplied: {connection_string}\n"
4190
+ f"Expected format: 'duckdb:///path/to/database.ddb' (without ::table_name)"
4191
+ )
4192
+
4193
+ # Check if Ibis is available
4194
+ if not _is_lib_present(lib_name="ibis"):
4195
+ raise ImportError(
4196
+ "The Ibis library is not installed but is required for database connection strings.\n"
4197
+ "Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
4198
+ )
4199
+
4200
+ import ibis
4201
+
4202
+ try:
4203
+ # Connect to database
4204
+ conn = ibis.connect(connection_string)
4205
+ # Get all tables and filter out temporary Ibis tables
4206
+ all_tables = conn.list_tables()
4207
+ user_tables = [t for t in all_tables if "memtable" not in t]
4147
4208
 
4148
- # Generic connection error
4149
- raise ConnectionError(
4150
- f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
4151
- ) from e
4209
+ return user_tables
4210
+
4211
+ except Exception as e:
4212
+ _handle_connection_errors(e, connection_string)
4152
4213
 
4153
4214
 
4154
4215
  @dataclass
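A short discovery-workflow sketch combining the new `print_database_tables()` function with `connect_to_table()`; the database path and the example table names are hypothetical.

```python
import pointblank as pb

# List the user tables in a database (temporary Ibis memtables are
# filtered out by print_database_tables()).
tables = pb.print_database_tables("duckdb:///analytics.ddb")
print(tables)  # e.g. ['customers', 'orders']  (illustrative output)

# Connect to the first discovered table using the '::table_name' suffix.
if tables:
    tbl = pb.connect_to_table(f"duckdb:///analytics.ddb::{tables[0]}")
```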
@@ -4430,6 +4491,16 @@ class Validate:
4430
4491
  - Vietnamese (`"vi"`)
4431
4492
  - Indonesian (`"id"`)
4432
4493
  - Ukrainian (`"uk"`)
4494
+ - Bulgarian (`"bg"`)
4495
+ - Croatian (`"hr"`)
4496
+ - Estonian (`"et"`)
4497
+ - Hungarian (`"hu"`)
4498
+ - Irish (`"ga"`)
4499
+ - Latvian (`"lv"`)
4500
+ - Lithuanian (`"lt"`)
4501
+ - Maltese (`"mt"`)
4502
+ - Slovak (`"sk"`)
4503
+ - Slovenian (`"sl"`)
4433
4504
  - Hebrew (`"he"`)
4434
4505
  - Thai (`"th"`)
4435
4506
  - Persian (`"fa"`)
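Assuming this list documents the `lang=` argument of `Validate` (as in earlier releases), one of the newly added codes such as Slovak (`"sk"`) can be passed as sketched below; the table and validation step are placeholders.

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 3]})

# Report text is rendered in Slovak, one of the newly supported languages.
validation = (
    pb.Validate(data=tbl, lang="sk")
    .col_exists(columns="a")
    .interrogate()
)
```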
@@ -9700,40 +9771,41 @@ class Validate:
9700
9771
 
9701
9772
  return self
9702
9773
 
9703
- def rows_distinct(
9774
+ def col_pct_null(
9704
9775
  self,
9705
- columns_subset: str | list[str] | None = None,
9706
- pre: Callable | None = None,
9707
- segments: SegmentSpec | None = None,
9708
- thresholds: int | float | bool | tuple | dict | Thresholds = None,
9776
+ columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
9777
+ p: float,
9778
+ tol: Tolerance = 0,
9779
+ thresholds: int | float | None | bool | tuple | dict | Thresholds = None,
9709
9780
  actions: Actions | None = None,
9710
9781
  brief: str | bool | None = None,
9711
9782
  active: bool = True,
9712
9783
  ) -> Validate:
9713
9784
  """
9714
- Validate whether rows in the table are distinct.
9785
+ Validate whether a column has a specific percentage of Null values.
9715
9786
 
9716
- The `rows_distinct()` method checks whether rows in the table are distinct. This validation
9717
- will operate over the number of test units that is equal to the number of rows in the table
9718
- (determined after any `pre=` mutation has been applied).
9787
+ The `col_pct_null()` validation method checks whether the percentage of Null values in a
9788
+ column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
9789
+ validation operates at the column level, generating a single validation step per column that
9790
+ passes or fails based on whether the actual percentage of Null values falls within the
9791
+ acceptable range defined by `p ± tol`.
9719
9792
 
9720
9793
  Parameters
9721
9794
  ----------
9722
- columns_subset
9723
- A single column or a list of columns to use as a subset for the distinct comparison.
9724
- If `None`, then all columns in the table will be used for the comparison. If multiple
9725
- columns are supplied, the distinct comparison will be made over the combination of
9726
- values in those columns.
9727
- pre
9728
- An optional preprocessing function or lambda to apply to the data table during
9729
- interrogation. This function should take a table as input and return a modified table.
9730
- Have a look at the *Preprocessing* section for more information on how to use this
9731
- argument.
9732
- segments
9733
- An optional directive on segmentation, which serves to split a validation step into
9734
- multiple (one step per segment). Can be a single column name, a tuple that specifies a
9735
- column name and its corresponding values to segment on, or a combination of both
9736
- (provided as a list). Read the *Segmentation* section for usage information.
9795
+ columns
9796
+ A single column or a list of columns to validate. Can also use
9797
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
9798
+ multiple columns are supplied or resolved, there will be a separate validation step
9799
+ generated for each column.
9800
+ p
9801
+ The expected percentage of Null values in the column, expressed as a decimal between
9802
+ `0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
9803
+ tol
9804
+ The tolerance allowed when comparing the actual percentage of Null values to the
9805
+ expected percentage `p=`. The validation passes if the actual percentage falls within
9806
+ the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
9807
+ the *Tolerance* section for details on all supported formats (absolute, relative,
9808
+ symmetric, and asymmetric bounds).
9737
9809
  thresholds
9738
9810
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
9739
9811
  The thresholds are set at the step level and will override any global thresholds set in
@@ -9741,7 +9813,7 @@ class Validate:
9741
9813
  be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
9742
9814
  section for information on how to set threshold levels.
9743
9815
  actions
9744
- Optional actions to take when the validation step meets or exceeds any set threshold
9816
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
9745
9817
  levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
9746
9818
  define the actions.
9747
9819
  brief
@@ -9760,60 +9832,30 @@ class Validate:
9760
9832
  Validate
9761
9833
  The `Validate` object with the added validation step.
9762
9834
 
9763
- Preprocessing
9764
- -------------
9765
- The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
9766
- table during interrogation. This function should take a table as input and return a modified
9767
- table. This is useful for performing any necessary transformations or filtering on the data
9768
- before the validation step is applied.
9769
-
9770
- The preprocessing function can be any callable that takes a table as input and returns a
9771
- modified table. For example, you could use a lambda function to filter the table based on
9772
- certain criteria or to apply a transformation to the data. Note that you can refer to
9773
- columns via `columns_subset=` that are expected to be present in the transformed table, but
9774
- may not exist in the table before preprocessing. Regarding the lifetime of the transformed
9775
- table, it only exists during the validation step and is not stored in the `Validate` object
9776
- or used in subsequent validation steps.
9777
-
9778
- Segmentation
9779
- ------------
9780
- The `segments=` argument allows for the segmentation of a validation step into multiple
9781
- segments. This is useful for applying the same validation step to different subsets of the
9782
- data. The segmentation can be done based on a single column or specific fields within a
9783
- column.
9784
-
9785
- Providing a single column name will result in a separate validation step for each unique
9786
- value in that column. For example, if you have a column called `"region"` with values
9787
- `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
9788
- region.
9789
-
9790
- Alternatively, you can provide a tuple that specifies a column name and its corresponding
9791
- values to segment on. For example, if you have a column called `"date"` and you want to
9792
- segment on only specific dates, you can provide a tuple like
9793
- `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
9794
- (i.e., no validation steps will be created for them).
9835
+ Tolerance
9836
+ ---------
9837
+ The `tol=` parameter accepts several different formats to specify the acceptable deviation
9838
+ from the expected percentage `p=`. The tolerance can be expressed as:
9795
9839
 
9796
- A list with a combination of column names and tuples can be provided as well. This allows
9797
- for more complex segmentation scenarios. The following inputs are both valid:
9840
+ 1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
9841
+ For example, `tol=2` means the actual count can differ from the expected count by up to 2
9842
+ units in either direction.
9798
9843
 
9799
- ```
9800
- # Segments from all unique values in the `region` column
9801
- # and specific dates in the `date` column
9802
- segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
9844
+ 2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
9845
+ count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
9846
+ 45 to 55 (50 ± 10% of 50 = 50 ± 5).
9803
9847
 
9804
- # Segments from all unique values in the `region` and `date` columns
9805
- segments=["region", "date"]
9806
- ```
9848
+ 3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
9849
+ bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
9850
+ 1 unit below or 3 units above the expected count.
9807
9851
 
9808
- The segmentation is performed during interrogation, and the resulting validation steps will
9809
- be numbered sequentially. Each segment will have its own validation step, and the results
9810
- will be reported separately. This allows for a more granular analysis of the data and helps
9811
- identify issues within specific segments.
9852
+ 4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
9853
+ and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
9854
+ lower bound is 5% below and the upper bound is 15% above the expected count.
9812
9855
 
9813
- Importantly, the segmentation process will be performed after any preprocessing of the data
9814
- table. Because of this, one can conceivably use the `pre=` argument to generate a column
9815
- that can be used for segmentation. For example, you could create a new column called
9816
- `"segment"` through use of `pre=` and then use that column for segmentation.
9856
+ When using a single value (integer or float), the tolerance is applied symmetrically in both
9857
+ directions. When using a tuple, you can specify asymmetric tolerances where the lower and
9858
+ upper bounds differ.
9817
9859
 
9818
9860
  Thresholds
9819
9861
  ----------
@@ -9851,8 +9893,8 @@ class Validate:
9851
9893
  import pointblank as pb
9852
9894
  pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
9853
9895
  ```
9854
- For the examples here, we'll use a simple Polars DataFrame with three string columns
9855
- (`col_1`, `col_2`, and `col_3`). The table is shown below:
9896
+ For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
9897
+ and `c`) that have different percentages of Null values. The table is shown below:
9856
9898
 
9857
9899
  ```{python}
9858
9900
  import pointblank as pb
@@ -9860,56 +9902,133 @@ class Validate:
9860
9902
 
9861
9903
  tbl = pl.DataFrame(
9862
9904
  {
9863
- "col_1": ["a", "b", "c", "d"],
9864
- "col_2": ["a", "a", "c", "d"],
9865
- "col_3": ["a", "a", "d", "e"],
9905
+ "a": [1, 2, 3, 4, 5, 6, 7, 8],
9906
+ "b": [1, None, 3, None, 5, None, 7, None],
9907
+ "c": [None, None, None, None, None, None, 1, 2],
9866
9908
  }
9867
9909
  )
9868
9910
 
9869
9911
  pb.preview(tbl)
9870
9912
  ```
9871
9913
 
9872
- Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll
9873
- determine if this validation had any failing test units (there are four test units, one for
9874
- each row). A failing test units means that a given row is not distinct from every other row.
9914
+ Let's validate that column `a` has 0% Null values (i.e., no Null values at all).
9875
9915
 
9876
9916
  ```{python}
9877
9917
  validation = (
9878
9918
  pb.Validate(data=tbl)
9879
- .rows_distinct()
9919
+ .col_pct_null(columns="a", p=0.0)
9880
9920
  .interrogate()
9881
9921
  )
9882
9922
 
9883
9923
  validation
9884
9924
  ```
9885
9925
 
9886
- From this validation table we see that there are no failing test units. All rows in the
9887
- table are distinct from one another.
9926
+ Printing the `validation` object shows the validation table in an HTML viewing environment.
9927
+ The validation table shows the single entry that corresponds to the validation step created
9928
+ by using `col_pct_null()`. The validation passed since column `a` has no Null values.
9888
9929
 
9889
- We can also use a subset of columns to determine distinctness. Let's specify the subset
9890
- using columns `col_2` and `col_3` for the next validation.
9930
+ Now, let's check that column `b` has exactly 50% Null values.
9891
9931
 
9892
9932
  ```{python}
9893
9933
  validation = (
9894
9934
  pb.Validate(data=tbl)
9895
- .rows_distinct(columns_subset=["col_2", "col_3"])
9935
+ .col_pct_null(columns="b", p=0.5)
9896
9936
  .interrogate()
9897
9937
  )
9898
9938
 
9899
9939
  validation
9900
9940
  ```
9901
9941
 
9902
- The validation table reports two failing test units. The first and second rows are
9903
- duplicated when considering only the values in columns `col_2` and `col_3`. There's only
9904
- one set of duplicates but there are two failing test units since each row is compared to all
9905
- others.
9906
- """
9942
+ This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
9943
+
9944
+ Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
9945
+ we'll check if it's approximately 70% Null with a tolerance of 10%.
9946
+
9947
+ ```{python}
9948
+ validation = (
9949
+ pb.Validate(data=tbl)
9950
+ .col_pct_null(columns="c", p=0.70, tol=0.10)
9951
+ .interrogate()
9952
+ )
9953
+
9954
+ validation
9955
+ ```
9956
+
9957
+ This validation passes because the actual percentage (75%) falls within the acceptable
9958
+ range of 60% to 80% (70% ± 10%).
9959
+
9960
+ The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
9961
+ different ways to specify tolerance using column `b`, which has exactly 50% Null values
9962
+ (4 out of 8 values).
9963
+
9964
+ *Using an absolute tolerance (integer)*: Specify the exact number of rows that can
9965
+ deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
9966
+
9967
+ ```{python}
9968
+ validation = (
9969
+ pb.Validate(data=tbl)
9970
+ .col_pct_null(columns="b", p=0.375, tol=1) # Expect 3 nulls, allow ±1 (range: 2-4)
9971
+ .interrogate()
9972
+ )
9973
+
9974
+ validation
9975
+ ```
9907
9976
 
9977
+ This passes because column `b` has 4 Null values, which falls within the acceptable range
9978
+ of 2 to 4 (3 ± 1).
9979
+
9980
+ *Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
9981
+ expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
9982
+
9983
+ ```{python}
9984
+ validation = (
9985
+ pb.Validate(data=tbl)
9986
+ .col_pct_null(columns="b", p=0.375, tol=0.25) # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
9987
+ .interrogate()
9988
+ )
9989
+
9990
+ validation
9991
+ ```
9992
+
9993
+ This passes because the count of 4 Null values falls within the acceptable range (3 ± 0.75 calculates
9994
+ to 2.25 to 3.75, which rounds down to 2 to 3 rows).
9995
+
9996
+ *Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
9997
+ upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
9998
+ to 2 rows above the expected count.
9999
+
10000
+ ```{python}
10001
+ validation = (
10002
+ pb.Validate(data=tbl)
10003
+ .col_pct_null(columns="b", p=0.25, tol=(0, 2)) # Expect 2 Nulls, allow +0/-2 (range: 2-4)
10004
+ .interrogate()
10005
+ )
10006
+
10007
+ validation
10008
+ ```
10009
+
10010
+ This passes because the count of 4 Null values falls within the acceptable range of 2 to 4.
10011
+
10012
+ *Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
10013
+ bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
10014
+ expected count.
10015
+
10016
+ ```{python}
10017
+ validation = (
10018
+ pb.Validate(data=tbl)
10019
+ .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30%
10020
+ .interrogate()
10021
+ )
10022
+
10023
+ validation
10024
+ ```
10025
+
10026
+ This passes because the count of 4 Null values falls within the acceptable range (3 - 0.3 to 3 + 0.9
10027
+ calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
10028
+ """
9908
10029
  assertion_type = _get_fn_name()
9909
10030
 
9910
- _check_pre(pre=pre)
9911
- # TODO: add check for segments
9912
- # _check_segments(segments=segments)
10031
+ _check_column(column=columns)
9913
10032
  _check_thresholds(thresholds=thresholds)
9914
10033
  _check_boolean_input(param=active, param_name="active")
9915
10034
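Since `columns=` accepts a list (with one validation step generated per column, as the implementation above iterates over the resolved columns), here is a brief sketch reusing the example table from the docstring; no pass/fail outcome is asserted.

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame(
    {
        "a": [1, 2, 3, 4, 5, 6, 7, 8],
        "b": [1, None, 3, None, 5, None, 7, None],
        "c": [None, None, None, None, None, None, 1, 2],
    }
)

# Two validation steps are created, one for column "b" and one for "c",
# each checking for 50% Null values with a ±25% relative tolerance.
validation = (
    pb.Validate(data=tbl)
    .col_pct_null(columns=["b", "c"], p=0.5, tol=0.25)
    .interrogate()
)
```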
 
@@ -9918,31 +10037,38 @@ class Validate:
9918
10037
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
9919
10038
  )
9920
10039
 
9921
- if columns_subset is not None and isinstance(columns_subset, str):
9922
- columns_subset = [columns_subset]
10040
+ # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
10041
+ # resolve the columns
10042
+ if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
10043
+ columns = col(columns)
9923
10044
 
9924
- # TODO: incorporate Column object
10045
+ # If `columns` is Column value or a string, place it in a list for iteration
10046
+ if isinstance(columns, (Column, str)):
10047
+ columns = [columns]
9925
10048
 
9926
10049
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
9927
10050
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
9928
10051
 
9929
- val_info = _ValidationInfo(
9930
- assertion_type=assertion_type,
9931
- column=columns_subset,
9932
- pre=pre,
9933
- segments=segments,
9934
- thresholds=thresholds,
9935
- actions=actions,
9936
- brief=brief,
9937
- active=active,
9938
- )
9939
-
9940
- self._add_validation(validation_info=val_info)
10052
+ bound_finder: Callable[[int], AbsoluteBounds] = partial(_derive_bounds, tol=tol)
9941
10053
 
9942
- return self
9943
-
9944
- def rows_complete(
9945
- self,
10054
+ # Iterate over the columns and create a validation step for each
10055
+ for column in columns:
10056
+ val_info = _ValidationInfo(
10057
+ assertion_type=assertion_type,
10058
+ column=column,
10059
+ values={"p": p, "bound_finder": bound_finder},
10060
+ thresholds=thresholds,
10061
+ actions=actions,
10062
+ brief=brief,
10063
+ active=active,
10064
+ )
10065
+
10066
+ self._add_validation(validation_info=val_info)
10067
+
10068
+ return self
10069
+
10070
+ def rows_distinct(
10071
+ self,
9946
10072
  columns_subset: str | list[str] | None = None,
9947
10073
  pre: Callable | None = None,
9948
10074
  segments: SegmentSpec | None = None,
@@ -9952,19 +10078,19 @@ class Validate:
9952
10078
  active: bool = True,
9953
10079
  ) -> Validate:
9954
10080
  """
9955
- Validate whether row data are complete by having no missing values.
10081
+ Validate whether rows in the table are distinct.
9956
10082
 
9957
- The `rows_complete()` method checks whether rows in the table are complete. Completeness
9958
- of a row means that there are no missing values within the row. This validation will operate
9959
- over the number of test units that is equal to the number of rows in the table (determined
9960
- after any `pre=` mutation has been applied). A subset of columns can be specified for the
9961
- completeness check. If no subset is provided, all columns in the table will be used.
10083
+ The `rows_distinct()` method checks whether rows in the table are distinct. This validation
10084
+ will operate over the number of test units that is equal to the number of rows in the table
10085
+ (determined after any `pre=` mutation has been applied).
9962
10086
 
9963
10087
  Parameters
9964
10088
  ----------
9965
10089
  columns_subset
9966
- A single column or a list of columns to use as a subset for the completeness check. If
9967
- `None` (the default), then all columns in the table will be used.
10090
+ A single column or a list of columns to use as a subset for the distinct comparison.
10091
+ If `None`, then all columns in the table will be used for the comparison. If multiple
10092
+ columns are supplied, the distinct comparison will be made over the combination of
10093
+ values in those columns.
9968
10094
  pre
9969
10095
  An optional preprocessing function or lambda to apply to the data table during
9970
10096
  interrogation. This function should take a table as input and return a modified table.
@@ -10101,48 +10227,48 @@ class Validate:
10101
10227
 
10102
10228
  tbl = pl.DataFrame(
10103
10229
  {
10104
- "col_1": ["a", None, "c", "d"],
10105
- "col_2": ["a", "a", "c", None],
10106
- "col_3": ["a", "a", "d", None],
10230
+ "col_1": ["a", "b", "c", "d"],
10231
+ "col_2": ["a", "a", "c", "d"],
10232
+ "col_3": ["a", "a", "d", "e"],
10107
10233
  }
10108
10234
  )
10109
10235
 
10110
10236
  pb.preview(tbl)
10111
10237
  ```
10112
10238
 
10113
- Let's validate that the rows in the table are complete with `rows_complete()`. We'll
10239
+ Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll
10114
10240
  determine if this validation had any failing test units (there are four test units, one for
10115
- each row). A failing test units means that a given row is not complete (i.e., has at least
10116
- one missing value).
10241
+ each row). A failing test units means that a given row is not distinct from every other row.
10117
10242
 
10118
10243
  ```{python}
10119
10244
  validation = (
10120
10245
  pb.Validate(data=tbl)
10121
- .rows_complete()
10246
+ .rows_distinct()
10122
10247
  .interrogate()
10123
10248
  )
10124
10249
 
10125
10250
  validation
10126
10251
  ```
10127
10252
 
10128
- From this validation table we see that there are two failing test units. This is because
10129
- two rows in the table have at least one missing value (the second row and the last row).
10253
+ From this validation table we see that there are no failing test units. All rows in the
10254
+ table are distinct from one another.
10130
10255
 
10131
- We can also use a subset of columns to determine completeness. Let's specify the subset
10256
+ We can also use a subset of columns to determine distinctness. Let's specify the subset
10132
10257
  using columns `col_2` and `col_3` for the next validation.
10133
10258
 
10134
10259
  ```{python}
10135
10260
  validation = (
10136
10261
  pb.Validate(data=tbl)
10137
- .rows_complete(columns_subset=["col_2", "col_3"])
10262
+ .rows_distinct(columns_subset=["col_2", "col_3"])
10138
10263
  .interrogate()
10139
10264
  )
10140
10265
 
10141
10266
  validation
10142
10267
  ```
10143
10268
 
10144
- The validation table reports a single failing test units. The last row contains missing
10145
- values in both the `col_2` and `col_3` columns.
10269
+ The validation table reports two failing test units. The first and second rows are
10270
+ duplicated when considering only the values in columns `col_2` and `col_3`. There's only
10271
+ one set of duplicates but there are two failing test units since each row is compared to all
10146
10272
  others.
10147
10273
  """
10148
10274
 
@@ -10159,8 +10285,8 @@ class Validate:
10159
10285
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10160
10286
  )
10161
10287
 
10162
- if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover
10163
- columns_subset = [columns_subset] # pragma: no cover
10288
+ if columns_subset is not None and isinstance(columns_subset, str):
10289
+ columns_subset = [columns_subset]
10164
10290
 
10165
10291
  # TODO: incorporate Column object
10166
10292
 
@@ -10182,13 +10308,9 @@ class Validate:
10182
10308
 
10183
10309
  return self
10184
10310
 
10185
- def prompt(
10311
+ def rows_complete(
10186
10312
  self,
10187
- prompt: str,
10188
- model: str,
10189
10313
  columns_subset: str | list[str] | None = None,
10190
- batch_size: int = 1000,
10191
- max_concurrent: int = 3,
10192
10314
  pre: Callable | None = None,
10193
10315
  segments: SegmentSpec | None = None,
10194
10316
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -10197,66 +10319,35 @@ class Validate:
10197
10319
  active: bool = True,
10198
10320
  ) -> Validate:
10199
10321
  """
10200
- Validate rows using AI/LLM-powered analysis.
10201
-
10202
- The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
10203
- based on natural language criteria. Similar to other Pointblank validation methods, this
10204
- generates binary test results (pass/fail) that integrate seamlessly with the standard
10205
- reporting framework.
10206
-
10207
- Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
10208
- instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
10209
- Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
10210
- specify a subset of columns for evaluation using `columns_subset=`.
10211
-
10212
- The system automatically combines your validation criteria from the `prompt=` parameter with
10213
- the necessary technical context, data formatting instructions, and response structure
10214
- requirements. This is all so you only need to focus on describing your validation logic in
10215
- plain language.
10322
+ Validate whether row data are complete by having no missing values.
10216
10323
 
10217
- Each row becomes a test unit that either passes or fails the validation criteria, producing
10218
- the familiar True/False results that appear in Pointblank validation reports. This method
10219
- is particularly useful for complex validation rules that are difficult to express with
10220
- traditional validation methods, such as semantic checks, context-dependent validation, or
10221
- subjective quality assessments.
10324
+ The `rows_complete()` method checks whether rows in the table are complete. Completeness
10325
+ of a row means that there are no missing values within the row. This validation will operate
10326
+ over the number of test units that is equal to the number of rows in the table (determined
10327
+ after any `pre=` mutation has been applied). A subset of columns can be specified for the
10328
+ completeness check. If no subset is provided, all columns in the table will be used.
10222
10329
 
10223
10330
  Parameters
10224
10331
  ----------
10225
- prompt
10226
- A natural language description of the validation criteria. This prompt should clearly
10227
- describe what constitutes valid vs invalid rows. Some examples:
10228
- `"Each row should contain a valid email address and a realistic person name"`,
10229
- `"Values should indicate positive sentiment"`,
10230
- `"The description should mention a country name"`.
10231
10332
  columns_subset
10232
- A single column or list of columns to include in the validation. If `None`, all columns
10233
- will be included. Specifying fewer columns can improve performance and reduce API costs
10234
- so try to include only the columns necessary for the validation.
10235
- model
10236
- The model to be used. This should be in the form of `provider:model` (e.g.,
10237
- `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
10238
- `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
10239
- the provider. Model names are subject to change so consult the provider's documentation
10240
- for the most up-to-date model names.
10241
- batch_size
10242
- Number of rows to process in each batch. Larger batches are more efficient but may hit
10243
- API limits. Default is `1000`.
10244
- max_concurrent
10245
- Maximum number of concurrent API requests. Higher values speed up processing but may
10246
- hit rate limits. Default is `3`.
10333
+ A single column or a list of columns to use as a subset for the completeness check. If
10334
+ `None` (the default), then all columns in the table will be used.
10247
10335
  pre
10248
10336
  An optional preprocessing function or lambda to apply to the data table during
10249
10337
  interrogation. This function should take a table as input and return a modified table.
10338
+ Have a look at the *Preprocessing* section for more information on how to use this
10339
+ argument.
10250
10340
  segments
10251
10341
  An optional directive on segmentation, which serves to split a validation step into
10252
10342
  multiple (one step per segment). Can be a single column name, a tuple that specifies a
10253
10343
  column name and its corresponding values to segment on, or a combination of both
10254
- (provided as a list).
10344
+ (provided as a list). Read the *Segmentation* section for usage information.
10255
10345
  thresholds
10256
10346
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
10257
10347
  The thresholds are set at the step level and will override any global thresholds set in
10258
10348
  `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
10259
- be set locally and global thresholds (if any) will take effect.
10349
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
10350
+ section for information on how to set threshold levels.
10260
10351
  actions
10261
10352
  Optional actions to take when the validation step meets or exceeds any set threshold
10262
10353
  levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
@@ -10277,152 +10368,88 @@ class Validate:
10277
10368
  Validate
10278
10369
  The `Validate` object with the added validation step.
10279
10370
 
10280
- Constructing the `model` Argument
10281
- ---------------------------------
10282
- The `model=` argument should be constructed using the provider and model name separated by a
10283
- colon (`provider:model`). The provider text can any of:
10371
+ Preprocessing
10372
+ -------------
10373
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
10374
+ table during interrogation. This function should take a table as input and return a modified
10375
+ table. This is useful for performing any necessary transformations or filtering on the data
10376
+ before the validation step is applied.
10284
10377
 
10285
- - `"anthropic"` (Anthropic)
10286
- - `"openai"` (OpenAI)
10287
- - `"ollama"` (Ollama)
10288
- - `"bedrock"` (Amazon Bedrock)
10378
+ The preprocessing function can be any callable that takes a table as input and returns a
10379
+ modified table. For example, you could use a lambda function to filter the table based on
10380
+ certain criteria or to apply a transformation to the data. Note that you can refer to
10381
+ columns via `columns_subset=` that are expected to be present in the transformed table, but
10382
+ may not exist in the table before preprocessing. Regarding the lifetime of the transformed
10383
+ table, it only exists during the validation step and is not stored in the `Validate` object
10384
+ or used in subsequent validation steps.
10289
10385
 
10290
- The model name should be the specific model to be used from the provider. Model names are
10291
- subject to change so consult the provider's documentation for the most up-to-date model
10292
- names.
10386
+ Segmentation
10387
+ ------------
10388
+ The `segments=` argument allows for the segmentation of a validation step into multiple
10389
+ segments. This is useful for applying the same validation step to different subsets of the
10390
+ data. The segmentation can be done based on a single column or specific fields within a
10391
+ column.
10293
10392
 
10294
- Notes on Authentication
10295
- -----------------------
10296
- API keys are automatically loaded from environment variables or `.env` files and are **not**
10297
- stored in the validation object for security reasons. You should consider using a secure
10298
- method for handling API keys.
10393
+ Providing a single column name will result in a separate validation step for each unique
10394
+ value in that column. For example, if you have a column called `"region"` with values
10395
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
10396
+ region.
10299
10397
 
10300
- One way to do this is to load the API key from an environment variable and retrieve it using
10301
- the `os` module (specifically the `os.getenv()` function). Places to store the API key might
10302
- include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
10398
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
10399
+ values to segment on. For example, if you have a column called `"date"` and you want to
10400
+ segment on only specific dates, you can provide a tuple like
10401
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
10402
+ (i.e., no validation steps will be created for them).
10303
10403
 
10304
- Another solution is to store one or more model provider API keys in an `.env` file (in the
10305
- root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
10306
- `OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
10307
- file. An `.env` file might look like this:
10404
+ A list with a combination of column names and tuples can be provided as well. This allows
10405
+ for more complex segmentation scenarios. The following inputs are both valid:
10308
10406
 
10309
- ```plaintext
10310
- ANTHROPIC_API_KEY="your_anthropic_api_key_here"
10311
- OPENAI_API_KEY="your_openai_api_key_here"
10312
10407
  ```
10408
+ # Segments from all unique values in the `region` column
10409
+ # and specific dates in the `date` column
10410
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
10313
10411
 
10314
- There's no need to have the `python-dotenv` package installed when using `.env` files in
10315
- this way.
10412
+ # Segments from all unique values in the `region` and `date` columns
10413
+ segments=["region", "date"]
10414
+ ```
10316
10415
 
10317
- **Provider-specific setup**:
10416
+ The segmentation is performed during interrogation, and the resulting validation steps will
10417
+ be numbered sequentially. Each segment will have its own validation step, and the results
10418
+ will be reported separately. This allows for a more granular analysis of the data and helps
10419
+ identify issues within specific segments.
10318
10420
 
10319
- - **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
10320
- - **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
10321
- - **Ollama**: no API key required, just ensure Ollama is running locally
10322
- - **Bedrock**: configure AWS credentials through standard AWS methods
10421
+ Importantly, the segmentation process will be performed after any preprocessing of the data
10422
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
10423
+ that can be used for segmentation. For example, you could create a new column called
10424
+ `"segment"` through use of `pre=` and then use that column for segmentation.
10323
10425
 
10324
- AI Validation Process
10325
- ---------------------
10326
- The AI validation process works as follows:
10426
+ Thresholds
10427
+ ----------
10428
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
10429
+ step. If they are set here at the step level, these thresholds will override any thresholds
10430
+ set at the global level in `Validate(thresholds=...)`.
10327
10431
 
10328
- 1. data batching: the data is split into batches of the specified size
10329
- 2. row deduplication: duplicate rows (based on selected columns) are identified and only
10330
- unique combinations are sent to the LLM for analysis
10331
- 3. json conversion: each batch of unique rows is converted to JSON format for the LLM
10332
- 4. prompt construction: the user prompt is embedded in a structured system prompt
10333
- 5. llm processing: each batch is sent to the LLM for analysis
10334
- 6. response parsing: LLM responses are parsed to extract validation results
10335
- 7. result projection: results are mapped back to all original rows using row signatures
10336
- 8. result aggregation: results from all batches are combined
10432
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
10433
+ can either be set as a proportion of all test units that fail (a value between `0` and `1`),
10434
+ or the absolute number of failing test units (as an integer that's `1` or greater).
10337
10435
 
10338
- **Performance Optimization**: the process uses row signature memoization to avoid redundant
10339
- LLM calls. When multiple rows have identical values in the selected columns, only one
10340
- representative row is validated, and the result is applied to all matching rows. This can
10341
- dramatically reduce API costs and processing time for datasets with repetitive patterns.
10436
+ Thresholds can be defined using one of these input schemes:
10342
10437
 
10343
- The LLM receives data in this JSON format:
10438
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
10439
+ thresholds)
10440
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
10441
+ the 'error' level, and position `2` is the 'critical' level
10442
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
10443
+ 'critical'
10444
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
10445
+ for the 'warning' level only
10344
10446
 
10345
- ```json
10346
- {
10347
- "columns": ["col1", "col2", "col3"],
10348
- "rows": [
10349
- {"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
10350
- {"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
10351
- ]
10352
- }
10353
- ```
10447
+ If the number of failing test units exceeds set thresholds, the validation step will be
10448
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
10449
+ set; you're free to set any combination of them.
10354
10450
 
10355
- The LLM returns validation results in this format:
10356
- ```json
10357
- [
10358
- {"index": 0, "result": true},
10359
- {"index": 1, "result": false}
10360
- ]
10361
- ```
10362
-
10363
- Prompt Design Tips
10364
- ------------------
10365
- For best results, design prompts that are:
10366
-
10367
- - boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
10368
- - specific: clearly define what makes a row valid/invalid
10369
- - unambiguous: avoid subjective language that could be interpreted differently
10370
- - context-aware: include relevant business rules or domain knowledge
10371
- - example-driven: consider providing examples in the prompt when helpful
10372
-
10373
- **Critical**: Prompts must be designed so the LLM can determine whether each row passes or
10374
- fails the validation criteria. The system expects binary validation responses, so avoid
10375
- open-ended questions or prompts that might generate explanatory text instead of clear
10376
- pass/fail judgments.
10377
-
10378
- Good prompt examples:
10379
-
10380
- - "Each row should contain a valid email address in the 'email' column and a non-empty name
10381
- in the 'name' column"
10382
- - "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
10383
- etc.)"
10384
- - "Product descriptions should mention at least one technical specification"
10385
-
10386
- Poor prompt examples (avoid these):
10387
-
10388
- - "What do you think about this data?" (too open-ended)
10389
- - "Describe the quality of each row" (asks for description, not validation)
10390
- - "How would you improve this data?" (asks for suggestions, not pass/fail)
10391
-
10392
- Performance Considerations
10393
- --------------------------
10394
- AI validation is significantly slower than traditional validation methods due to API calls
10395
- to LLM providers. However, performance varies dramatically based on data characteristics:
10396
-
10397
- **High Memoization Scenarios** (seconds to minutes):
10398
-
10399
- - data with many duplicate rows in the selected columns
10400
- - low cardinality data (repeated patterns)
10401
- - small number of unique row combinations
10402
-
10403
- **Low Memoization Scenarios** (minutes to hours):
10404
-
10405
- - high cardinality data with mostly unique rows
10406
- - large datasets with few repeated patterns
10407
- - all or most rows requiring individual LLM evaluation
10408
-
10409
- The row signature memoization optimization can reduce processing time significantly when
10410
- data has repetitive patterns. For datasets where every row is unique, expect longer
10411
- processing times similar to validating each row individually.
10412
-
10413
- **Strategies to Reduce Processing Time**:
10414
-
10415
- - test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
10416
- and use `pre=sample_1000` to validate on smaller samples
10417
- - filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
10418
- and use `pre=active_only` to focus on a specific subset
10419
- - optimize column selection: use `columns_subset=` to include only the columns necessary
10420
- for validation
10421
- - start with smaller batches: begin with `batch_size=100` for testing, then increase
10422
- gradually
10423
- - reduce concurrency: lower `max_concurrent=1` if hitting rate limits
10424
- - use faster/cheaper models: consider using smaller or more efficient models for initial
10425
- testing before switching to more capable models
10451
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
10452
+ take for each level of failure (using the `actions=` parameter).
10426
10453
 
10427
10454
  Examples
10428
10455
  --------
@@ -10432,139 +10459,84 @@ class Validate:
10432
10459
  import pointblank as pb
10433
10460
  pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
10434
10461
  ```
10435
- The following examples demonstrate how to use AI validation for different types of data
10436
- quality checks. These examples show both basic usage and more advanced configurations with
10437
- custom thresholds and actions.
10438
-
10439
- **Basic AI validation example:**
10440
-
10441
- This first example shows a simple validation scenario where we want to check that customer
10442
- records have both valid email addresses and non-empty names. Notice how we use
10443
- `columns_subset=` to focus only on the relevant columns, which improves both performance
10444
- and cost-effectiveness.
10462
+ For the examples here, we'll use a simple Polars DataFrame with three string columns
10463
+ (`col_1`, `col_2`, and `col_3`). The table is shown below:
10445
10464
 
10446
- ```python
10465
+ ```{python}
10447
10466
  import pointblank as pb
10448
10467
  import polars as pl
10449
10468
 
10450
- # Sample data with email and name columns
10451
- tbl = pl.DataFrame({
10452
- "email": ["john@example.com", "invalid-email", "jane@test.org"],
10453
- "name": ["John Doe", "", "Jane Smith"],
10454
- "age": [25, 30, 35]
10455
- })
10469
+ tbl = pl.DataFrame(
10470
+ {
10471
+ "col_1": ["a", None, "c", "d"],
10472
+ "col_2": ["a", "a", "c", None],
10473
+ "col_3": ["a", "a", "d", None],
10474
+ }
10475
+ )
10456
10476
 
10457
- # Validate using AI
10477
+ pb.preview(tbl)
10478
+ ```
10479
+
10480
+ Let's validate that the rows in the table are complete with `rows_complete()`. We'll
10481
+ determine if this validation had any failing test units (there are four test units, one for
10482
+ each row). A failing test unit means that a given row is not complete (i.e., has at least
10483
+ one missing value).
10484
+
10485
+ ```{python}
10458
10486
  validation = (
10459
10487
  pb.Validate(data=tbl)
10460
- .prompt(
10461
- prompt="Each row should have a valid email address and a non-empty name",
10462
- columns_subset=["email", "name"], # Only check these columns
10463
- model="openai:gpt-4o-mini",
10464
- )
10488
+ .rows_complete()
10465
10489
  .interrogate()
10466
10490
  )
10467
10491
 
10468
10492
  validation
10469
10493
  ```
10470
10494
 
10471
- In this example, the AI will identify that the second row fails validation because it has
10472
- an invalid email format (`"invalid-email"`) and the third row also fails because it has an
10473
- empty name field. The validation results will show 2 out of 3 rows failing the criteria.
10474
-
10475
- **Advanced example with custom thresholds:**
10476
-
10477
- This more sophisticated example demonstrates how to use AI validation with custom thresholds
10478
- and actions. Here we're validating phone number formats to ensure they include area codes,
10479
- which is a common data quality requirement for customer contact information.
10495
+ From this validation table we see that there are two failing test units. This is because
10496
+ two rows in the table have at least one missing value (the second row and the last row).
10480
10497
 
10481
- ```python
10482
- customer_data = pl.DataFrame({
10483
- "customer_id": [1, 2, 3, 4, 5],
10484
- "name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
10485
- "phone_number": [
10486
- "(555) 123-4567", # Valid with area code
10487
- "555-987-6543", # Valid with area code
10488
- "123-4567", # Missing area code
10489
- "(800) 555-1234", # Valid with area code
10490
- "987-6543" # Missing area code
10491
- ]
10492
- })
10498
+ We can also use a subset of columns to determine completeness. Let's specify the subset
10499
+ using columns `col_2` and `col_3` for the next validation.
10493
10500
 
10501
+ ```{python}
10494
10502
  validation = (
10495
- pb.Validate(data=customer_data)
10496
- .prompt(
10497
- prompt="Do all the phone numbers include an area code?",
10498
- columns_subset="phone_number", # Only check the `phone_number` column
10499
- model="openai:gpt-4o",
10500
- batch_size=500,
10501
- max_concurrent=5,
10502
- thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
10503
- actions=pb.Actions(error="Too many phone numbers missing area codes.")
10504
- )
10503
+ pb.Validate(data=tbl)
10504
+ .rows_complete(columns_subset=["col_2", "col_3"])
10505
10505
  .interrogate()
10506
10506
  )
10507
+
10508
+ validation
10507
10509
  ```
10508
10510
 
10509
- This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
10510
- which exceeds all threshold levels. The validation will trigger the specified error action
10511
- since the failure rate (40%) is above the error threshold (20%). The AI can recognize
10512
- various phone number formats and determine whether they include area codes.
10511
+ The validation table reports a single failing test unit. The last row contains missing
10512
+ values in both the `col_2` and `col_3` columns.
10513
10514
  """
10514
10515
 
10515
10516
  assertion_type = _get_fn_name()
10516
10517
 
10517
- # Validation of inputs
10518
- if not isinstance(prompt, str) or not prompt.strip():
10519
- raise ValueError("prompt must be a non-empty string")
10520
-
10521
- # Parse the provider and model name from the `model=` argument
10522
- try:
10523
- provider, model_name = model.split(sep=":", maxsplit=1)
10524
- except ValueError:
10525
- raise ValueError(f"Model must be in format 'provider:model_name', got: {model}")
10526
-
10527
- # Error if an unsupported provider is used
10528
- if provider not in MODEL_PROVIDERS:
10529
- raise ValueError(
10530
- f"Unsupported provider: {provider}. Supported providers are {MODEL_PROVIDERS}."
10531
- )
10532
-
10533
- # Ensure that `batch_size` and `max_concurrent` are positive integers
10534
- if not isinstance(batch_size, int) or batch_size < 1:
10535
- raise ValueError("batch_size must be a positive integer")
10536
- if not isinstance(max_concurrent, int) or max_concurrent < 1:
10537
- raise ValueError("max_concurrent must be a positive integer")
10538
-
10539
10518
  _check_pre(pre=pre)
10519
+ # TODO: add check for segments
10520
+ # _check_segments(segments=segments)
10540
10521
  _check_thresholds(thresholds=thresholds)
10541
10522
  _check_boolean_input(param=active, param_name="active")
10542
10523
 
10543
- # Promote a single column given as a string to a list
10544
- if columns_subset is not None and isinstance(columns_subset, str):
10545
- columns_subset = [columns_subset]
10546
-
10547
10524
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10548
10525
  thresholds = (
10549
10526
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10550
10527
  )
10551
10528
 
10529
+ if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover
10530
+ columns_subset = [columns_subset] # pragma: no cover
10531
+
10532
+ # TODO: incorporate Column object
10533
+
10552
10534
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
10553
10535
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
10554
10536
 
10555
- # Package up the AI-specific parameters as a dictionary for later use
10556
- ai_config = {
10557
- "prompt": prompt,
10558
- "llm_provider": provider,
10559
- "llm_model": model_name,
10560
- "batch_size": batch_size,
10561
- "max_concurrent": max_concurrent,
10562
- }
10563
-
10564
10537
  val_info = _ValidationInfo(
10565
10538
  assertion_type=assertion_type,
10566
10539
  column=columns_subset,
10567
- values=ai_config,
10568
10540
  pre=pre,
10569
10541
  segments=segments,
10570
10542
  thresholds=thresholds,
@@ -10577,66 +10549,81 @@ class Validate:
10577
10549
 
10578
10550
  return self
10579
10551
 
10580
- def col_schema_match(
10552
+ def prompt(
10581
10553
  self,
10582
- schema: Schema,
10583
- complete: bool = True,
10584
- in_order: bool = True,
10585
- case_sensitive_colnames: bool = True,
10586
- case_sensitive_dtypes: bool = True,
10587
- full_match_dtypes: bool = True,
10554
+ prompt: str,
10555
+ model: str,
10556
+ columns_subset: str | list[str] | None = None,
10557
+ batch_size: int = 1000,
10558
+ max_concurrent: int = 3,
10588
10559
  pre: Callable | None = None,
10560
+ segments: SegmentSpec | None = None,
10589
10561
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
10590
10562
  actions: Actions | None = None,
10591
10563
  brief: str | bool | None = None,
10592
10564
  active: bool = True,
10593
10565
  ) -> Validate:
10594
10566
  """
10595
- Do columns in the table (and their types) match a predefined schema?
10567
+ Validate rows using AI/LLM-powered analysis.
10596
10568
 
10597
- The `col_schema_match()` method works in conjunction with an object generated by the
10598
- [`Schema`](`pointblank.Schema`) class. That class object is the expectation for the actual
10599
- schema of the target table. The validation step operates over a single test unit, which is
10600
- whether the schema matches that of the table (within the constraints enforced by the
10601
- `complete=`, and `in_order=` options).
10569
+ The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
10570
+ based on natural language criteria. Similar to other Pointblank validation methods, this
10571
+ generates binary test results (pass/fail) that integrate seamlessly with the standard
10572
+ reporting framework.
10573
+
10574
+ Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
10575
+ instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
10576
+ Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
10577
+ specify a subset of columns for evaluation using `columns_subset=`.
10578
+
10579
+ The system automatically combines your validation criteria from the `prompt=` parameter with
10580
+ the necessary technical context, data formatting instructions, and response structure
10581
+ requirements. This is all so you only need to focus on describing your validation logic in
10582
+ plain language.
10583
+
10584
+ Each row becomes a test unit that either passes or fails the validation criteria, producing
10585
+ the familiar True/False results that appear in Pointblank validation reports. This method
10586
+ is particularly useful for complex validation rules that are difficult to express with
10587
+ traditional validation methods, such as semantic checks, context-dependent validation, or
10588
+ subjective quality assessments.
10602
10589
 
10603
10590
  Parameters
10604
10591
  ----------
10605
- schema
10606
- A `Schema` object that represents the expected schema of the table. This object is
10607
- generated by the [`Schema`](`pointblank.Schema`) class.
10608
- complete
10609
- Should the schema match be complete? If `True`, then the target table must have all
10610
- columns specified in the schema. If `False`, then the table can have additional columns
10611
- not in the schema (i.e., the schema is a subset of the target table's columns).
10612
- in_order
10613
- Should the schema match be in order? If `True`, then the columns in the schema must
10614
- appear in the same order as they do in the target table. If `False`, then the order of
10615
- columns in the schema and the target table can differ.
10616
- case_sensitive_colnames
10617
- Should the schema match be case-sensitive with regard to column names? If `True`, then
10618
- the column names in the schema and the target table must match exactly. If `False`, then
10619
- the column names are compared in a case-insensitive manner.
10620
- case_sensitive_dtypes
10621
- Should the schema match be case-sensitive with regard to column data types? If `True`,
10622
- then the column data types in the schema and the target table must match exactly. If
10623
- `False`, then the column data types are compared in a case-insensitive manner.
10624
- full_match_dtypes
10625
- Should the schema match require a full match of data types? If `True`, then the column
10626
- data types in the schema and the target table must match exactly. If `False` then
10627
- substring matches are allowed, so a schema data type of `Int` would match a target table
10628
- data type of `Int64`.
10592
+ prompt
10593
+ A natural language description of the validation criteria. This prompt should clearly
10594
+ describe what constitutes valid vs invalid rows. Some examples:
10595
+ `"Each row should contain a valid email address and a realistic person name"`,
10596
+ `"Values should indicate positive sentiment"`,
10597
+ `"The description should mention a country name"`.
10598
+ columns_subset
10599
+ A single column or list of columns to include in the validation. If `None`, all columns
10600
+ will be included. Specifying fewer columns can improve performance and reduce API costs,
10601
+ so try to include only the columns necessary for the validation.
10602
+ model
10603
+ The model to be used. This should be in the form of `provider:model` (e.g.,
10604
+ `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
10605
+ `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
10606
+ the provider. Model names are subject to change, so consult the provider's documentation
10607
+ for the most up-to-date model names.
10608
+ batch_size
10609
+ Number of rows to process in each batch. Larger batches are more efficient but may hit
10610
+ API limits. Default is `1000`.
10611
+ max_concurrent
10612
+ Maximum number of concurrent API requests. Higher values speed up processing but may
10613
+ hit rate limits. Default is `3`.
10629
10614
  pre
10630
10615
  An optional preprocessing function or lambda to apply to the data table during
10631
10616
  interrogation. This function should take a table as input and return a modified table.
10632
- Have a look at the *Preprocessing* section for more information on how to use this
10633
- argument.
10617
+ segments
10618
+ An optional directive on segmentation, which serves to split a validation step into
10619
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
10620
+ column name and its corresponding values to segment on, or a combination of both
10621
+ (provided as a list).
10634
10622
  thresholds
10635
10623
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
10636
10624
  The thresholds are set at the step level and will override any global thresholds set in
10637
10625
  `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
10638
- be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
10639
- section for information on how to set threshold levels.
10626
+ be set locally and global thresholds (if any) will take effect.
10640
10627
  actions
10641
10628
  Optional actions to take when the validation step meets or exceeds any set threshold
10642
10629
  levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
@@ -10657,154 +10644,314 @@ class Validate:
10657
10644
  Validate
10658
10645
  The `Validate` object with the added validation step.
10659
10646
 
10660
- Preprocessing
10661
- -------------
10662
- The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
10663
- table during interrogation. This function should take a table as input and return a modified
10664
- table. This is useful for performing any necessary transformations or filtering on the data
10665
- before the validation step is applied.
10647
+ Constructing the `model` Argument
10648
+ ---------------------------------
10649
+ The `model=` argument should be constructed using the provider and model name separated by a
10650
+ colon (`provider:model`). The provider text can be any of:
10666
10651
 
10667
- The preprocessing function can be any callable that takes a table as input and returns a
10668
- modified table. Regarding the lifetime of the transformed table, it only exists during the
10669
- validation step and is not stored in the `Validate` object or used in subsequent validation
10670
- steps.
10652
+ - `"anthropic"` (Anthropic)
10653
+ - `"openai"` (OpenAI)
10654
+ - `"ollama"` (Ollama)
10655
+ - `"bedrock"` (Amazon Bedrock)
10671
10656
 
10672
- Thresholds
10673
- ----------
10674
- The `thresholds=` parameter is used to set the failure-condition levels for the validation
10675
- step. If they are set here at the step level, these thresholds will override any thresholds
10676
- set at the global level in `Validate(thresholds=...)`.
10657
+ The model name should be the specific model to be used from the provider. Model names are
10658
+ subject to change, so consult the provider's documentation for the most up-to-date model
10659
+ names.
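+
+ Since the string is split on the first colon only, anything after that colon is kept as
+ the model name. A quick sketch of how such a string is taken apart (purely illustrative;
+ this mirrors the parsing described above rather than any public helper):
+
+ ```python
+ model = "anthropic:claude-sonnet-4-5"
+
+ # Split on the first ":" only; everything after it is the model name
+ provider, model_name = model.split(":", 1)
+ # provider == "anthropic", model_name == "claude-sonnet-4-5"
+ ```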
10677
10660
 
10678
- There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
10679
- can either be set as a proportion failing of all test units (a value between `0` to `1`),
10680
- or, the absolute number of failing test units (as integer that's `1` or greater).
10661
+ Notes on Authentication
10662
+ -----------------------
10663
+ API keys are automatically loaded from environment variables or `.env` files and are **not**
10664
+ stored in the validation object for security reasons. You should consider using a secure
10665
+ method for handling API keys.
10681
10666
 
10682
- Thresholds can be defined using one of these input schemes:
10667
+ One way to do this is to load the API key from an environment variable and retrieve it using
10668
+ the `os` module (specifically the `os.getenv()` function). Places to store the API key might
10669
+ include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
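+
+ For example, with a line like `export OPENAI_API_KEY="..."` added to one of those shell
+ startup files, the key can be read back in Python using only the standard library (a
+ minimal sketch):
+
+ ```python
+ import os
+
+ # Returns the key string if the environment variable is set, otherwise None
+ api_key = os.getenv("OPENAI_API_KEY")
+ ```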
10683
10670
 
10684
- 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
10685
- thresholds)
10686
- 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
10687
- the 'error' level, and position `2` is the 'critical' level
10688
- 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
10689
- 'critical'
10690
- 4. a single integer/float value denoting absolute number or fraction of failing test units
10691
- for the 'warning' level only
10671
+ Another solution is to store one or more model provider API keys in an `.env` file (in the
10672
+ root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
10673
+ `OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
10674
+ file. An `.env` file might look like this:
10692
10675
 
10693
- If the number of failing test units exceeds set thresholds, the validation step will be
10694
- marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
10695
- set, you're free to set any combination of them.
10676
+ ```plaintext
10677
+ ANTHROPIC_API_KEY="your_anthropic_api_key_here"
10678
+ OPENAI_API_KEY="your_openai_api_key_here"
10679
+ ```
10696
10680
 
10697
- Aside from reporting failure conditions, thresholds can be used to determine the actions to
10698
- take for each level of failure (using the `actions=` parameter).
10681
+ There's no need to have the `python-dotenv` package installed when using `.env` files in
10682
+ this way.
10699
10683
 
10700
- Examples
10701
- --------
10702
- ```{python}
10703
- #| echo: false
10704
- #| output: false
10705
- import pointblank as pb
10706
- pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
10707
- ```
10684
+ **Provider-specific setup**:
10708
10685
 
10709
- For the examples here, we'll use a simple Polars DataFrame with three columns (string,
10710
- integer, and float). The table is shown below:
10686
+ - **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
10687
+ - **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
10688
+ - **Ollama**: no API key required, just ensure Ollama is running locally
10689
+ - **Bedrock**: configure AWS credentials through standard AWS methods
10711
10690
 
10712
- ```{python}
10713
- import pointblank as pb
10714
- import polars as pl
10691
+ AI Validation Process
10692
+ ---------------------
10693
+ The AI validation process works as follows:
10715
10694
 
10716
- tbl = pl.DataFrame(
10717
- {
10718
- "a": ["apple", "banana", "cherry", "date"],
10719
- "b": [1, 6, 3, 5],
10720
- "c": [1.1, 2.2, 3.3, 4.4],
10721
- }
10722
- )
10695
+ 1. data batching: the data is split into batches of the specified size
10696
+ 2. row deduplication: duplicate rows (based on selected columns) are identified and only
10697
+ unique combinations are sent to the LLM for analysis
10698
+ 3. json conversion: each batch of unique rows is converted to JSON format for the LLM
10699
+ 4. prompt construction: the user prompt is embedded in a structured system prompt
10700
+ 5. llm processing: each batch is sent to the LLM for analysis
10701
+ 6. response parsing: LLM responses are parsed to extract validation results
10702
+ 7. result projection: results are mapped back to all original rows using row signatures
10703
+ 8. result aggregation: results from all batches are combined
10723
10704
 
10724
- pb.preview(tbl)
10725
- ```
10705
+ **Performance Optimization**: the process uses row signature memoization to avoid redundant
10706
+ LLM calls. When multiple rows have identical values in the selected columns, only one
10707
+ representative row is validated, and the result is applied to all matching rows. This can
10708
+ dramatically reduce API costs and processing time for datasets with repetitive patterns.
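+
+ A rough sketch of the memoization idea described above (illustrative only; the
+ `validate_row()` helper is a stand-in for the per-row LLM judgment and is not part of the
+ package API):
+
+ ```python
+ # Stand-in for the LLM judgment applied to one unique row signature
+ def validate_row(row: tuple) -> bool:
+     return "@" in row[0] and bool(row[1])
+
+ rows = [("a@x.com", "Ann"), ("a@x.com", "Ann"), ("b@y.com", "")]
+
+ # Cache one verdict per unique row signature; duplicate rows reuse it
+ verdicts: dict[tuple, bool] = {}
+ results = []
+ for row in rows:
+     if row not in verdicts:          # only unseen signatures get a judgment
+         verdicts[row] = validate_row(row)
+     results.append(verdicts[row])
+ # results == [True, True, False]; validate_row() ran only twice
+ ```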
10726
10709
 
10727
- Let's validate that the columns in the table match a predefined schema. A schema can be
10728
- defined using the [`Schema`](`pointblank.Schema`) class.
10710
+ The LLM receives data in this JSON format:
10729
10711
 
10730
- ```{python}
10731
- schema = pb.Schema(
10732
- columns=[("a", "String"), ("b", "Int64"), ("c", "Float64")]
10733
- )
10712
+ ```json
10713
+ {
10714
+ "columns": ["col1", "col2", "col3"],
10715
+ "rows": [
10716
+ {"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
10717
+ {"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
10718
+ ]
10719
+ }
10734
10720
  ```
10735
10721
 
10736
- You can print the schema object to verify that the expected schema is as intended.
10737
-
10738
- ```{python}
10739
- print(schema)
10722
+ The LLM returns validation results in this format:
10723
+ ```json
10724
+ [
10725
+ {"index": 0, "result": true},
10726
+ {"index": 1, "result": false}
10727
+ ]
10740
10728
  ```
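+
+ As a purely illustrative sketch, a response in that shape can be turned back into per-row
+ pass/fail flags with nothing more than the standard library:
+
+ ```python
+ import json
+
+ response = '[{"index": 0, "result": true}, {"index": 1, "result": false}]'
+
+ # Map each returned index to its boolean verdict
+ verdict_by_index = {item["index"]: item["result"] for item in json.loads(response)}
+ # verdict_by_index == {0: True, 1: False}
+ ```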
10741
10729
 
10742
- Now, we'll use the `col_schema_match()` method to validate the table against the expected
10743
- `schema` object. There is a single test unit for this validation step (whether the schema
10744
- matches the table or not).
10730
+ Prompt Design Tips
10731
+ ------------------
10732
+ For best results, design prompts that are:
10745
10733
 
10746
- ```{python}
10747
- validation = (
10748
- pb.Validate(data=tbl)
10749
- .col_schema_match(schema=schema)
10750
- .interrogate()
10751
- )
10734
+ - boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
10735
+ - specific: clearly define what makes a row valid/invalid
10736
+ - unambiguous: avoid subjective language that could be interpreted differently
10737
+ - context-aware: include relevant business rules or domain knowledge
10738
+ - example-driven: consider providing examples in the prompt when helpful
10752
10739
 
10753
- validation
10754
- ```
10740
+ **Critical**: Prompts must be designed so the LLM can determine whether each row passes or
10741
+ fails the validation criteria. The system expects binary validation responses, so avoid
10742
+ open-ended questions or prompts that might generate explanatory text instead of clear
10743
+ pass/fail judgments.
10755
10744
 
10756
- The validation table shows that the schema matches the table. The single test unit passed
10757
- since the table columns and their types match the schema.
10758
- """
10745
+ Good prompt examples:
10759
10746
 
10760
- assertion_type = _get_fn_name()
10747
+ - "Each row should contain a valid email address in the 'email' column and a non-empty name
10748
+ in the 'name' column"
10749
+ - "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
10750
+ etc.)"
10751
+ - "Product descriptions should mention at least one technical specification"
10761
10752
 
10762
- _check_pre(pre=pre)
10763
- _check_thresholds(thresholds=thresholds)
10764
- _check_boolean_input(param=active, param_name="active")
10765
- _check_boolean_input(param=complete, param_name="complete")
10766
- _check_boolean_input(param=in_order, param_name="in_order")
10767
- _check_boolean_input(param=case_sensitive_colnames, param_name="case_sensitive_colnames")
10768
- _check_boolean_input(param=case_sensitive_dtypes, param_name="case_sensitive_dtypes")
10769
- _check_boolean_input(param=full_match_dtypes, param_name="full_match_dtypes")
10753
+ Poor prompt examples (avoid these):
10770
10754
 
10771
- # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10772
- thresholds = (
10773
- self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10774
- )
10755
+ - "What do you think about this data?" (too open-ended)
10756
+ - "Describe the quality of each row" (asks for description, not validation)
10757
+ - "How would you improve this data?" (asks for suggestions, not pass/fail)
10775
10758
 
10776
- # Package up the `schema=` and boolean params into a dictionary for later interrogation
10777
- values = {
10778
- "schema": schema,
10779
- "complete": complete,
10780
- "in_order": in_order,
10781
- "case_sensitive_colnames": case_sensitive_colnames,
10782
- "case_sensitive_dtypes": case_sensitive_dtypes,
10783
- "full_match_dtypes": full_match_dtypes,
10784
- }
10759
+ Performance Considerations
10760
+ --------------------------
10761
+ AI validation is significantly slower than traditional validation methods due to API calls
10762
+ to LLM providers. However, performance varies dramatically based on data characteristics:
10785
10763
 
10786
- # Determine brief to use (global or local) and transform any shorthands of `brief=`
10787
- brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
10764
+ **High Memoization Scenarios** (seconds to minutes):
10788
10765
 
10789
- val_info = _ValidationInfo(
10790
- assertion_type=assertion_type,
10791
- values=values,
10792
- pre=pre,
10793
- thresholds=thresholds,
10794
- actions=actions,
10795
- brief=brief,
10796
- active=active,
10797
- )
10766
+ - data with many duplicate rows in the selected columns
10767
+ - low cardinality data (repeated patterns)
10768
+ - small number of unique row combinations
10798
10769
 
10799
- self._add_validation(validation_info=val_info)
10770
+ **Low Memoization Scenarios** (minutes to hours):
10800
10771
 
10801
- return self
10772
+ - high cardinality data with mostly unique rows
10773
+ - large datasets with few repeated patterns
10774
+ - all or most rows requiring individual LLM evaluation
10802
10775
 
10803
- def row_count_match(
10804
- self,
10805
- count: int | FrameT | Any,
10806
- tol: Tolerance = 0,
10807
- inverse: bool = False,
10776
+ The row signature memoization optimization can reduce processing time significantly when
10777
+ data has repetitive patterns. For datasets where every row is unique, expect longer
10778
+ processing times similar to validating each row individually.
10779
+
10780
+ **Strategies to Reduce Processing Time**:
10781
+
10782
+ - test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
10783
+ and use `pre=sample_1000` to validate on smaller samples
10784
+ - filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
10785
+ and use `pre=active_only` to focus on a specific subset
10786
+ - optimize column selection: use `columns_subset=` to include only the columns necessary
10787
+ for validation
10788
+ - start with smaller batches: begin with `batch_size=100` for testing, then increase
10789
+ gradually
10790
+ - reduce concurrency: lower `max_concurrent=1` if hitting rate limits
10791
+ - use faster/cheaper models: consider using smaller or more efficient models for initial
10792
+ testing before switching to more capable models
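+
+ Combining a few of these strategies into one minimal sketch (the data, prompt, and column
+ names below are placeholders):
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl = pl.DataFrame(
+     {
+         "description": ["Blue widget, 5 V supply", "Legacy item, details unknown"],
+         "status": ["in_stock", "discontinued"],
+     }
+ )
+
+ def first_1000(df):
+     # Validate a small slice first to gauge cost and prompt quality
+     return df.head(1000)
+
+ validation = (
+     pb.Validate(data=tbl)
+     .prompt(
+         prompt="Each row should describe a product that is currently in stock",
+         columns_subset=["description", "status"],  # only the columns the prompt needs
+         model="openai:gpt-4o-mini",
+         batch_size=100,     # start small, then increase gradually
+         max_concurrent=1,   # lower concurrency if rate limits are hit
+         pre=first_1000,
+     )
+     .interrogate()
+ )
+ ```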
10793
+
10794
+ Examples
10795
+ --------
10796
+ ```{python}
10797
+ #| echo: false
10798
+ #| output: false
10799
+ import pointblank as pb
10800
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
10801
+ ```
10802
+ The following examples demonstrate how to use AI validation for different types of data
10803
+ quality checks. These examples show both basic usage and more advanced configurations with
10804
+ custom thresholds and actions.
10805
+
10806
+ **Basic AI validation example:**
10807
+
10808
+ This first example shows a simple validation scenario where we want to check that customer
10809
+ records have both valid email addresses and non-empty names. Notice how we use
10810
+ `columns_subset=` to focus only on the relevant columns, which improves both performance
10811
+ and cost-effectiveness.
10812
+
10813
+ ```python
10814
+ import pointblank as pb
10815
+ import polars as pl
10816
+
10817
+ # Sample data with email and name columns
10818
+ tbl = pl.DataFrame({
10819
+ "email": ["john@example.com", "invalid-email", "jane@test.org"],
10820
+ "name": ["John Doe", "", "Jane Smith"],
10821
+ "age": [25, 30, 35]
10822
+ })
10823
+
10824
+ # Validate using AI
10825
+ validation = (
10826
+ pb.Validate(data=tbl)
10827
+ .prompt(
10828
+ prompt="Each row should have a valid email address and a non-empty name",
10829
+ columns_subset=["email", "name"], # Only check these columns
10830
+ model="openai:gpt-4o-mini",
10831
+ )
10832
+ .interrogate()
10833
+ )
10834
+
10835
+ validation
10836
+ ```
10837
+
10838
+ In this example, the AI will identify that the second row fails validation because it has
10839
+ an invalid email format (`"invalid-email"`) as well as an empty name field. The validation
10840
+ results will show 1 out of 3 rows failing the criteria.
10841
+
10842
+ **Advanced example with custom thresholds:**
10843
+
10844
+ This more sophisticated example demonstrates how to use AI validation with custom thresholds
10845
+ and actions. Here we're validating phone number formats to ensure they include area codes,
10846
+ which is a common data quality requirement for customer contact information.
10847
+
10848
+ ```python
10849
+ customer_data = pl.DataFrame({
10850
+ "customer_id": [1, 2, 3, 4, 5],
10851
+ "name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
10852
+ "phone_number": [
10853
+ "(555) 123-4567", # Valid with area code
10854
+ "555-987-6543", # Valid with area code
10855
+ "123-4567", # Missing area code
10856
+ "(800) 555-1234", # Valid with area code
10857
+ "987-6543" # Missing area code
10858
+ ]
10859
+ })
10860
+
10861
+ validation = (
10862
+ pb.Validate(data=customer_data)
10863
+ .prompt(
10864
+ prompt="Do all the phone numbers include an area code?",
10865
+ columns_subset="phone_number", # Only check the `phone_number` column
10866
+ model="openai:gpt-4o",
10867
+ batch_size=500,
10868
+ max_concurrent=5,
10869
+ thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
10870
+ actions=pb.Actions(error="Too many phone numbers missing area codes.")
10871
+ )
10872
+ .interrogate()
10873
+ )
10874
+ ```
10875
+
10876
+ This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
10877
+ which exceeds all threshold levels. The validation will trigger the specified error action
10878
+ since the failure rate (40%) is above the error threshold (20%). The AI can recognize
10879
+ various phone number formats and determine whether they include area codes.
10880
+ """
10881
+
10882
+ assertion_type = _get_fn_name()
10883
+
10884
+ # Validation of inputs
10885
+ if not isinstance(prompt, str) or not prompt.strip():
10886
+ raise ValueError("prompt must be a non-empty string")
10887
+
10888
+ # Parse the provider and model name from the `model=` argument
10889
+ try:
10890
+ provider, model_name = model.split(sep=":", maxsplit=1)
10891
+ except ValueError:
10892
+ raise ValueError(f"Model must be in format 'provider:model_name', got: {model}")
10893
+
10894
+ # Error if an unsupported provider is used
10895
+ if provider not in MODEL_PROVIDERS:
10896
+ raise ValueError(
10897
+ f"Unsupported provider: {provider}. Supported providers are {MODEL_PROVIDERS}."
10898
+ )
10899
+
10900
+ # Ensure that `batch_size` and `max_concurrent` are positive integers
10901
+ if not isinstance(batch_size, int) or batch_size < 1:
10902
+ raise ValueError("batch_size must be a positive integer")
10903
+ if not isinstance(max_concurrent, int) or max_concurrent < 1:
10904
+ raise ValueError("max_concurrent must be a positive integer")
10905
+
10906
+ _check_pre(pre=pre)
10907
+ _check_thresholds(thresholds=thresholds)
10908
+ _check_boolean_input(param=active, param_name="active")
10909
+
10910
+ # Promote a single column given as a string to a list
10911
+ if columns_subset is not None and isinstance(columns_subset, str):
10912
+ columns_subset = [columns_subset]
10913
+
10914
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10915
+ thresholds = (
10916
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10917
+ )
10918
+
10919
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
10920
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
10921
+
10922
+ # Package up the AI-specific parameters as a dictionary for later use
10923
+ ai_config = {
10924
+ "prompt": prompt,
10925
+ "llm_provider": provider,
10926
+ "llm_model": model_name,
10927
+ "batch_size": batch_size,
10928
+ "max_concurrent": max_concurrent,
10929
+ }
10930
+
10931
+ val_info = _ValidationInfo(
10932
+ assertion_type=assertion_type,
10933
+ column=columns_subset,
10934
+ values=ai_config,
10935
+ pre=pre,
10936
+ segments=segments,
10937
+ thresholds=thresholds,
10938
+ actions=actions,
10939
+ brief=brief,
10940
+ active=active,
10941
+ )
10942
+
10943
+ self._add_validation(validation_info=val_info)
10944
+
10945
+ return self
10946
+
10947
+ def col_schema_match(
10948
+ self,
10949
+ schema: Schema,
10950
+ complete: bool = True,
10951
+ in_order: bool = True,
10952
+ case_sensitive_colnames: bool = True,
10953
+ case_sensitive_dtypes: bool = True,
10954
+ full_match_dtypes: bool = True,
10808
10955
  pre: Callable | None = None,
10809
10956
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
10810
10957
  actions: Actions | None = None,
@@ -10812,33 +10959,40 @@ class Validate:
10812
10959
  active: bool = True,
10813
10960
  ) -> Validate:
10814
10961
  """
10815
- Validate whether the row count of the table matches a specified count.
10816
-
10817
- The `row_count_match()` method checks whether the row count of the target table matches a
10818
- specified count. This validation will operate over a single test unit, which is whether the
10819
- row count matches the specified count.
10962
+ Do columns in the table (and their types) match a predefined schema?
10820
10963
 
10821
- We also have the option to invert the validation step by setting `inverse=True`. This will
10822
- make the expectation that the row count of the target table *does not* match the specified
10823
- count.
10964
+ The `col_schema_match()` method works in conjunction with an object generated by the
10965
+ [`Schema`](`pointblank.Schema`) class. That class object is the expectation for the actual
10966
+ schema of the target table. The validation step operates over a single test unit, which is
10967
+ whether the schema matches that of the table (within the constraints enforced by the
10968
+ `complete=` and `in_order=` options).
10824
10969
 
10825
10970
  Parameters
10826
10971
  ----------
10827
- count
10828
- The expected row count of the table. This can be an integer value, a Polars or Pandas
10829
- DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row
10830
- count of that object will be used as the expected count.
10831
- tol
10832
- The tolerance allowable for the row count match. This can be specified as a single
10833
- numeric value (integer or float) or as a tuple of two integers representing the lower
10834
- and upper bounds of the tolerance range. If a single integer value (greater than 1) is
10835
- provided, it represents the absolute bounds of the tolerance, ie. plus or minus the value.
10836
- If a float value (between 0-1) is provided, it represents the relative tolerance, ie.
10837
- plus or minus the relative percentage of the target. If a tuple is provided, it represents
10838
- the lower and upper absolute bounds of the tolerance range. See the examples for more.
10839
- inverse
10840
- Should the validation step be inverted? If `True`, then the expectation is that the row
10841
- count of the target table should not match the specified `count=` value.
10972
+ schema
10973
+ A `Schema` object that represents the expected schema of the table. This object is
10974
+ generated by the [`Schema`](`pointblank.Schema`) class.
10975
+ complete
10976
+ Should the schema match be complete? If `True`, then the target table must have all
10977
+ columns specified in the schema. If `False`, then the table can have additional columns
10978
+ not in the schema (i.e., the schema is a subset of the target table's columns).
10979
+ in_order
10980
+ Should the schema match be in order? If `True`, then the columns in the schema must
10981
+ appear in the same order as they do in the target table. If `False`, then the order of
10982
+ columns in the schema and the target table can differ.
10983
+ case_sensitive_colnames
10984
+ Should the schema match be case-sensitive with regard to column names? If `True`, then
10985
+ the column names in the schema and the target table must match exactly. If `False`, then
10986
+ the column names are compared in a case-insensitive manner.
10987
+ case_sensitive_dtypes
10988
+ Should the schema match be case-sensitive with regard to column data types? If `True`,
10989
+ then the column data types in the schema and the target table must match exactly. If
10990
+ `False`, then the column data types are compared in a case-insensitive manner.
10991
+ full_match_dtypes
10992
+ Should the schema match require a full match of data types? If `True`, then the column
10993
+ data types in the schema and the target table must match exactly. If `False` then
10994
+ substring matches are allowed, so a schema data type of `Int` would match a target table
10995
+ data type of `Int64`.
10842
10996
  pre
10843
10997
  An optional preprocessing function or lambda to apply to the data table during
10844
10998
  interrogation. This function should take a table as input and return a modified table.
@@ -10878,10 +11032,9 @@ class Validate:
10878
11032
  before the validation step is applied.
10879
11033
 
10880
11034
  The preprocessing function can be any callable that takes a table as input and returns a
10881
- modified table. For example, you could use a lambda function to filter the table based on
10882
- certain criteria or to apply a transformation to the data. Regarding the lifetime of the
10883
- transformed table, it only exists during the validation step and is not stored in the
10884
- `Validate` object or used in subsequent validation steps.
11035
+ modified table. Regarding the lifetime of the transformed table, it only exists during the
11036
+ validation step and is not stored in the `Validate` object or used in subsequent validation
11037
+ steps.
10885
11038
 
10886
11039
  Thresholds
10887
11040
  ----------
@@ -10917,18 +11070,232 @@ class Validate:
10917
11070
  #| echo: false
10918
11071
  #| output: false
10919
11072
  import pointblank as pb
10920
- pb.config(report_incl_header=False, report_incl_footer=False)
11073
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
10921
11074
  ```
10922
11075
 
10923
- For the examples here, we'll use the built in dataset `"small_table"`. The table can be
10924
- obtained by calling `load_dataset("small_table")`.
11076
+ For the examples here, we'll use a simple Polars DataFrame with three columns (string,
11077
+ integer, and float). The table is shown below:
10925
11078
 
10926
11079
  ```{python}
10927
11080
  import pointblank as pb
11081
+ import polars as pl
10928
11082
 
10929
- small_table = pb.load_dataset("small_table")
11083
+ tbl = pl.DataFrame(
11084
+ {
11085
+ "a": ["apple", "banana", "cherry", "date"],
11086
+ "b": [1, 6, 3, 5],
11087
+ "c": [1.1, 2.2, 3.3, 4.4],
11088
+ }
11089
+ )
10930
11090
 
10931
- pb.preview(small_table)
11091
+ pb.preview(tbl)
11092
+ ```
11093
+
11094
+ Let's validate that the columns in the table match a predefined schema. A schema can be
11095
+ defined using the [`Schema`](`pointblank.Schema`) class.
11096
+
11097
+ ```{python}
11098
+ schema = pb.Schema(
11099
+ columns=[("a", "String"), ("b", "Int64"), ("c", "Float64")]
11100
+ )
11101
+ ```
11102
+
11103
+ You can print the schema object to verify that the expected schema is as intended.
11104
+
11105
+ ```{python}
11106
+ print(schema)
11107
+ ```
11108
+
11109
+ Now, we'll use the `col_schema_match()` method to validate the table against the expected
11110
+ `schema` object. There is a single test unit for this validation step (whether the schema
11111
+ matches the table or not).
11112
+
11113
+ ```{python}
11114
+ validation = (
11115
+ pb.Validate(data=tbl)
11116
+ .col_schema_match(schema=schema)
11117
+ .interrogate()
11118
+ )
11119
+
11120
+ validation
11121
+ ```
11122
+
11123
+ The validation table shows that the schema matches the table. The single test unit passed
11124
+ since the table columns and their types match the schema.
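+
+ If extra columns or a different column order should be tolerated, the match can be relaxed
+ with the `complete=` and `in_order=` options; a brief sketch (not executed here):
+
+ ```python
+ validation = (
+     pb.Validate(data=tbl)
+     .col_schema_match(schema=schema, complete=False, in_order=False)
+     .interrogate()
+ )
+ ```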
11125
+ """
11126
+
11127
+ assertion_type = _get_fn_name()
11128
+
11129
+ _check_pre(pre=pre)
11130
+ _check_thresholds(thresholds=thresholds)
11131
+ _check_boolean_input(param=active, param_name="active")
11132
+ _check_boolean_input(param=complete, param_name="complete")
11133
+ _check_boolean_input(param=in_order, param_name="in_order")
11134
+ _check_boolean_input(param=case_sensitive_colnames, param_name="case_sensitive_colnames")
11135
+ _check_boolean_input(param=case_sensitive_dtypes, param_name="case_sensitive_dtypes")
11136
+ _check_boolean_input(param=full_match_dtypes, param_name="full_match_dtypes")
11137
+
11138
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
11139
+ thresholds = (
11140
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
11141
+ )
11142
+
11143
+ # Package up the `schema=` and boolean params into a dictionary for later interrogation
11144
+ values = {
11145
+ "schema": schema,
11146
+ "complete": complete,
11147
+ "in_order": in_order,
11148
+ "case_sensitive_colnames": case_sensitive_colnames,
11149
+ "case_sensitive_dtypes": case_sensitive_dtypes,
11150
+ "full_match_dtypes": full_match_dtypes,
11151
+ }
11152
+
11153
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
11154
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
11155
+
11156
+ val_info = _ValidationInfo(
11157
+ assertion_type=assertion_type,
11158
+ values=values,
11159
+ pre=pre,
11160
+ thresholds=thresholds,
11161
+ actions=actions,
11162
+ brief=brief,
11163
+ active=active,
11164
+ )
11165
+
11166
+ self._add_validation(validation_info=val_info)
11167
+
11168
+ return self
11169
+
11170
+ def row_count_match(
11171
+ self,
11172
+ count: int | FrameT | Any,
11173
+ tol: Tolerance = 0,
11174
+ inverse: bool = False,
11175
+ pre: Callable | None = None,
11176
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
11177
+ actions: Actions | None = None,
11178
+ brief: str | bool | None = None,
11179
+ active: bool = True,
11180
+ ) -> Validate:
11181
+ """
11182
+ Validate whether the row count of the table matches a specified count.
11183
+
11184
+ The `row_count_match()` method checks whether the row count of the target table matches a
11185
+ specified count. This validation will operate over a single test unit, which is whether the
11186
+ row count matches the specified count.
11187
+
11188
+ We also have the option to invert the validation step by setting `inverse=True`. This will
11189
+ make the expectation that the row count of the target table *does not* match the specified
11190
+ count.
11191
+
11192
+ Parameters
11193
+ ----------
11194
+ count
11195
+ The expected row count of the table. This can be an integer value, a Polars or Pandas
11196
+ DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row
11197
+ count of that object will be used as the expected count.
11198
+ tol
11199
+ The tolerance allowable for the row count match. This can be specified as a single
11200
+ numeric value (integer or float) or as a tuple of two integers representing the lower
11201
+ and upper bounds of the tolerance range. If a single integer value (greater than 1) is
11202
+ provided, it represents the absolute bounds of the tolerance, ie. plus or minus the value.
11203
+ If a float value (between 0-1) is provided, it represents the relative tolerance, ie.
11204
+ plus or minus the relative percentage of the target. If a tuple is provided, it represents
11205
+ the lower and upper absolute bounds of the tolerance range. See the examples for more.
11206
+ inverse
11207
+ Should the validation step be inverted? If `True`, then the expectation is that the row
11208
+ count of the target table should not match the specified `count=` value.
11209
+ pre
11210
+ An optional preprocessing function or lambda to apply to the data table during
11211
+ interrogation. This function should take a table as input and return a modified table.
11212
+ Have a look at the *Preprocessing* section for more information on how to use this
11213
+ argument.
11214
+ thresholds
11215
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
11216
+ The thresholds are set at the step level and will override any global thresholds set in
11217
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
11218
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
11219
+ section for information on how to set threshold levels.
11220
+ actions
11221
+ Optional actions to take when the validation step meets or exceeds any set threshold
11222
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
11223
+ define the actions.
11224
+ brief
11225
+ An optional brief description of the validation step that will be displayed in the
11226
+ reporting table. You can use the templating elements like `"{step}"` to insert
11227
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
11228
+ the entire brief will be automatically generated. If `None` (the default) then there
11229
+ won't be a brief.
11230
+ active
11231
+ A boolean value indicating whether the validation step should be active. Using `False`
11232
+ will make the validation step inactive (still reporting its presence and keeping indexes
11233
+ for the steps unchanged).
11234
+
11235
+ Returns
11236
+ -------
11237
+ Validate
11238
+ The `Validate` object with the added validation step.
11239
+
11240
+ Preprocessing
11241
+ -------------
11242
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
11243
+ table during interrogation. This function should take a table as input and return a modified
11244
+ table. This is useful for performing any necessary transformations or filtering on the data
11245
+ before the validation step is applied.
11246
+
11247
+ The preprocessing function can be any callable that takes a table as input and returns a
11248
+ modified table. For example, you could use a lambda function to filter the table based on
11249
+ certain criteria or to apply a transformation to the data. Regarding the lifetime of the
11250
+ transformed table, it only exists during the validation step and is not stored in the
11251
+ `Validate` object or used in subsequent validation steps.
11252
+
11253
+ Thresholds
11254
+ ----------
11255
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
11256
+ step. If they are set here at the step level, these thresholds will override any thresholds
11257
+ set at the global level in `Validate(thresholds=...)`.
11258
+
11259
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
11260
+ can either be set as a proportion of all test units failing (a value between `0` and `1`),
11261
+ or as the absolute number of failing test units (an integer that's `1` or greater).
11262
+
11263
+ Thresholds can be defined using one of these input schemes:
11264
+
11265
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
11266
+ thresholds)
11267
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
11268
+ the 'error' level, and position `2` is the 'critical' level
11269
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
11270
+ 'critical'
11271
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
11272
+ for the 'warning' level only
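+
+ A quick sketch of these four input schemes side by side (the values shown are only
+ placeholders):
+
+ ```python
+ import pointblank as pb
+
+ # Any of these forms can be passed to `thresholds=`
+ pb.Thresholds(warning=0.05, error=0.10, critical=0.15)  # 1. Thresholds class
+ (0.05, 0.10, 0.15)                                       # 2. tuple: warning, error, critical
+ {"warning": 0.05, "critical": 0.15}                      # 3. dict with any of the three keys
+ 0.05                                                      # 4. single value: 'warning' only
+ ```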
11273
+
11274
+ If the number of failing test units exceeds set thresholds, the validation step will be
11275
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
11276
+ set; you're free to set any combination of them.
11277
+
11278
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
11279
+ take for each level of failure (using the `actions=` parameter).
11280
+
11281
+ Examples
11282
+ --------
11283
+ ```{python}
11284
+ #| echo: false
11285
+ #| output: false
11286
+ import pointblank as pb
11287
+ pb.config(report_incl_header=False, report_incl_footer=False)
11288
+ ```
11289
+
11290
+ For the examples here, we'll use the built-in dataset `"small_table"`. The table can be
11291
+ obtained by calling `load_dataset("small_table")`.
11292
+
11293
+ ```{python}
11294
+ import pointblank as pb
11295
+
11296
+ small_table = pb.load_dataset("small_table")
11297
+
11298
+ pb.preview(small_table)
10932
11299
  ```
10933
11300
 
10934
11301
  Let's validate that the number of rows in the table matches a fixed value. In this case, we
@@ -12227,12 +12594,19 @@ class Validate:
12227
12594
  # Generate the autobrief description for the validation step; it's important to perform
12228
12595
  # that here since text components like the column and the value(s) have been resolved
12229
12596
  # at this point
12597
+ # Get row count for col_pct_null to properly calculate absolute tolerance percentages
12598
+ n_rows = None
12599
+ if assertion_type == "col_pct_null":
12600
+ n_rows = get_row_count(data_tbl)
12601
+
12230
12602
  autobrief = _create_autobrief_or_failure_text(
12231
12603
  assertion_type=assertion_type,
12232
12604
  lang=self.lang,
12233
12605
  column=column,
12234
12606
  values=value,
12235
12607
  for_failure=False,
12608
+ locale=self.locale,
12609
+ n_rows=n_rows,
12236
12610
  )
12237
12611
 
12238
12612
  validation.autobrief = autobrief
@@ -12260,6 +12634,12 @@ class Validate:
12260
12634
  # This prevents modifications from one validation step affecting others
12261
12635
  data_tbl_step = _copy_dataframe(data_tbl)
12262
12636
 
12637
+ # Capture original table dimensions and columns before preprocessing
12638
+ # (only if preprocessing is present - we'll set these inside the preprocessing block)
12639
+ original_rows = None
12640
+ original_cols = None
12641
+ original_column_names = None
12642
+
12263
12643
  # ------------------------------------------------
12264
12644
  # Preprocessing stage
12265
12645
  # ------------------------------------------------
@@ -12267,6 +12647,16 @@ class Validate:
12267
12647
  # Determine whether any preprocessing functions are to be applied to the table
12268
12648
  if validation.pre is not None:
12269
12649
  try:
12650
+ # Capture original table dimensions before preprocessing
12651
+ # Use get_row_count() instead of len() for compatibility with PySpark, etc.
12652
+ original_rows = get_row_count(data_tbl_step)
12653
+ original_cols = get_column_count(data_tbl_step)
12654
+ original_column_names = set(
12655
+ data_tbl_step.columns
12656
+ if hasattr(data_tbl_step, "columns")
12657
+ else list(data_tbl_step.columns)
12658
+ )
12659
+
12270
12660
  # Read the text of the preprocessing function
12271
12661
  pre_text = _pre_processing_funcs_to_str(validation.pre)
12272
12662
 
@@ -12299,6 +12689,62 @@ class Validate:
12299
12689
  elif isinstance(validation.pre, Callable):
12300
12690
  data_tbl_step = validation.pre(data_tbl_step)
12301
12691
 
12692
+ # After successful preprocessing, check dimensions and create notes
12693
+ # Use get_row_count() and get_column_count() for compatibility
12694
+ processed_rows = get_row_count(data_tbl_step)
12695
+ processed_cols = get_column_count(data_tbl_step)
12696
+
12697
+ # Always add a note when preprocessing is applied
12698
+ if original_rows != processed_rows or original_cols != processed_cols:
12699
+ # Dimensions changed - show the change
12700
+ note_html = _create_preprocessing_note_html(
12701
+ original_rows=original_rows,
12702
+ original_cols=original_cols,
12703
+ processed_rows=processed_rows,
12704
+ processed_cols=processed_cols,
12705
+ locale=self.locale,
12706
+ )
12707
+ note_text = _create_preprocessing_note_text(
12708
+ original_rows=original_rows,
12709
+ original_cols=original_cols,
12710
+ processed_rows=processed_rows,
12711
+ processed_cols=processed_cols,
12712
+ )
12713
+ else:
12714
+ # No dimension change - just indicate preprocessing was applied
12715
+ note_html = _create_preprocessing_no_change_note_html(locale=self.locale)
12716
+ note_text = _create_preprocessing_no_change_note_text()
12717
+
12718
+ validation._add_note(
12719
+ key="pre_applied",
12720
+ markdown=note_html,
12721
+ text=note_text,
12722
+ )
12723
+
12724
+ # Check if target column is synthetic (exists in processed but not original)
12725
+ # Only check for single column names (not lists used in rows_distinct, etc.)
12726
+ if column is not None and isinstance(column, str):
12727
+ processed_column_names = set(
12728
+ data_tbl_step.columns
12729
+ if hasattr(data_tbl_step, "columns")
12730
+ else list(data_tbl_step.columns)
12731
+ )
12732
+
12733
+ # Check if the target column is in the processed table but not in original
12734
+ if column in processed_column_names and column not in original_column_names:
12735
+ note_html = _create_synthetic_target_column_note_html(
12736
+ column_name=column,
12737
+ locale=self.locale,
12738
+ )
12739
+ note_text = _create_synthetic_target_column_note_text(
12740
+ column_name=column,
12741
+ )
12742
+ validation._add_note(
12743
+ key="syn_target_col",
12744
+ markdown=note_html,
12745
+ text=note_text,
12746
+ )
12747
+
12302
12748
  except Exception:
12303
12749
  # If preprocessing fails, mark the validation as having an eval_error
12304
12750
  validation.eval_error = True
@@ -12488,6 +12934,21 @@ class Validate:
12488
12934
  tbl=tbl, column=column, values=value, na_pass=na_pass
12489
12935
  )
12490
12936
 
12937
+ elif assertion_type == "col_pct_null":
12938
+ result_bool = col_pct_null(
12939
+ data_tbl=data_tbl_step,
12940
+ column=column,
12941
+ p=value["p"],
12942
+ bound_finder=value["bound_finder"],
12943
+ )
12944
+
12945
+ validation.all_passed = result_bool
12946
+ validation.n = 1
12947
+ validation.n_passed = int(result_bool)
12948
+ validation.n_failed = 1 - int(result_bool)
12949
+
12950
+ results_tbl = None
12951
+
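Editor's note: for `col_pct_null` the `value` is a dict carrying the target proportion `p` and a `bound_finder` callable; because that callable is built with `functools.partial`, the tolerance can later be read back from `.keywords` (the report formatter further down does exactly that). A minimal sketch of the mechanism, with a hypothetical stand-in for the real bound finder:

from functools import partial

def _bounds_for(p, tol=0):
    # Hypothetical stand-in; only the partial/keywords mechanics matter here.
    return (max(p - tol, 0.0), min(p + tol, 1.0))

value = {"p": 0.1, "bound_finder": partial(_bounds_for, tol=0.02)}

p = value["p"]                                      # 0.1
tol = value["bound_finder"].keywords.get("tol", 0)  # 0.02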
12491
12952
  elif assertion_type == "col_vals_expr":
12492
12953
  results_tbl = col_vals_expr(
12493
12954
  data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
@@ -12547,10 +13008,21 @@ class Validate:
12547
13008
  # Add the schema validation info to the validation object
12548
13009
  validation.val_info = schema_validation_info
12549
13010
 
13011
+ # Add a note with the schema expectation and results
13012
+ schema_note_html = _create_col_schema_match_note_html(
13013
+ schema_info=schema_validation_info, locale=self.locale
13014
+ )
13015
+ schema_note_text = _create_col_schema_match_note_text(
13016
+ schema_info=schema_validation_info
13017
+ )
13018
+ validation._add_note(
13019
+ key="schema_check", markdown=schema_note_html, text=schema_note_text
13020
+ )
13021
+
12550
13022
  validation.all_passed = result_bool
12551
13023
  validation.n = 1
12552
13024
  validation.n_passed = int(result_bool)
12553
- validation.n_failed = 1 - result_bool
13025
+ validation.n_failed = 1 - int(result_bool)
12554
13026
 
12555
13027
  results_tbl = None
12556
13028
 
@@ -12565,7 +13037,7 @@ class Validate:
12565
13037
  validation.all_passed = result_bool
12566
13038
  validation.n = 1
12567
13039
  validation.n_passed = int(result_bool)
12568
- validation.n_failed = 1 - result_bool
13040
+ validation.n_failed = 1 - int(result_bool)
12569
13041
 
12570
13042
  results_tbl = None
12571
13043
 
@@ -12577,7 +13049,7 @@ class Validate:
12577
13049
  validation.all_passed = result_bool
12578
13050
  validation.n = 1
12579
13051
  validation.n_passed = int(result_bool)
12580
- validation.n_failed = 1 - result_bool
13052
+ validation.n_failed = 1 - int(result_bool)
12581
13053
 
12582
13054
  results_tbl = None
12583
13055
 
@@ -12596,7 +13068,7 @@ class Validate:
12596
13068
  validation.all_passed = result_bool
12597
13069
  validation.n = 1
12598
13070
  validation.n_passed = int(result_bool)
12599
- validation.n_failed = 1 - result_bool
13071
+ validation.n_failed = 1 - int(result_bool)
12600
13072
 
12601
13073
  results_tbl = None
12602
13074
 
@@ -12614,8 +13086,9 @@ class Validate:
12614
13086
  ) # pragma: no cover
12615
13087
 
12616
13088
  except Exception as e:
12617
- # Only catch specific data quality comparison errors, not programming errors
13089
+ # Catch data quality errors and column not found errors
12618
13090
  error_msg = str(e).lower()
13091
+
12619
13092
  is_comparison_error = (
12620
13093
  "boolean value of na is ambiguous" in error_msg
12621
13094
  or "cannot compare" in error_msg
@@ -12626,20 +13099,101 @@ class Validate:
12626
13099
  or ("dtype" in error_msg and "compare" in error_msg)
12627
13100
  )
12628
13101
 
12629
- if is_comparison_error: # pragma: no cover
12630
- # If data quality comparison fails, mark the validation as having an eval_error
13102
+ is_column_not_found = "column" in error_msg and "not found" in error_msg
13103
+
13104
+ is_comparison_column_not_found = (
13105
+ "unable to find column" in error_msg and "valid columns" in error_msg
13106
+ )
13107
+
13108
+ if (
13109
+ is_comparison_error or is_column_not_found or is_comparison_column_not_found
13110
+ ): # pragma: no cover
13111
+ # If data quality comparison fails or column not found, mark as eval_error
12631
13112
  validation.eval_error = True # pragma: no cover
13113
+
13114
+ # Add a note for column not found errors (target column)
13115
+ if is_column_not_found:
13116
+ note_html = _create_column_not_found_note_html(
13117
+ column_name=column,
13118
+ available_columns=list(data_tbl_step.columns)
13119
+ if hasattr(data_tbl_step, "columns")
13120
+ else [],
13121
+ locale=self.locale,
13122
+ )
13123
+ note_text = _create_column_not_found_note_text(
13124
+ column_name=column,
13125
+ available_columns=list(data_tbl_step.columns)
13126
+ if hasattr(data_tbl_step, "columns")
13127
+ else [],
13128
+ )
13129
+ validation._add_note(
13130
+ key="column_not_found",
13131
+ markdown=note_html,
13132
+ text=note_text,
13133
+ )
13134
+
13135
+ # Add a note for comparison column not found errors
13136
+ elif is_comparison_column_not_found:
13137
+ # Extract column name from error message
13138
+ # Error format: 'unable to find column "col_name"; valid columns: ...'
13139
+ match = re.search(r'unable to find column "([^"]+)"', str(e))
13140
+
13141
+ if match:
13142
+ missing_col_name = match.group(1)
13143
+
13144
+ # Determine position for between/outside validations
13145
+ position = None
13146
+ if assertion_type in ["col_vals_between", "col_vals_outside"]:
13147
+ # Check if missing column is in left or right position
13148
+ from pointblank.column import Column
13149
+
13150
+ if (
13151
+ isinstance(value[0], Column)
13152
+ and value[0].exprs == missing_col_name
13153
+ ):
13154
+ position = "left"
13155
+ elif (
13156
+ isinstance(value[1], Column)
13157
+ and value[1].exprs == missing_col_name
13158
+ ):
13159
+ position = "right"
13160
+
13161
+ note_html = _create_comparison_column_not_found_note_html(
13162
+ column_name=missing_col_name,
13163
+ position=position,
13164
+ available_columns=list(data_tbl_step.columns)
13165
+ if hasattr(data_tbl_step, "columns")
13166
+ else [],
13167
+ locale=self.locale,
13168
+ )
13169
+ note_text = _create_comparison_column_not_found_note_text(
13170
+ column_name=missing_col_name,
13171
+ position=position,
13172
+ available_columns=list(data_tbl_step.columns)
13173
+ if hasattr(data_tbl_step, "columns")
13174
+ else [],
13175
+ )
13176
+ validation._add_note(
13177
+ key="comparison_column_not_found",
13178
+ markdown=note_html,
13179
+ text=note_text,
13180
+ )
13181
+
12632
13182
  end_time = datetime.datetime.now(datetime.timezone.utc) # pragma: no cover
13183
+
12633
13184
  validation.proc_duration_s = (
12634
13185
  end_time - start_time
12635
13186
  ).total_seconds() # pragma: no cover
13187
+
12636
13188
  validation.time_processed = end_time.isoformat(
12637
13189
  timespec="milliseconds"
12638
13190
  ) # pragma: no cover
13191
+
12639
13192
  validation.active = False # pragma: no cover
13193
+
12640
13194
  continue # pragma: no cover
12641
13195
  else:
12642
- # For other errors (like missing columns), let them propagate
13196
+ # For other unexpected errors, let them propagate
12643
13197
  raise
12644
13198
 
12645
13199
  else:
@@ -12722,6 +13276,34 @@ class Validate:
12722
13276
  ),
12723
13277
  )
12724
13278
 
13279
+ # Add note for local thresholds (if they differ from global thresholds)
13280
+ if threshold != self.thresholds:
13281
+ if threshold != Thresholds():
13282
+ # Local thresholds are set - generate threshold note
13283
+ threshold_note_html = _create_local_threshold_note_html(
13284
+ thresholds=threshold, locale=self.locale
13285
+ )
13286
+ threshold_note_text = _create_local_threshold_note_text(thresholds=threshold)
13287
+
13288
+ # Add the note to the validation step
13289
+ validation._add_note(
13290
+ key="local_thresholds",
13291
+ markdown=threshold_note_html,
13292
+ text=threshold_note_text,
13293
+ )
13294
+
13295
+ elif self.thresholds != Thresholds():
13296
+ # Thresholds explicitly reset to empty when global thresholds exist
13297
+ reset_note_html = _create_threshold_reset_note_html(locale=self.locale)
13298
+ reset_note_text = _create_threshold_reset_note_text()
13299
+
13300
+ # Add the note to the validation step
13301
+ validation._add_note(
13302
+ key="local_threshold_reset",
13303
+ markdown=reset_note_html,
13304
+ text=reset_note_text,
13305
+ )
13306
+
12725
13307
  # If there is any threshold level that has been exceeded, then produce and
12726
13308
  # set the general failure text for the validation step
12727
13309
  if validation.warning or validation.error or validation.critical:
@@ -12732,6 +13314,8 @@ class Validate:
12732
13314
  column=column,
12733
13315
  values=value,
12734
13316
  for_failure=True,
13317
+ locale=self.locale,
13318
+ n_rows=n_rows,
12735
13319
  )
12736
13320
 
12737
13321
  # Set the failure text in the validation step
@@ -14217,11 +14801,15 @@ class Validate:
14217
14801
  - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
14218
14802
  - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
14219
14803
  - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
14804
+ - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
14805
+ - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
14220
14806
  - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
14221
14807
  - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
14222
14808
  - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
14809
+ - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
14223
14810
  - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
14224
14811
  - [`conjointly()`](`pointblank.Validate.conjointly`)
14812
+ - [`prompt()`](`pointblank.Validate.prompt`)
14225
14813
 
14226
14814
  An extracted row for these validation methods means that a test unit failed for that row in
14227
14815
  the validation step.
@@ -14806,7 +15394,12 @@ class Validate:
14806
15394
  return None
14807
15395
 
14808
15396
  def get_tabular_report(
14809
- self, title: str | None = ":default:", incl_header: bool = None, incl_footer: bool = None
15397
+ self,
15398
+ title: str | None = ":default:",
15399
+ incl_header: bool = None,
15400
+ incl_footer: bool = None,
15401
+ incl_footer_timings: bool = None,
15402
+ incl_footer_notes: bool = None,
14810
15403
  ) -> GT:
14811
15404
  """
14812
15405
  Validation report as a GT table.
@@ -14829,6 +15422,20 @@ class Validate:
14829
15422
  name of the table as the title for the report. If no title is wanted, then `":none:"`
14830
15423
  can be used. Aside from keyword options, text can be provided for the title. This will
14831
15424
  be interpreted as Markdown text and transformed internally to HTML.
15425
+ incl_header
15426
+ Controls whether the header section should be displayed. If `None`, uses the global
15427
+ configuration setting. The header contains the table name, label, and threshold
15428
+ information.
15429
+ incl_footer
15430
+ Controls whether the footer section should be displayed. If `None`, uses the global
15431
+ configuration setting. The footer can contain validation timing information and notes.
15432
+ incl_footer_timings
15433
+ Controls whether validation timing information (start time, duration, end time) should
15434
+ be displayed in the footer. If `None`, uses the global configuration setting. Only
15435
+ applies when `incl_footer=True`.
15436
+ incl_footer_notes
15437
+ Controls whether notes from validation steps should be displayed in the footer. If
15438
+ `None`, uses the global configuration setting. Only applies when `incl_footer=True`.
14832
15439
 
14833
15440
  Returns
14834
15441
  -------
@@ -14888,6 +15495,10 @@ class Validate:
14888
15495
  incl_header = global_config.report_incl_header
14889
15496
  if incl_footer is None:
14890
15497
  incl_footer = global_config.report_incl_footer
15498
+ if incl_footer_timings is None:
15499
+ incl_footer_timings = global_config.report_incl_footer_timings
15500
+ if incl_footer_notes is None:
15501
+ incl_footer_notes = global_config.report_incl_footer_notes
14891
15502
 
14892
15503
  # Do we have a DataFrame library to work with?
14893
15504
  _check_any_df_lib(method_used="get_tabular_report")
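Editor's note: with the two new arguments, the footer's timing line and its notes section can be switched off independently; leaving them at `None` falls back to the global configuration values, as the hunk above shows. An illustrative call on an interrogated `Validate` object:

report = validation.get_tabular_report(
    incl_footer=True,
    incl_footer_timings=False,  # hide the start/duration/end line
    incl_footer_notes=True,     # keep the per-step notes
)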
@@ -15126,30 +15737,53 @@ class Validate:
15126
15737
  columns_upd = []
15127
15738
 
15128
15739
  columns = validation_info_dict["column"]
15740
+ notes = validation_info_dict["notes"]
15129
15741
 
15130
15742
  assertion_type = validation_info_dict["assertion_type"]
15131
15743
 
15132
15744
  # Iterate over the values in the `column` entry
15133
15745
  for i, column in enumerate(columns):
15746
+ # Check if this validation has a synthetic target column note
15747
+ has_synthetic_column = (
15748
+ notes[i] is not None and isinstance(notes[i], dict) and "syn_target_col" in notes[i]
15749
+ )
15750
+
15751
+ column_text = None
15752
+
15134
15753
  if assertion_type[i] in [
15135
15754
  "col_schema_match",
15136
15755
  "row_count_match",
15137
15756
  "col_count_match",
15138
15757
  "col_vals_expr",
15139
15758
  ]:
15140
- columns_upd.append("&mdash;")
15759
+ column_text = "&mdash;"
15141
15760
  elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
15142
15761
  if not column:
15143
15762
  # If there is no column subset, then all columns are used
15144
- columns_upd.append("ALL COLUMNS")
15763
+ column_text = "ALL COLUMNS"
15145
15764
  else:
15146
15765
  # With a column subset list, format with commas between the column names
15147
- columns_upd.append(", ".join(column))
15148
-
15766
+ column_text = ", ".join(column)
15149
15767
  elif assertion_type[i] in ["conjointly", "specially"]:
15150
- columns_upd.append("")
15768
+ column_text = ""
15151
15769
  else:
15152
- columns_upd.append(str(column))
15770
+ column_text = str(column)
15771
+
15772
+ # Apply underline styling for synthetic columns (using the purple color from the icon)
15773
+ # Only apply styling if column_text is not empty and not a special marker
15774
+ if (
15775
+ has_synthetic_column
15776
+ and column_text
15777
+ and column_text not in ["&mdash;", "ALL COLUMNS", ""]
15778
+ ):
15779
+ column_text = (
15780
+ f'<span style="text-decoration: underline; '
15781
+ f"text-decoration-color: #9A7CB4; text-decoration-thickness: 1px; "
15782
+ f'text-underline-offset: 3px;">'
15783
+ f"{column_text}</span>"
15784
+ )
15785
+
15786
+ columns_upd.append(column_text)
15153
15787
 
15154
15788
  # Add the `columns_upd` entry to the dictionary
15155
15789
  validation_info_dict["columns_upd"] = columns_upd
@@ -15205,6 +15839,15 @@ class Validate:
15205
15839
  ]:
15206
15840
  values_upd.append("&mdash;")
15207
15841
 
15842
+ elif assertion_type[i] in ["col_pct_null"]:
15843
+ # Extract p and tol from the values dict for nice formatting
15844
+ p_value = value["p"]
15845
+
15846
+ # Extract tol from the bound_finder partial function
15847
+ bound_finder = value.get("bound_finder")
15848
+ tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
15849
+ values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
15850
+
15208
15851
  elif assertion_type[i] in ["col_schema_match"]:
15209
15852
  values_upd.append("SCHEMA")
15210
15853
 
@@ -15680,13 +16323,15 @@ class Validate:
15680
16323
  gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
15681
16324
 
15682
16325
  if incl_footer:
15683
- # Add table time as HTML source note
15684
- gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
16326
+ # Add table time as HTML source note if enabled
16327
+ if incl_footer_timings:
16328
+ gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
15685
16329
 
15686
- # Create notes markdown from validation steps and add as separate source note
15687
- notes_markdown = _create_notes_html(self.validation_info)
15688
- if notes_markdown:
15689
- gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
16330
+ # Create notes markdown from validation steps and add as separate source note if enabled
16331
+ if incl_footer_notes:
16332
+ notes_markdown = _create_notes_html(self.validation_info)
16333
+ if notes_markdown:
16334
+ gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
15690
16335
 
15691
16336
  # If the interrogation has not been performed, then style the table columns dealing with
15692
16337
  # interrogation data as grayed out
@@ -15795,11 +16440,15 @@ class Validate:
15795
16440
  - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
15796
16441
  - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
15797
16442
  - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
16443
+ - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
16444
+ - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
15798
16445
  - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
15799
16446
  - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
15800
16447
  - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
16448
+ - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
15801
16449
  - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
15802
16450
  - [`conjointly()`](`pointblank.Validate.conjointly`)
16451
+ - [`prompt()`](`pointblank.Validate.prompt`)
15803
16452
  - [`rows_complete()`](`pointblank.Validate.rows_complete`)
15804
16453
 
15805
16454
  The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
@@ -16099,6 +16748,12 @@ class Validate:
16099
16748
 
16100
16749
  except Exception: # pragma: no cover
16101
16750
  validation.eval_error = True
16751
+ columns_resolved = []
16752
+ # Store columns list for note generation
16753
+ try:
16754
+ columns = list(table.columns) if "table" in locals() else []
16755
+ except Exception:
16756
+ columns = []
16102
16757
 
16103
16758
  # If no columns were resolved, then create a patched validation step with the
16104
16759
  # `eval_error` and `column` attributes set
@@ -16106,6 +16761,22 @@ class Validate:
16106
16761
  validation.eval_error = True
16107
16762
  validation.column = str(column_expr)
16108
16763
 
16764
+ # Add a helpful note explaining that no columns were resolved
16765
+ note_html = _create_no_columns_resolved_note_html(
16766
+ column_expr=str(column_expr),
16767
+ available_columns=columns,
16768
+ locale=self.locale,
16769
+ )
16770
+ note_text = _create_no_columns_resolved_note_text(
16771
+ column_expr=str(column_expr),
16772
+ available_columns=columns,
16773
+ )
16774
+ validation._add_note(
16775
+ key="no_columns_resolved",
16776
+ markdown=note_html,
16777
+ text=note_text,
16778
+ )
16779
+
16109
16780
  expanded_validation_info.append(validation)
16110
16781
  continue
16111
16782
 
@@ -16664,7 +17335,13 @@ def _process_action_str(
16664
17335
 
16665
17336
 
16666
17337
  def _create_autobrief_or_failure_text(
16667
- assertion_type: str, lang: str, column: str | None, values: str | None, for_failure: bool
17338
+ assertion_type: str,
17339
+ lang: str,
17340
+ column: str | None,
17341
+ values: str | None,
17342
+ for_failure: bool,
17343
+ locale: str | None = None,
17344
+ n_rows: int | None = None,
16668
17345
  ) -> str:
16669
17346
  if assertion_type in [
16670
17347
  "col_vals_gt",
@@ -16788,6 +17465,16 @@ def _create_autobrief_or_failure_text(
16788
17465
  for_failure=for_failure,
16789
17466
  )
16790
17467
 
17468
+ if assertion_type == "col_pct_null":
17469
+ return _create_text_col_pct_null(
17470
+ lang=lang,
17471
+ column=column,
17472
+ value=values,
17473
+ for_failure=for_failure,
17474
+ locale=locale if locale else lang,
17475
+ n_rows=n_rows,
17476
+ )
17477
+
16791
17478
  if assertion_type == "conjointly":
16792
17479
  return _create_text_conjointly(lang=lang, for_failure=for_failure)
16793
17480
 
@@ -17010,6 +17697,115 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
17010
17697
  return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
17011
17698
 
17012
17699
 
17700
+ def _create_text_col_pct_null(
17701
+ lang: str,
17702
+ column: str | None,
17703
+ value: dict,
17704
+ for_failure: bool = False,
17705
+ locale: str | None = None,
17706
+ n_rows: int | None = None,
17707
+ ) -> str:
17708
+ """Create text for col_pct_null validation with tolerance handling."""
17709
+ type_ = _expect_failure_type(for_failure=for_failure)
17710
+
17711
+ column_text = _prep_column_text(column=column)
17712
+
17713
+ # Use locale for number formatting, defaulting to lang if not provided
17714
+ fmt_locale = locale if locale else lang
17715
+
17716
+ # Extract p and tol from the values dict
17717
+ p_value = value.get("p", 0) * 100 # Convert to percentage
17718
+ p_value_original = value.get("p", 0) # Keep original value for deviation format
17719
+
17720
+ # Extract tol from the bound_finder partial function
17721
+ bound_finder = value.get("bound_finder")
17722
+ tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
17723
+
17724
+ # Handle different tolerance types
17725
+ has_tolerance = False
17726
+ is_asymmetric = False
17727
+
17728
+ if isinstance(tol_value, tuple):
17729
+ # Tuple tolerance: can be (lower, upper) in absolute or relative terms
17730
+ tol_lower, tol_upper = tol_value
17731
+
17732
+ # Check if we have any non-zero tolerance
17733
+ has_tolerance = tol_lower != 0 or tol_upper != 0
17734
+ is_asymmetric = tol_lower != tol_upper
17735
+
17736
+ # For relative tolerances (floats < 1), we can compute exact percentage bounds
17737
+ # For absolute tolerances (ints >= 1), calculate based on actual row count if available
17738
+ if tol_lower < 1:
17739
+ # Relative tolerance (float)
17740
+ lower_pct_delta = tol_lower * 100
17741
+ else:
17742
+ # Absolute tolerance (int); uses actual row count if available
17743
+ if n_rows is not None and n_rows > 0:
17744
+ lower_pct_delta = (tol_lower / n_rows) * 100
17745
+ else:
17746
+ lower_pct_delta = tol_lower # Fallback approximation
17747
+
17748
+ if tol_upper < 1:
17749
+ # Relative tolerance (float)
17750
+ upper_pct_delta = tol_upper * 100
17751
+ else:
17752
+ # Absolute tolerance (int); uses actual row count if available
17753
+ if n_rows is not None and n_rows > 0:
17754
+ upper_pct_delta = (tol_upper / n_rows) * 100
17755
+ else:
17756
+ upper_pct_delta = tol_upper # Fallback approximation
17757
+ else:
17758
+ # Single value tolerance: symmetric
17759
+ has_tolerance = tol_value != 0
17760
+
17761
+ if tol_value < 1:
17762
+ # Relative tolerance (float)
17763
+ tol_pct = tol_value * 100
17764
+ else:
17765
+ # Absolute tolerance (int); uses actual row count if available
17766
+ if n_rows is not None and n_rows > 0:
17767
+ tol_pct = (tol_value / n_rows) * 100
17768
+ else:
17769
+ tol_pct = tol_value # Fallback approximation
17770
+
17771
+ lower_pct_delta = tol_pct
17772
+ upper_pct_delta = tol_pct
17773
+
17774
+ # Format numbers with locale-aware formatting
17775
+ p_formatted = _format_number_safe(p_value, decimals=1, locale=fmt_locale)
17776
+ p_original_formatted = _format_number_safe(p_value_original, decimals=2, locale=fmt_locale)
17777
+
17778
+ # Choose the appropriate translation key based on tolerance
17779
+ if not has_tolerance:
17780
+ # No tolerance - use simple text
17781
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text"][lang].format(
17782
+ column_text=column_text,
17783
+ p=p_formatted,
17784
+ )
17785
+ elif is_asymmetric or isinstance(tol_value, tuple):
17786
+ # Use deviation format for tuple tolerances (including symmetric ones)
17787
+ # Format the deviation values with signs (using proper minus sign U+2212)
17788
+ lower_dev = f"−{_format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)}%"
17789
+ upper_dev = f"+{_format_number_safe(upper_pct_delta, decimals=1, locale=fmt_locale)}%"
17790
+
17791
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol_deviation"][lang].format(
17792
+ column_text=column_text,
17793
+ lower_dev=lower_dev,
17794
+ upper_dev=upper_dev,
17795
+ p=p_original_formatted,
17796
+ )
17797
+ else:
17798
+ # Single value tolerance - use the symmetric ± format
17799
+ tol_formatted = _format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)
17800
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol"][lang].format(
17801
+ column_text=column_text,
17802
+ p=p_formatted,
17803
+ tol=tol_formatted,
17804
+ )
17805
+
17806
+ return text
17807
+
17808
+
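Editor's note: the helper above treats tolerances below 1 as relative fractions and tolerances of 1 or more as absolute row counts, converting both to percentage-point deltas around `p`. A quick sketch of that same conversion with illustrative numbers:

n_rows = 200
for tol in (0.05, 10):
    pct_delta = tol * 100 if tol < 1 else (tol / n_rows) * 100
    print(pct_delta)  # 5.0 both times: 0.05 is relative, 10 means 10 rows out of 200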
17013
17809
  def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
17014
17810
  type_ = _expect_failure_type(for_failure=for_failure)
17015
17811
 
@@ -17408,6 +18204,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
17408
18204
 
17409
18205
  def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
17410
18206
  # For each icon, get the assertion icon SVG text from the SVG_ICONS_FOR_ASSERTION_TYPES dictionary
18207
+ # TODO: No point in using `get` if we can't handle missing keys anyway
17411
18208
  icon_svg = [SVG_ICONS_FOR_ASSERTION_TYPES.get(icon) for icon in icon]
17412
18209
 
17413
18210
  # Replace the width and height in the SVG string
@@ -17866,267 +18663,1078 @@ def _create_table_time_html(
17866
18663
  )
17867
18664
 
17868
18665
 
17869
- def _create_notes_html(validation_info: list) -> str:
18666
+ def _create_notes_html(validation_info: list) -> str:
18667
+ """
18668
+ Create markdown text for validation notes/footnotes.
18669
+
18670
+ This function collects notes from all validation steps and formats them as footnotes
18671
+ for display in the report footer. Each note is prefixed with the step number in
18672
+ uppercase small caps bold formatting, and the note content is rendered as markdown.
18673
+
18674
+ Parameters
18675
+ ----------
18676
+ validation_info
18677
+ List of _ValidationInfo objects from which to extract notes.
18678
+
18679
+ Returns
18680
+ -------
18681
+ str
18682
+ Markdown string containing formatted footnotes, or empty string if no notes exist.
18683
+ """
18684
+ # Collect all notes from validation steps
18685
+ all_notes = []
18686
+ for step in validation_info:
18687
+ if step.notes:
18688
+ for key, content in step.notes.items():
18689
+ # Store note with step number for context
18690
+ all_notes.append(
18691
+ {
18692
+ "step": step.i,
18693
+ "key": key,
18694
+ "markdown": content["markdown"],
18695
+ "text": content["text"],
18696
+ }
18697
+ )
18698
+
18699
+ # If no notes, return empty string
18700
+ if not all_notes:
18701
+ return ""
18702
+
18703
+ # Build markdown for notes section
18704
+ # Start with a styled horizontal rule and bold "Notes" header
18705
+ notes_parts = [
18706
+ (
18707
+ "<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
18708
+ "border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
18709
+ ),
18710
+ "<strong>Notes</strong>",
18711
+ "",
18712
+ ]
18713
+
18714
+ previous_step = None
18715
+ for note in all_notes:
18716
+ # Determine if this is the first note for this step
18717
+ is_first_for_step = note["step"] != previous_step
18718
+ previous_step = note["step"]
18719
+
18720
+ # Format step label with HTML for uppercase small caps bold
18721
+ # Use lighter color for subsequent notes of the same step
18722
+ step_color = "#333333" if is_first_for_step else "#999999"
18723
+ step_label = (
18724
+ f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
18725
+ f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
18726
+ )
18727
+
18728
+ # Format note key in monospaced font with smaller size
18729
+ note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"
18730
+
18731
+ # Combine step label, note key, and markdown content
18732
+ note_text = f"{step_label} {note_key} {note['markdown']}"
18733
+ notes_parts.append(note_text)
18734
+ notes_parts.append("") # Add blank line between notes
18735
+
18736
+ # Remove trailing blank line
18737
+ if notes_parts[-1] == "":
18738
+ notes_parts.pop()
18739
+
18740
+ # Join with newlines to create markdown text
18741
+ notes_markdown = "\n".join(notes_parts)
18742
+
18743
+ return notes_markdown
18744
+
18745
+
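Editor's note: each validation step stores its notes as a dict keyed by note name, with "markdown" and "text" renderings (see the `_add_note` calls earlier in this diff); `_create_notes_html()` reads only `step.i` and `step.notes`. A small sketch with a hypothetical stand-in for `_ValidationInfo`:

class _Step:
    # Hypothetical stand-in exposing just the attributes read above.
    def __init__(self, i, notes):
        self.i, self.notes = i, notes

steps = [
    _Step(1, {"pre_applied": {"markdown": "Precondition applied: ...", "text": "..."}}),
    _Step(2, None),  # steps without notes are skipped
]
print(_create_notes_html(steps))  # dotted <hr>, a "Notes" header, then one "Step 1" footnote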
18746
+ def _create_label_html(label: str | None, start_time: str) -> str:
18747
+ if label is None:
18748
+ # Remove the decimal and everything beyond that
18749
+ start_time = str(start_time).split(".")[0]
18750
+
18751
+ # Replace the space character with a pipe character
18752
+ start_time = start_time.replace(" ", "|")
18753
+
18754
+ label = start_time
18755
+
18756
+ return (
18757
+ f"<span style='text-decoration-style: solid; text-decoration-color: #ADD8E6; "
18758
+ f"text-decoration-line: underline; text-underline-position: under; color: #333333; "
18759
+ f"font-variant-numeric: tabular-nums; padding-left: 4px; margin-right: 5px; "
18760
+ f"padding-right: 2px;'>{label}</span>"
18761
+ )
18762
+
18763
+
18764
+ def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) -> str:
18765
+ """Format a single integer using Great Tables GT object to avoid pandas dependency."""
18766
+ if df_lib is None:
18767
+ # Use library detection to select appropriate DataFrame library
18768
+ if _is_lib_present("polars"):
18769
+ import polars as pl
18770
+
18771
+ df_lib = pl
18772
+ elif _is_lib_present("pandas"): # pragma: no cover
18773
+ import pandas as pd # pragma: no cover
18774
+
18775
+ df_lib = pd # pragma: no cover
18776
+ else: # pragma: no cover
18777
+ raise ImportError(
18778
+ "Neither Polars nor Pandas is available for formatting"
18779
+ ) # pragma: no cover
18780
+
18781
+ # Create a single-row, single-column DataFrame using the specified library
18782
+ df = df_lib.DataFrame({"value": [value]})
18783
+
18784
+ # Create GT object and format the column
18785
+ gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
18786
+
18787
+ # Extract the formatted value using _get_column_of_values
18788
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
18789
+
18790
+ return formatted_values[0] # Return the single formatted value
18791
+
18792
+
18793
+ def _format_single_float_with_gt_custom(
18794
+ value: float,
18795
+ decimals: int = 2,
18796
+ drop_trailing_zeros: bool = False,
18797
+ locale: str = "en",
18798
+ df_lib=None,
18799
+ ) -> str:
18800
+ """Format a single float with custom options using Great Tables GT object to avoid pandas dependency."""
18801
+ if df_lib is None:
18802
+ # Use library detection to select appropriate DataFrame library
18803
+ if _is_lib_present("polars"):
18804
+ import polars as pl
18805
+
18806
+ df_lib = pl
18807
+ elif _is_lib_present("pandas"): # pragma: no cover
18808
+ import pandas as pd # pragma: no cover
18809
+
18810
+ df_lib = pd # pragma: no cover
18811
+ else: # pragma: no cover
18812
+ raise ImportError(
18813
+ "Neither Polars nor Pandas is available for formatting"
18814
+ ) # pragma: no cover
18815
+
18816
+ # Create a single-row, single-column DataFrame using the specified library
18817
+ df = df_lib.DataFrame({"value": [value]})
18818
+
18819
+ # Create GT object and format the column
18820
+ gt_obj = GT(df).fmt_number(
18821
+ columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
18822
+ )
18823
+
18824
+ # Extract the formatted value using _get_column_of_values
18825
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
18826
+
18827
+ return formatted_values[0] # Return the single formatted value
18828
+
18829
+
18830
+ def _format_number_safe(
18831
+ value: float, decimals: int, drop_trailing_zeros: bool = False, locale: str = "en", df_lib=None
18832
+ ) -> str:
18833
+ """
18834
+ Safely format a float value with locale support.
18835
+
18836
+ Uses GT-based formatting when a DataFrame library is available, otherwise falls back to
18837
+ vals.fmt_number. This helper is used by threshold formatting functions.
18838
+ """
18839
+ if df_lib is not None and value is not None:
18840
+ # Use GT-based formatting to avoid Pandas dependency completely
18841
+ return _format_single_float_with_gt_custom(
18842
+ value,
18843
+ decimals=decimals,
18844
+ drop_trailing_zeros=drop_trailing_zeros,
18845
+ locale=locale,
18846
+ df_lib=df_lib,
18847
+ )
18848
+ else:
18849
+ # Fallback to the original behavior
18850
+ return fmt_number(
18851
+ value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
18852
+ )[0] # pragma: no cover
18853
+
18854
+
18855
+ def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
18856
+ """
18857
+ Safely format an integer value with locale support.
18858
+
18859
+ Uses GT-based formatting when a DataFrame library is available, otherwise falls back to
18860
+ vals.fmt_integer. This helper is used by threshold formatting functions.
18861
+ """
18862
+ if df_lib is not None and value is not None:
18863
+ # Use GT-based formatting to avoid Pandas dependency completely
18864
+ return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
18865
+ else:
18866
+ # Fallback to the original behavior
18867
+ return fmt_integer(value, locale=locale)[0]
18868
+
18869
+
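Editor's note: both `_format_*_safe()` helpers prefer the GT-based path when a DataFrame library handle is passed in and otherwise fall back to the `fmt_*` value formatters; either way a locale-aware string comes back. For instance (expected outputs assume the usual Great Tables en-locale formatting):

_format_integer_safe(1234567, locale="en")           # -> "1,234,567" via the fmt_integer fallback
_format_number_safe(0.125, decimals=3, locale="en")  # -> "0.125"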
18870
+ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
18871
+ if thresholds == Thresholds():
18872
+ return ""
18873
+
18874
+ warning = (
18875
+ _format_number_safe(
18876
+ thresholds.warning_fraction,
18877
+ decimals=3,
18878
+ drop_trailing_zeros=True,
18879
+ locale=locale,
18880
+ df_lib=df_lib,
18881
+ )
18882
+ if thresholds.warning_fraction is not None
18883
+ else (
18884
+ _format_integer_safe(thresholds.warning_count, locale=locale, df_lib=df_lib)
18885
+ if thresholds.warning_count is not None
18886
+ else "&mdash;"
18887
+ )
18888
+ )
18889
+
18890
+ error = (
18891
+ _format_number_safe(
18892
+ thresholds.error_fraction,
18893
+ decimals=3,
18894
+ drop_trailing_zeros=True,
18895
+ locale=locale,
18896
+ df_lib=df_lib,
18897
+ )
18898
+ if thresholds.error_fraction is not None
18899
+ else (
18900
+ _format_integer_safe(thresholds.error_count, locale=locale, df_lib=df_lib)
18901
+ if thresholds.error_count is not None
18902
+ else "&mdash;"
18903
+ )
18904
+ )
18905
+
18906
+ critical = (
18907
+ _format_number_safe(
18908
+ thresholds.critical_fraction,
18909
+ decimals=3,
18910
+ drop_trailing_zeros=True,
18911
+ locale=locale,
18912
+ df_lib=df_lib,
18913
+ )
18914
+ if thresholds.critical_fraction is not None
18915
+ else (
18916
+ _format_integer_safe(thresholds.critical_count, locale=locale, df_lib=df_lib)
18917
+ if thresholds.critical_count is not None
18918
+ else "&mdash;"
18919
+ )
18920
+ )
18921
+
18922
+ warning_color = SEVERITY_LEVEL_COLORS["warning"]
18923
+ error_color = SEVERITY_LEVEL_COLORS["error"]
18924
+ critical_color = SEVERITY_LEVEL_COLORS["critical"]
18925
+
18926
+ return (
18927
+ "<span>"
18928
+ f'<span style="background-color: {warning_color}; color: white; '
18929
+ "padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
18930
+ f"margin: 5px 0px 5px 5px; border: solid 1px {warning_color}; "
18931
+ 'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">WARNING</span>'
18932
+ '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
18933
+ "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
18934
+ f"border: solid 1px {warning_color}; padding: 2px 15px 2px 15px; "
18935
+ 'font-size: smaller; margin-right: 5px;">'
18936
+ f"{warning}"
18937
+ "</span>"
18938
+ f'<span style="background-color: {error_color}; color: white; '
18939
+ "padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
18940
+ f"margin: 5px 0px 5px 1px; border: solid 1px {error_color}; "
18941
+ 'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">ERROR</span>'
18942
+ '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
18943
+ "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
18944
+ f"border: solid 1px {error_color}; padding: 2px 15px 2px 15px; "
18945
+ 'font-size: smaller; margin-right: 5px;">'
18946
+ f"{error}"
18947
+ "</span>"
18948
+ f'<span style="background-color: {critical_color}; color: white; '
18949
+ "padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
18950
+ f"margin: 5px 0px 5px 1px; border: solid 1px {critical_color}; "
18951
+ 'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">CRITICAL</span>'
18952
+ '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
18953
+ "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
18954
+ f"border: solid 1px {critical_color}; padding: 2px 15px 2px 15px; "
18955
+ 'font-size: smaller;">'
18956
+ f"{critical}"
18957
+ "</span>"
18958
+ "</span>"
18959
+ )
18960
+
18961
+
18962
+ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en") -> str:
18963
+ """
18964
+ Create a miniature HTML representation of local thresholds for display in notes.
18965
+
18966
+ This function generates a compact HTML representation of threshold values that is suitable for
18967
+ display in validation step notes/footnotes. It follows a similar visual style to the global
18968
+ thresholds shown in the header, but in a more condensed form.
18969
+
18970
+ Parameters
18971
+ ----------
18972
+ thresholds
18973
+ The Thresholds object containing the local threshold values.
18974
+ locale
18975
+ The locale to use for formatting numbers (default: "en").
18976
+
18977
+ Returns
18978
+ -------
18979
+ str
18980
+ HTML string containing the formatted threshold information.
18981
+ """
18982
+ if thresholds == Thresholds():
18983
+ return ""
18984
+
18985
+ # Get df_lib for formatting
18986
+ df_lib = None
18987
+ if _is_lib_present("polars"):
18988
+ import polars as pl
18989
+
18990
+ df_lib = pl
18991
+ elif _is_lib_present("pandas"):
18992
+ import pandas as pd
18993
+
18994
+ df_lib = pd
18995
+
18996
+ # Helper function to format threshold values using the shared formatting functions
18997
+ def _format_threshold_value(fraction: float | None, count: int | None) -> str:
18998
+ if fraction is not None:
18999
+ # Format as fraction/percentage with locale formatting
19000
+ if fraction == 0:
19001
+ return "0"
19002
+ elif fraction < 0.01:
19003
+ # For very small fractions, show "<0.01" with locale formatting
19004
+ formatted = _format_number_safe(0.01, decimals=2, locale=locale, df_lib=df_lib)
19005
+ return f"&lt;{formatted}"
19006
+ else:
19007
+ # Use shared formatting function with drop_trailing_zeros
19008
+ formatted = _format_number_safe(
19009
+ fraction, decimals=2, drop_trailing_zeros=True, locale=locale, df_lib=df_lib
19010
+ )
19011
+ return formatted
19012
+ elif count is not None:
19013
+ # Format integer count using shared formatting function
19014
+ return _format_integer_safe(count, locale=locale, df_lib=df_lib)
19015
+ else:
19016
+ return "&mdash;"
19017
+
19018
+ warning = _format_threshold_value(thresholds.warning_fraction, thresholds.warning_count)
19019
+ error = _format_threshold_value(thresholds.error_fraction, thresholds.error_count)
19020
+ critical = _format_threshold_value(thresholds.critical_fraction, thresholds.critical_count)
19021
+
19022
+ warning_color = SEVERITY_LEVEL_COLORS["warning"]
19023
+ error_color = SEVERITY_LEVEL_COLORS["error"]
19024
+ critical_color = SEVERITY_LEVEL_COLORS["critical"]
19025
+
19026
+ # Build threshold parts with colored letters in monospace font
19027
+ threshold_parts = []
19028
+
19029
+ # Add warning threshold if set
19030
+ if thresholds.warning is not None:
19031
+ threshold_parts.append(
19032
+ f'<span style="color: {warning_color}; font-weight: bold;">W</span>:{warning}'
19033
+ )
19034
+
19035
+ # Add error threshold if set
19036
+ if thresholds.error is not None:
19037
+ threshold_parts.append(
19038
+ f'<span style="color: {error_color}; font-weight: bold;">E</span>:{error}'
19039
+ )
19040
+
19041
+ # Add critical threshold if set
19042
+ if thresholds.critical is not None:
19043
+ threshold_parts.append(
19044
+ f'<span style="color: {critical_color}; font-weight: bold;">C</span>:{critical}'
19045
+ )
19046
+
19047
+ # Join with "|" separator (only between multiple thresholds)
19048
+ thresholds_html = f'<span style="font-family: monospace;">{"|".join(threshold_parts)}</span>'
19049
+
19050
+ # Get localized text and format with threshold HTML
19051
+ localized_text = NOTES_TEXT["local_threshold"].get(locale, NOTES_TEXT["local_threshold"]["en"])
19052
+ note_html = localized_text.replace("{thresholds}", thresholds_html)
19053
+
19054
+ return note_html
19055
+
19056
+
19057
+ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
19058
+ """
19059
+ Create a plain text representation of local thresholds for display in logs.
19060
+
19061
+ This function generates a plain text representation of threshold values that is
19062
+ suitable for display in text-based output such as logs or console output.
19063
+
19064
+ Parameters
19065
+ ----------
19066
+ thresholds
19067
+ The Thresholds object containing the local threshold values.
19068
+
19069
+ Returns
19070
+ -------
19071
+ str
19072
+ Plain text string containing the formatted threshold information.
19073
+ """
19074
+ if thresholds == Thresholds():
19075
+ return ""
19076
+
19077
+ # Helper function to format threshold values
19078
+ def _format_threshold_value(fraction: float | None, count: int | None) -> str:
19079
+ if fraction is not None:
19080
+ if fraction == 0:
19081
+ return "0"
19082
+ elif fraction < 0.01:
19083
+ return "<0.01"
19084
+ else:
19085
+ return f"{fraction:.2f}".rstrip("0").rstrip(".")
19086
+ elif count is not None:
19087
+ return str(count)
19088
+ else:
19089
+ return "—"
19090
+
19091
+ parts = []
19092
+
19093
+ if thresholds.warning is not None:
19094
+ warning = _format_threshold_value(thresholds.warning_fraction, thresholds.warning_count)
19095
+ parts.append(f"W: {warning}")
19096
+
19097
+ if thresholds.error is not None:
19098
+ error = _format_threshold_value(thresholds.error_fraction, thresholds.error_count)
19099
+ parts.append(f"E: {error}")
19100
+
19101
+ if thresholds.critical is not None:
19102
+ critical = _format_threshold_value(thresholds.critical_fraction, thresholds.critical_count)
19103
+ parts.append(f"C: {critical}")
19104
+
19105
+ if parts:
19106
+ return "Step-specific thresholds set: " + ", ".join(parts)
19107
+ else:
19108
+ return ""
19109
+
19110
+
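Editor's note: for a step with local thresholds, the plain-text note simply lists whichever of W/E/C are defined. Assuming pointblank's usual `Thresholds(warning=..., error=..., critical=...)` semantics (values below 1 stored as fractions, values of 1 or more as counts), an illustrative result:

_create_local_threshold_note_text(Thresholds(warning=0.1, error=25))
# -> "Step-specific thresholds set: W: 0.1, E: 25"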
19111
+ def _create_threshold_reset_note_html(locale: str = "en") -> str:
19112
+ """
19113
+ Create an HTML note for when thresholds are explicitly reset to empty.
19114
+
19115
+ Parameters
19116
+ ----------
19117
+ locale
19118
+ The locale string (e.g., 'en', 'fr').
19119
+
19120
+ Returns
19121
+ -------
19122
+ str
19123
+ HTML-formatted note text.
19124
+ """
19125
+ text = NOTES_TEXT.get("local_threshold_reset", {}).get(
19126
+ locale, NOTES_TEXT.get("local_threshold_reset", {}).get("en", "")
19127
+ )
19128
+ return text
19129
+
19130
+
19131
+ def _create_threshold_reset_note_text() -> str:
19132
+ """
19133
+ Create a plain text note for when thresholds are explicitly reset to empty.
19134
+
19135
+ Returns
19136
+ -------
19137
+ str
19138
+ Plain text note.
19139
+ """
19140
+ return "Global thresholds explicitly not used for this step."
19141
+
19142
+
19143
+ def _create_no_columns_resolved_note_html(
19144
+ column_expr: str, available_columns: list[str], locale: str = "en"
19145
+ ) -> str:
19146
+ """
19147
+ Create an HTML note explaining that a column expression resolved to no columns.
19148
+
19149
+ Parameters
19150
+ ----------
19151
+ column_expr
19152
+ The column expression that failed to resolve columns (as a string).
19153
+ available_columns
19154
+ List of available column names in the table.
19155
+ locale
19156
+ The locale string (e.g., 'en', 'fr').
19157
+
19158
+ Returns
19159
+ -------
19160
+ str
19161
+ HTML-formatted note text.
19162
+ """
19163
+ # Get translated strings
19164
+ intro = NOTES_TEXT.get("column_not_found_intro", {}).get(
19165
+ locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression")
19166
+ )
19167
+ no_resolve = NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
19168
+ locale,
19169
+ NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
19170
+ "en", "does not resolve to any columns"
19171
+ ),
19172
+ )
19173
+
19174
+ # Format the column expression with monospace font
19175
+ col_expr_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_expr}</code>"
19176
+
19177
+ # Build the HTML note
19178
+ html = f"{intro} {col_expr_html} {no_resolve}."
19179
+
19180
+ return html
19181
+
19182
+
19183
+ def _create_no_columns_resolved_note_text(column_expr: str, available_columns: list[str]) -> str:
19184
+ """
19185
+ Create a plain text note explaining that a column expression resolved to no columns.
19186
+
19187
+ Parameters
19188
+ ----------
19189
+ column_expr
19190
+ The column expression that failed to resolve columns (as a string).
19191
+ available_columns
19192
+ List of available column names in the table.
19193
+
19194
+ Returns
19195
+ -------
19196
+ str
19197
+ Plain text note.
19198
+ """
19199
+ return f"The column expression `{column_expr}` does not resolve to any columns."
19200
+
19201
+
19202
+ def _create_column_not_found_note_html(
19203
+ column_name: str, available_columns: list[str], locale: str = "en"
19204
+ ) -> str:
19205
+ """
19206
+ Create an HTML note explaining that a specific column was not found.
19207
+
19208
+ Parameters
19209
+ ----------
19210
+ column_name
19211
+ The column name that was not found.
19212
+ available_columns
19213
+ List of available column names in the table.
19214
+ locale
19215
+ The locale string (e.g., 'en', 'fr').
19216
+
19217
+ Returns
19218
+ -------
19219
+ str
19220
+ HTML-formatted note text.
19221
+ """
19222
+ # Get translated strings
19223
+ intro = NOTES_TEXT.get("target_column_provided", {}).get(
19224
+ locale, NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided")
19225
+ )
19226
+ not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19227
+ locale,
19228
+ NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19229
+ "en", "does not match any columns in the table"
19230
+ ),
19231
+ )
19232
+
19233
+ # Format the column name with monospace font
19234
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19235
+
19236
+ # Build the HTML note
19237
+ html = f"{intro} ({col_name_html}) {not_found}."
19238
+
19239
+ return html
19240
+
19241
+
19242
+ def _create_column_not_found_note_text(column_name: str, available_columns: list[str]) -> str:
19243
+ """
19244
+ Create a plain text note explaining that a specific column was not found.
19245
+
19246
+ Parameters
19247
+ ----------
19248
+ column_name
19249
+ The column name that was not found.
19250
+ available_columns
19251
+ List of available column names in the table.
19252
+
19253
+ Returns
19254
+ -------
19255
+ str
19256
+ Plain text note.
19257
+ """
19258
+ return f"The target column provided ({column_name}) does not match any columns in the table."
19259
+
19260
+
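Editor's note: the text variant mirrors the HTML note minus the styling, e.g.:

_create_column_not_found_note_text("total", available_columns=["a", "b", "c"])
# -> "The target column provided (total) does not match any columns in the table."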
19261
+ def _create_comparison_column_not_found_note_html(
19262
+ column_name: str, position: str | None, available_columns: list[str], locale: str = "en"
19263
+ ) -> str:
19264
+ """
19265
+ Create an HTML note explaining that a comparison column was not found.
19266
+
19267
+ Parameters
19268
+ ----------
19269
+ column_name
19270
+ The comparison column name that was not found.
19271
+ position
19272
+ Optional position indicator ("left", "right") for between/outside validations.
19273
+ available_columns
19274
+ List of available column names in the table.
19275
+ locale
19276
+ The locale string (e.g., 'en', 'fr').
19277
+
19278
+ Returns
19279
+ -------
19280
+ str
19281
+ HTML-formatted note text.
19282
+ """
19283
+ # Get translated strings
19284
+ intro = NOTES_TEXT.get("comparison_column_provided", {}).get(
19285
+ locale,
19286
+ NOTES_TEXT.get("comparison_column_provided", {}).get(
19287
+ "en", "The comparison column provided"
19288
+ ),
19289
+ )
19290
+ intro_with_for = NOTES_TEXT.get("comparison_column_for", {}).get(
19291
+ locale,
19292
+ NOTES_TEXT.get("comparison_column_for", {}).get("en", "The comparison column provided for"),
19293
+ )
19294
+ not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19295
+ locale,
19296
+ NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19297
+ "en", "does not match any columns in the table"
19298
+ ),
19299
+ )
19300
+
19301
+ # Format the column name with monospace font
19302
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19303
+
19304
+ # Add position if provided (for between/outside validations)
19305
+ if position:
19306
+ # Format position parameter with monospace font (e.g., "left=", "right=")
19307
+ position_param = (
19308
+ f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{position}=</code>"
19309
+ )
19310
+ # Use the "for" version of the intro text
19311
+ html = f"{intro_with_for} {position_param} ({col_name_html}) {not_found}."
19312
+ else:
19313
+ # Use the standard intro text without "for"
19314
+ html = f"{intro} ({col_name_html}) {not_found}."
19315
+
19316
+ return html
19317
+
19318
+
19319
+ def _create_comparison_column_not_found_note_text(
19320
+ column_name: str, position: str | None, available_columns: list[str]
19321
+ ) -> str:
19322
+ """
19323
+ Create a plain text note explaining that a comparison column was not found.
19324
+
19325
+ Parameters
19326
+ ----------
19327
+ column_name
19328
+ The comparison column name that was not found.
19329
+ position
19330
+ Optional position indicator ("left", "right") for between/outside validations.
19331
+ available_columns
19332
+ List of available column names in the table.
19333
+
19334
+ Returns
19335
+ -------
19336
+ str
19337
+ Plain text note.
19338
+ """
19339
+ if position:
19340
+ position_text = f" for {position}="
19341
+ else:
19342
+ position_text = ""
19343
+
19344
+ return (
19345
+ f"The comparison column provided{position_text} ({column_name}) "
19346
+ f"does not match any columns in the table."
19347
+ )
19348
+
19349
+
19350
+ def _create_preprocessing_note_html(
19351
+ original_rows: int,
19352
+ original_cols: int,
19353
+ processed_rows: int,
19354
+ processed_cols: int,
19355
+ locale: str = "en",
19356
+ ) -> str:
19357
+ """
19358
+ Create an HTML note showing table dimension changes from preprocessing.
19359
+
19360
+ Parameters
19361
+ ----------
19362
+ original_rows
19363
+ Number of rows in the original table.
19364
+ original_cols
19365
+ Number of columns in the original table.
19366
+ processed_rows
19367
+ Number of rows after preprocessing.
19368
+ processed_cols
19369
+ Number of columns after preprocessing.
19370
+ locale
19371
+ The locale string (e.g., 'en', 'fr').
19372
+
19373
+ Returns
19374
+ -------
19375
+ str
19376
+ HTML-formatted note text.
19377
+ """
19378
+ # Get translated strings
19379
+ precondition_text = NOTES_TEXT.get("precondition_applied", {}).get(
19380
+ locale, NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied")
19381
+ )
19382
+ table_dims_text = NOTES_TEXT.get("table_dimensions", {}).get(
19383
+ locale, NOTES_TEXT.get("table_dimensions", {}).get("en", "table dimensions")
19384
+ )
19385
+
19386
+ # Helper function to get singular or plural form
19387
+ def get_row_text(count: int) -> str:
19388
+ if count == 1:
19389
+ return NOTES_TEXT.get("row", {}).get(locale, NOTES_TEXT.get("row", {}).get("en", "row"))
19390
+ return NOTES_TEXT.get("rows", {}).get(locale, NOTES_TEXT.get("rows", {}).get("en", "rows"))
19391
+
19392
+ def get_col_text(count: int) -> str:
19393
+ if count == 1:
19394
+ return NOTES_TEXT.get("column", {}).get(
19395
+ locale, NOTES_TEXT.get("column", {}).get("en", "column")
19396
+ )
19397
+ return NOTES_TEXT.get("columns", {}).get(
19398
+ locale, NOTES_TEXT.get("columns", {}).get("en", "columns")
19399
+ )
19400
+
19401
+ # Determine which dimensions changed
19402
+ rows_changed = original_rows != processed_rows
19403
+ cols_changed = original_cols != processed_cols
19404
+
19405
+ # Format original dimensions
19406
+ original_rows_text = get_row_text(original_rows)
19407
+ original_cols_text = get_col_text(original_cols)
19408
+ original_dim = (
19409
+ f'<span style="font-family: monospace;">'
19410
+ f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}]"
19411
+ f"</span>"
19412
+ )
19413
+
19414
+ # Format processed dimensions with bold for changed values
19415
+ processed_rows_text = get_row_text(processed_rows)
19416
+ processed_cols_text = get_col_text(processed_cols)
19417
+
19418
+ if rows_changed:
19419
+ rows_display = f"<strong>{processed_rows:,}</strong> {processed_rows_text}"
19420
+ else:
19421
+ rows_display = f"{processed_rows:,} {processed_rows_text}"
19422
+
19423
+ if cols_changed:
19424
+ cols_display = f"<strong>{processed_cols}</strong> {processed_cols_text}"
19425
+ else:
19426
+ cols_display = f"{processed_cols} {processed_cols_text}"
19427
+
19428
+ processed_dim = f'<span style="font-family: monospace;">[{rows_display}, {cols_display}]</span>'
19429
+
19430
+ # Build the HTML note
19431
+ html = f"{precondition_text}: {table_dims_text} {original_dim} → {processed_dim}."
19432
+
19433
+ return html
19434
+
19435
+
19436
+ def _create_preprocessing_note_text(
19437
+ original_rows: int,
19438
+ original_cols: int,
19439
+ processed_rows: int,
19440
+ processed_cols: int,
19441
+ ) -> str:
19442
+ """
19443
+ Create a plain text note showing table dimension changes from preprocessing.
19444
+
19445
+ Parameters
19446
+ ----------
19447
+ original_rows
19448
+ Number of rows in the original table.
19449
+ original_cols
19450
+ Number of columns in the original table.
19451
+ processed_rows
19452
+ Number of rows after preprocessing.
19453
+ processed_cols
19454
+ Number of columns after preprocessing.
19455
+
19456
+ Returns
19457
+ -------
19458
+ str
19459
+ Plain text note.
19460
+ """
19461
+ # Get singular or plural forms
19462
+ original_rows_text = "row" if original_rows == 1 else "rows"
19463
+ original_cols_text = "column" if original_cols == 1 else "columns"
19464
+ processed_rows_text = "row" if processed_rows == 1 else "rows"
19465
+ processed_cols_text = "column" if processed_cols == 1 else "columns"
19466
+
19467
+ return (
19468
+ f"Precondition applied: table dimensions "
19469
+ f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}] → "
19470
+ f"[{processed_rows:,} {processed_rows_text}, {processed_cols} {processed_cols_text}]."
19471
+ )
19472
+
19473
+
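Editor's note: applied to illustrative dimensions, the helper above yields:

_create_preprocessing_note_text(1000, 8, 950, 8)
# -> "Precondition applied: table dimensions [1,000 rows, 8 columns] → [950 rows, 8 columns]."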
19474
+ def _create_preprocessing_no_change_note_html(locale: str = "en") -> str:
19475
+ """
19476
+ Create an HTML note indicating preprocessing was applied with no dimension change.
19477
+
19478
+ Parameters
19479
+ ----------
19480
+ locale
19481
+ The locale string (e.g., 'en', 'fr').
19482
+
19483
+ Returns
19484
+ -------
19485
+ str
19486
+ HTML-formatted note text.
19487
+ """
19488
+ # Get translated string
19489
+ note_text = NOTES_TEXT.get("precondition_applied_no_change", {}).get(
19490
+ locale,
19491
+ NOTES_TEXT.get("precondition_applied_no_change", {}).get(
19492
+ "en", "Precondition applied: no table dimension change"
19493
+ ),
19494
+ )
19495
+
19496
+ return f"{note_text}."
19497
+
19498
+
19499
+ def _create_preprocessing_no_change_note_text() -> str:
17870
19500
  """
17871
- Create markdown text for validation notes/footnotes.
19501
+ Create a plain text note indicating preprocessing was applied with no dimension change.
17872
19502
 
17873
- This function collects notes from all validation steps and formats them as footnotes
17874
- for display in the report footer. Each note is prefixed with the step number in
17875
- uppercase small caps bold formatting, and the note content is rendered as markdown.
19503
+ Returns
19504
+ -------
19505
+ str
19506
+ Plain text note.
19507
+ """
19508
+ return "Precondition applied: no table dimension change."
19509
+
19510
+
19511
+ def _create_synthetic_target_column_note_html(column_name: str, locale: str = "en") -> str:
19512
+ """
19513
+ Create an HTML note indicating that the target column was created via preprocessing.
17876
19514
 
17877
19515
  Parameters
17878
19516
  ----------
17879
- validation_info
17880
- List of _ValidationInfo objects from which to extract notes.
19517
+ column_name
19518
+ The name of the synthetic target column.
19519
+ locale
19520
+ The locale string (e.g., 'en', 'fr').
17881
19521
 
17882
19522
  Returns
17883
19523
  -------
17884
19524
  str
17885
- Markdown string containing formatted footnotes, or empty string if no notes exist.
19525
+ HTML-formatted note text.
17886
19526
  """
17887
- # Collect all notes from validation steps
17888
- all_notes = []
17889
- for step in validation_info:
17890
- if step.notes:
17891
- for key, content in step.notes.items():
17892
- # Store note with step number for context
17893
- all_notes.append(
17894
- {
17895
- "step": step.i,
17896
- "key": key,
17897
- "markdown": content["markdown"],
17898
- "text": content["text"],
17899
- }
17900
- )
17901
-
17902
- # If no notes, return empty string
17903
- if not all_notes:
17904
- return ""
19527
+ # Get translated strings
19528
+ synthetic_text = NOTES_TEXT.get("synthetic_target_column", {}).get(
19529
+ locale, NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column")
19530
+ )
19531
+ created_via_text = NOTES_TEXT.get("created_via_preprocessing", {}).get(
19532
+ locale,
19533
+ NOTES_TEXT.get("created_via_preprocessing", {}).get("en", "created via preprocessing"),
19534
+ )
17905
19535
 
17906
- # Build markdown for notes section
17907
- # Start with a styled horizontal rule and bold "Notes" header
17908
- notes_parts = [
17909
- (
17910
- "<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
17911
- "border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
17912
- ),
17913
- "<strong>Notes</strong>",
17914
- "",
17915
- ]
19536
+ # Format the column name with monospace font
19537
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
17916
19538
 
17917
- previous_step = None
17918
- for note in all_notes:
17919
- # Determine if this is the first note for this step
17920
- is_first_for_step = note["step"] != previous_step
17921
- previous_step = note["step"]
19539
+ # Build the HTML note
19540
+ html = f"{synthetic_text} {col_name_html} {created_via_text}."
17922
19541
 
17923
- # Format step label with HTML for uppercase small caps bold
17924
- # Use lighter color for subsequent notes of the same step
17925
- step_color = "#333333" if is_first_for_step else "#999999"
17926
- step_label = (
17927
- f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
17928
- f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
17929
- )
19542
+ return html
17930
19543
 
17931
- # Format note key in monospaced font with smaller size
17932
- note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"
17933
19544
 
17934
- # Combine step label, note key, and markdown content
17935
- note_text = f"{step_label} {note_key} {note['markdown']}"
17936
- notes_parts.append(note_text)
17937
- notes_parts.append("") # Add blank line between notes
19545
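Assuming the English default strings above, the assembled HTML note comes out roughly as follows; "row_sum" is only an example column name.

expected_html = (
    "Synthetic target column "
    "<code style='font-family: \"IBM Plex Mono\", monospace;'>row_sum</code> "
    "created via preprocessing."
)
# _create_synthetic_target_column_note_html("row_sum", locale="en") is expected to
# return the string above when no translation overrides the English defaults.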
+ def _create_synthetic_target_column_note_text(column_name: str) -> str:
19546
+ """
19547
+ Create a plain text note indicating that the target column was created via preprocessing.
17938
19548
 
17939
- # Remove trailing blank line
17940
- if notes_parts[-1] == "":
17941
- notes_parts.pop()
19549
+ Parameters
19550
+ ----------
19551
+ column_name
19552
+ The name of the synthetic target column.
17942
19553
 
17943
- # Join with newlines to create markdown text
17944
- notes_markdown = "\n".join(notes_parts)
19554
+ Returns
19555
+ -------
19556
+ str
19557
+ Plain text note.
19558
+ """
19559
+ return f"Synthetic target column ({column_name}) created via preprocessing."
17945
19560
 
17946
- return notes_markdown
17947
19561
 
19562
+ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> str:
19563
+ """
19564
+ Create an HTML note with a collapsible view of the schema expectation and comparison results.
17948
19565
 
17949
- def _create_label_html(label: str | None, start_time: str) -> str:
17950
- if label is None:
17951
- # Remove the decimal and everything beyond that
17952
- start_time = str(start_time).split(".")[0]
19566
+ This generates a disclosure-style note showing:
19567
+ 1. A summary of what failed (if anything)
19568
+ 2. The full step report table (collapsible)
17953
19569
 
17954
- # Replace the space character with a pipe character
17955
- start_time = start_time.replace(" ", "|")
19570
+ Parameters
19571
+ ----------
19572
+ schema_info
19573
+ The schema validation information dictionary from interrogation.
19574
+ locale
19575
+ The locale string (e.g., 'en', 'fr').
17956
19576
 
17957
- label = start_time
19577
+ Returns
19578
+ -------
19579
+ str
19580
+ HTML-formatted note with collapsible schema details.
19581
+ """
19582
+ passed = schema_info["passed"]
19583
+ expect_schema = schema_info["expect_schema"]
19584
+ target_schema = schema_info["target_schema"]
19585
+ params = schema_info["params"]
19586
+ columns_dict = schema_info["columns"]
19587
+ in_order = params["in_order"]
17958
19588
 
17959
- return (
17960
- f"<span style='text-decoration-style: solid; text-decoration-color: #ADD8E6; "
17961
- f"text-decoration-line: underline; text-underline-position: under; color: #333333; "
17962
- f"font-variant-numeric: tabular-nums; padding-left: 4px; margin-right: 5px; "
17963
- f"padding-right: 2px;'>{label}</span>"
19589
+ # Get translations for the locale
19590
+ passed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_passed"].get(
19591
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_passed"]["en"]
19592
+ )
19593
+ failed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_failed"].get(
19594
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_failed"]["en"]
19595
+ )
19596
+ disclosure_text = VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"].get(
19597
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"]["en"]
19598
+ )
19599
+ settings_title_text = VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"].get(
19600
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"]
17964
19601
  )
17965
19602
 
19603
+ # Build summary message
19604
+ if passed:
19605
+ summary = f'<span style="color:#4CA64C;">✓</span> {passed_text}.'
19606
+ else:
19607
+ # Analyze what failed
19608
+ failures = []
17966
19609
 
17967
- def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) -> str:
17968
- """Format a single integer using Great Tables GT object to avoid pandas dependency."""
17969
- if df_lib is None:
17970
- # Use library detection to select appropriate DataFrame library
17971
- if _is_lib_present("polars"):
17972
- import polars as pl
17973
-
17974
- df_lib = pl
17975
- elif _is_lib_present("pandas"): # pragma: no cover
17976
- import pandas as pd # pragma: no cover
17977
-
17978
- df_lib = pd # pragma: no cover
17979
- else: # pragma: no cover
17980
- raise ImportError(
17981
- "Neither Polars nor Pandas is available for formatting"
17982
- ) # pragma: no cover
17983
-
17984
- # Create a single-row, single-column DataFrame using the specified library
17985
- df = df_lib.DataFrame({"value": [value]})
17986
-
17987
- # Create GT object and format the column
17988
- gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
19610
+ # Check column count mismatch
19611
+ n_expect = len(expect_schema)
19612
+ n_target = len(target_schema)
19613
+ if n_expect != n_target:
19614
+ count_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"].get(
19615
+ locale, VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"]
19616
+ )
19617
+ failures.append(count_mismatch_text.format(n_expect=n_expect, n_target=n_target))
17989
19618
 
17990
- # Extract the formatted value using _get_column_of_values
17991
- formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
19619
+ # Check for unmatched columns
19620
+ unmatched_cols = [col for col, info in columns_dict.items() if not info["colname_matched"]]
19621
+ if unmatched_cols:
19622
+ unmatched_text = VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"].get(
19623
+ locale, VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"]["en"]
19624
+ )
19625
+ failures.append(unmatched_text.format(n=len(unmatched_cols)))
19626
+
19627
+ # Check for wrong order (if in_order=True)
19628
+ if params["in_order"]:
19629
+ wrong_order = [
19630
+ col
19631
+ for col, info in columns_dict.items()
19632
+ if info["colname_matched"] and not info["index_matched"]
19633
+ ]
19634
+ if wrong_order:
19635
+ wrong_order_text = VALIDATION_REPORT_TEXT["note_schema_wrong_order"].get(
19636
+ locale, VALIDATION_REPORT_TEXT["note_schema_wrong_order"]["en"]
19637
+ )
19638
+ failures.append(wrong_order_text.format(n=len(wrong_order)))
17992
19639
 
17993
- return formatted_values[0] # Return the single formatted value
19640
+ # Check for dtype mismatches
19641
+ dtype_mismatches = [
19642
+ col
19643
+ for col, info in columns_dict.items()
19644
+ if info["colname_matched"] and info["dtype_present"] and not info["dtype_matched"]
19645
+ ]
19646
+ if dtype_mismatches:
19647
+ dtype_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"].get(
19648
+ locale, VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"]["en"]
19649
+ )
19650
+ failures.append(dtype_mismatch_text.format(n=len(dtype_mismatches)))
17994
19651
 
19652
+ if failures:
19653
+ summary = (
19654
+ f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
19655
+ )
19656
+ else:
19657
+ summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.'
17995
19658
 
17996
- def _format_single_float_with_gt_custom(
17997
- value: float,
17998
- decimals: int = 2,
17999
- drop_trailing_zeros: bool = False,
18000
- locale: str = "en",
18001
- df_lib=None,
18002
- ) -> str:
18003
- """Format a single float with custom options using Great Tables GT object to avoid pandas dependency."""
18004
- if df_lib is None:
18005
- # Use library detection to select appropriate DataFrame library
18006
- if _is_lib_present("polars"):
18007
- import polars as pl
19659
+ # Generate the step report table using the existing function
19660
+ # We'll call either _step_report_schema_in_order or _step_report_schema_any_order
19661
+ # depending on the in_order parameter
19662
+ if in_order:
19663
+ step_report_gt = _step_report_schema_in_order(
19664
+ step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19665
+ )
19666
+ else:
19667
+ step_report_gt = _step_report_schema_any_order(
19668
+ step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19669
+ )
19670
+
19671
+ # Generate the settings HTML using the existing function
19672
+ settings_html = _create_col_schema_match_params_html(
19673
+ lang=locale,
19674
+ complete=params["complete"],
19675
+ in_order=params["in_order"],
19676
+ case_sensitive_colnames=params["case_sensitive_colnames"],
19677
+ case_sensitive_dtypes=params["case_sensitive_dtypes"],
19678
+ full_match_dtypes=params["full_match_dtypes"],
19679
+ )
18008
19680
 
18009
- df_lib = pl
18010
- elif _is_lib_present("pandas"): # pragma: no cover
18011
- import pandas as pd # pragma: no cover
19681
+ # Remove the inner div containing column_schema_match_str
19682
+ settings_html = re.sub(r'<div style="margin-right: 5px;">.*?</div>', "", settings_html, count=1)
18012
19683
 
18013
- df_lib = pd # pragma: no cover
18014
- else: # pragma: no cover
18015
- raise ImportError(
18016
- "Neither Polars nor Pandas is available for formatting"
18017
- ) # pragma: no cover
19684
+ # Change padding-top from 7px to 2px
19685
+ settings_html = settings_html.replace("padding-top: 7px;", "padding-top: 2px;")
18018
19686
 
18019
- # Create a single-row, single-column DataFrame using the specified library
18020
- df = df_lib.DataFrame({"value": [value]})
19687
+ # Create new source note HTML that includes both settings and schema
19688
+ source_note_html = f"""
19689
+ <div style='padding-bottom: 2px;'>{settings_title_text}</div>
19690
+ <div style='padding-bottom: 4px;'>{settings_html}</div>
19691
+ """
18021
19692
 
18022
- # Create GT object and format the column
18023
- gt_obj = GT(df).fmt_number(
18024
- columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
18025
- )
19693
+ # Add the settings as an additional source note to the step report
19694
+ step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html))
18026
19695
 
18027
- # Extract the formatted value using _get_column_of_values
18028
- formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
19696
+ # Extract the HTML from the GT object
19697
+ step_report_html = step_report_gt._repr_html_()
18029
19698
 
18030
- return formatted_values[0] # Return the single formatted value
19699
+ # Create collapsible section with the step report
19700
+ note_html = f"""
19701
+ {summary}
18031
19702
 
19703
+ <details style="margin-top: 2px; margin-bottom: 8px; font-size: 12px; text-indent: 12px;">
19704
+ <summary style="cursor: pointer; font-weight: bold; color: #555; margin-bottom: -5px;">{disclosure_text}</summary>
19705
+ <div style="margin-top: 6px; padding-left: 15px; padding-right: 15px;">
18032
19706
 
18033
- def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
18034
- if thresholds == Thresholds():
18035
- return ""
19707
+ {step_report_html}
18036
19708
 
18037
- # Helper functions to format numbers safely
18038
- def _format_number_safe(value: float, decimals: int, drop_trailing_zeros: bool = False) -> str:
18039
- if df_lib is not None and value is not None:
18040
- # Use GT-based formatting to avoid Pandas dependency completely
18041
- return _format_single_float_with_gt_custom(
18042
- value,
18043
- decimals=decimals,
18044
- drop_trailing_zeros=drop_trailing_zeros,
18045
- locale=locale,
18046
- df_lib=df_lib,
18047
- )
18048
- else:
18049
- # Fallback to the original behavior
18050
- return fmt_number(
18051
- value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
18052
- )[0] # pragma: no cover
19709
+ </div>
19710
+ </details>
19711
+ """
18053
19712
 
18054
- def _format_integer_safe(value: int) -> str:
18055
- if df_lib is not None and value is not None:
18056
- # Use GT-based formatting to avoid Pandas dependency completely
18057
- return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
18058
- else:
18059
- # Fallback to the original behavior
18060
- return fmt_integer(value, locale=locale)[0]
19713
+ return note_html.strip()
18061
19714
 
18062
- warning = (
18063
- _format_number_safe(thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True)
18064
- if thresholds.warning_fraction is not None
18065
- else (
18066
- _format_integer_safe(thresholds.warning_count)
18067
- if thresholds.warning_count is not None
18068
- else "&mdash;"
18069
- )
18070
- )
18071
19715
 
18072
- error = (
18073
- _format_number_safe(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True)
18074
- if thresholds.error_fraction is not None
18075
- else (
18076
- _format_integer_safe(thresholds.error_count)
18077
- if thresholds.error_count is not None
18078
- else "&mdash;"
18079
- )
18080
- )
19716
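A hedged sketch of the schema_info shape this note builder relies on, inferred only from the keys accessed above; the real interrogation output may carry additional fields, and every value here is illustrative.

schema_info_sketch = {
    "passed": False,
    "expect_schema": [("a", "Int64"), ("b", "String")],  # expected (column, dtype) pairs
    "target_schema": [("a", "Int64"), ("c", "String")],  # schema of the target table
    "params": {
        "complete": True,
        "in_order": True,
        "case_sensitive_colnames": True,
        "case_sensitive_dtypes": True,
        "full_match_dtypes": True,
    },
    "columns": {
        # one entry per expected column: name match, positional match, whether a
        # dtype was supplied, and whether that dtype matched the target column
        "a": {"colname_matched": True, "index_matched": True,
              "dtype_present": True, "dtype_matched": True},
        "b": {"colname_matched": False, "index_matched": False,
              "dtype_present": True, "dtype_matched": False},
    },
}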
+ def _create_col_schema_match_note_text(schema_info: dict) -> str:
19717
+ """
19718
+ Create a plain text pass/fail summary note for schema validation.
18081
19719
 
18082
- critical = (
18083
- _format_number_safe(thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True)
18084
- if thresholds.critical_fraction is not None
18085
- else (
18086
- _format_integer_safe(thresholds.critical_count)
18087
- if thresholds.critical_count is not None
18088
- else "&mdash;"
18089
- )
18090
- )
19720
+ Parameters
19721
+ ----------
19722
+ schema_info
19723
+ The schema validation information dictionary from interrogation.
18091
19724
 
18092
- warning_color = SEVERITY_LEVEL_COLORS["warning"]
18093
- error_color = SEVERITY_LEVEL_COLORS["error"]
18094
- critical_color = SEVERITY_LEVEL_COLORS["critical"]
19725
+ Returns
19726
+ -------
19727
+ str
19728
+ Plain text note.
19729
+ """
19730
+ passed = schema_info["passed"]
19731
+ expect_schema = schema_info["expect_schema"]
19732
+ target_schema = schema_info["target_schema"]
18095
19733
 
18096
- return (
18097
- "<span>"
18098
- f'<span style="background-color: {warning_color}; color: white; '
18099
- "padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
18100
- f"margin: 5px 0px 5px 5px; border: solid 1px {warning_color}; "
18101
- 'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">WARNING</span>'
18102
- '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
18103
- "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
18104
- f"border: solid 1px {warning_color}; padding: 2px 15px 2px 15px; "
18105
- 'font-size: smaller; margin-right: 5px;">'
18106
- f"{warning}"
18107
- "</span>"
18108
- f'<span style="background-color: {error_color}; color: white; '
18109
- "padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
18110
- f"margin: 5px 0px 5px 1px; border: solid 1px {error_color}; "
18111
- 'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">ERROR</span>'
18112
- '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
18113
- "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
18114
- f"border: solid 1px {error_color}; padding: 2px 15px 2px 15px; "
18115
- 'font-size: smaller; margin-right: 5px;">'
18116
- f"{error}"
18117
- "</span>"
18118
- f'<span style="background-color: {critical_color}; color: white; '
18119
- "padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
18120
- f"margin: 5px 0px 5px 1px; border: solid 1px {critical_color}; "
18121
- 'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">CRITICAL</span>'
18122
- '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
18123
- "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
18124
- f"border: solid 1px {critical_color}; padding: 2px 15px 2px 15px; "
18125
- 'font-size: smaller;">'
18126
- f"{critical}"
18127
- "</span>"
18128
- "</span>"
18129
- )
19734
+ if passed:
19735
+ return f"Schema validation passed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
19736
+ else:
19737
+ return f"Schema validation failed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
18130
19738
 
18131
19739
 
18132
19740
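Continuing with the illustrative schema_info_sketch above, the plain text variant reduces to a one-line summary.

note = _create_col_schema_match_note_text(schema_info_sketch)
# With passed=False, two expected columns and two target columns, this yields:
# 'Schema validation failed. Expected 2 column(s), found 2.'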
  def _step_report_row_based(
@@ -18576,16 +20184,33 @@ def _step_report_schema_in_order(
18576
20184
  dtype_exp = []
18577
20185
  dtype_exp_correct = []
18578
20186
 
18579
- for i in range(len(exp_columns_dict)):
20187
+ for i in range(len(expect_schema)):
18580
20188
  #
18581
20189
  # `col_name_exp` values
18582
20190
  #
18583
20191
 
18584
- # The column name is the key in the dictionary, get the column name and
18585
- # append it to the `col_name_exp` list
18586
- col_name_exp.append(list(exp_columns_dict.keys())[i])
20192
+ # Get the column name from expect_schema (which can have duplicates)
20193
+ column_name_exp_i = expect_schema[i][0]
20194
+ col_name_exp.append(column_name_exp_i)
20195
+
20196
+ # Check if this column exists in exp_columns_dict (it might not if it's a duplicate)
20197
+ # For duplicates, we need to handle them specially
20198
+ if column_name_exp_i not in exp_columns_dict:
20199
+ # This is a duplicate or invalid column, mark it as incorrect
20200
+ col_exp_correct.append(CROSS_MARK_SPAN)
20201
+
20202
+ # For dtype, check if there's a dtype specified in the schema
20203
+ if len(expect_schema[i]) > 1:
20204
+ dtype_value = expect_schema[i][1]
20205
+ if isinstance(dtype_value, list):
20206
+ dtype_exp.append(" | ".join(dtype_value))
20207
+ else:
20208
+ dtype_exp.append(str(dtype_value))
20209
+ else:
20210
+ dtype_exp.append("&mdash;")
18587
20211
 
18588
- column_name_exp_i = col_name_exp[i]
20212
+ dtype_exp_correct.append("&mdash;")
20213
+ continue
18589
20214
 
18590
20215
  #
18591
20216
  # `col_exp_correct` values