pointblank 0.15.0__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +25 -1
- pointblank/_constants_translations.py +2361 -2
- pointblank/_interrogation.py +24 -0
- pointblank/_typing.py +37 -9
- pointblank/_utils.py +0 -355
- pointblank/_utils_llms_txt.py +661 -0
- pointblank/column.py +24 -0
- pointblank/data/api-docs.txt +336 -3
- pointblank/validate.py +2551 -926
- pointblank/yaml.py +10 -2
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/METADATA +9 -4
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/RECORD +17 -16
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/WHEEL +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.15.0.dist-info → pointblank-0.17.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -12,6 +12,7 @@ import tempfile
 import threading
 from dataclasses import dataclass
 from enum import Enum
+from functools import partial
 from importlib.metadata import version
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Literal
@@ -45,6 +46,7 @@ from pointblank._constants import (
 )
 from pointblank._constants_translations import (
     EXPECT_FAIL_TEXT,
+    NOTES_TEXT,
     STEP_REPORT_TEXT,
     VALIDATION_REPORT_TEXT,
 )
@@ -53,6 +55,7 @@ from pointblank._interrogation import (
     SpeciallyValidation,
     col_count_match,
     col_exists,
+    col_pct_null,
     col_schema_match,
     col_vals_expr,
     conjointly_validation,
@@ -122,6 +125,7 @@ __all__ = [
     "write_file",
     "config",
     "connect_to_table",
+    "print_database_tables",
     "preview",
     "missing_vals_tbl",
     "get_action_metadata",
@@ -361,12 +365,16 @@ class PointblankConfig:
 
     report_incl_header: bool = True
     report_incl_footer: bool = True
+    report_incl_footer_timings: bool = True
+    report_incl_footer_notes: bool = True
     preview_incl_header: bool = True
 
     def __repr__(self):
         return (
             f"PointblankConfig(report_incl_header={self.report_incl_header}, "
             f"report_incl_footer={self.report_incl_footer}, "
+            f"report_incl_footer_timings={self.report_incl_footer_timings}, "
+            f"report_incl_footer_notes={self.report_incl_footer_notes}, "
             f"preview_incl_header={self.preview_incl_header})"
         )
 
@@ -378,6 +386,8 @@ global_config = PointblankConfig()
 def config(
     report_incl_header: bool = True,
     report_incl_footer: bool = True,
+    report_incl_footer_timings: bool = True,
+    report_incl_footer_notes: bool = True,
     preview_incl_header: bool = True,
 ) -> PointblankConfig:
     """
@@ -391,7 +401,13 @@ def config(
         threshold levels (if set).
     report_incl_footer
         Should the footer of the validation table report be displayed? The footer contains the
-        starting and ending times of the interrogation.
+        starting and ending times of the interrogation and any notes added to validation steps.
+    report_incl_footer_timings
+        Controls whether the validation timing information (start time, duration, and end time)
+        should be displayed in the footer. Only applies when `report_incl_footer=True`.
+    report_incl_footer_notes
+        Controls whether the notes from validation steps should be displayed in the footer. Only
+        applies when `report_incl_footer=True`.
     preview_incl_header
         Whether the header should be present in any preview table (generated via the
         [`preview()`](`pointblank.preview`) function).
@@ -405,6 +421,8 @@ def config(
     global global_config
     global_config.report_incl_header = report_incl_header  # pragma: no cover
     global_config.report_incl_footer = report_incl_footer  # pragma: no cover
+    global_config.report_incl_footer_timings = report_incl_footer_timings  # pragma: no cover
+    global_config.report_incl_footer_notes = report_incl_footer_notes  # pragma: no cover
     global_config.preview_incl_header = preview_incl_header  # pragma: no cover
 
 
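Taken together, these hunks add two opt-in switches for the report footer. A minimal usage sketch of the new `config()` flags (the call below is illustrative, not taken from the diff):

```python
import pointblank as pb

# Keep the footer, but hide the interrogation timings while
# still showing any notes attached to validation steps
pb.config(
    report_incl_footer=True,
    report_incl_footer_timings=False,
    report_incl_footer_notes=True,
)
```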
@@ -3918,6 +3936,47 @@ class _ValidationInfo:
         return self.notes is not None and len(self.notes) > 0
 
 
+def _handle_connection_errors(e: Exception, connection_string: str) -> None:
+    """
+    Shared error handling for database connection failures.
+
+    Raises appropriate ConnectionError with helpful messages based on the exception.
+    """
+
+    error_str = str(e).lower()
+    backend_install_map = {
+        "duckdb": "pip install 'ibis-framework[duckdb]'",
+        "postgresql": "pip install 'ibis-framework[postgres]'",
+        "postgres": "pip install 'ibis-framework[postgres]'",
+        "mysql": "pip install 'ibis-framework[mysql]'",
+        "sqlite": "pip install 'ibis-framework[sqlite]'",
+        "bigquery": "pip install 'ibis-framework[bigquery]'",
+        "snowflake": "pip install 'ibis-framework[snowflake]'",
+    }
+
+    # Check if this is a missing backend dependency
+    for backend, install_cmd in backend_install_map.items():
+        if backend in error_str and ("not found" in error_str or "no module" in error_str):
+            raise ConnectionError(
+                f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
+                f"  {install_cmd}\n\n"
+                f"Original error: {e}"
+            ) from e
+
+    # Generic connection error
+    raise ConnectionError(  # pragma: no cover
+        f"Failed to connect using: {connection_string}\n"
+        f"Error: {e}\n\n"
+        f"Supported connection string formats:\n"
+        f"- DuckDB: 'duckdb:///path/to/file.ddb'\n"
+        f"- SQLite: 'sqlite:///path/to/file.db'\n"
+        f"- PostgreSQL: 'postgresql://user:pass@host:port/db'\n"
+        f"- MySQL: 'mysql://user:pass@host:port/db'\n"
+        f"- BigQuery: 'bigquery://project/dataset'\n"
+        f"- Snowflake: 'snowflake://user:pass@account/db/schema'"
+    ) from e
+
+
 def connect_to_table(connection_string: str) -> Any:
     """
     Connect to a database table using a connection string.
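The extracted helper centralizes what were previously two near-identical error blocks. A small illustrative sketch of its behavior, assuming the function above is in scope (the simulated exception and database path are invented for the example):

```python
# Simulate the error Ibis would raise when the DuckDB backend is absent
try:
    _handle_connection_errors(
        ModuleNotFoundError("duckdb backend not found"),
        "duckdb:///analysis.ddb",
    )
except ConnectionError as err:
    # The message leads with the install hint:
    #   Missing DUCKDB backend for Ibis. Install it with:
    #     pip install 'ibis-framework[duckdb]'
    print(err)
```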
@@ -3997,7 +4056,11 @@ def connect_to_table(connection_string: str) -> Any:
     pip install 'ibis-framework[duckdb]'   # for DuckDB
     pip install 'ibis-framework[postgres]' # for PostgreSQL
     ```
+    See Also
+    --------
+    print_database_tables : List all available tables in a database for discovery
     """
+
     # Check if Ibis is available
     if not _is_lib_present(lib_name="ibis"):
         raise ImportError(
@@ -4011,14 +4074,10 @@ def connect_to_table(connection_string: str) -> Any:
     if "::" not in connection_string:
         # Try to connect to get available tables for helpful error message
        try:
-            # Extract the base connection string (without table name)
             base_connection = connection_string
-
-            # Connect to the database
             conn = ibis.connect(base_connection)
 
-            #
-            try:
+            try:  # pragma: no cover
                 available_tables = conn.list_tables()
             except Exception:  # pragma: no cover
                 available_tables = []
@@ -4035,7 +4094,6 @@ def connect_to_table(connection_string: str) -> Any:
                     f"  {connection_string}::TABLE_NAME\n\n"
                     f"Examples:\n"
                 )
-                # Add examples with first few table names
                 for table in available_tables[:3]:
                     error_msg += f"  {connection_string}::{table}\n"
             else:
@@ -4050,43 +4108,8 @@ def connect_to_table(connection_string: str) -> Any:
 
         except Exception as e:
             if isinstance(e, ValueError):
-                raise
-
-            # Check for backend-specific errors and provide installation guidance
-            error_str = str(e).lower()
-            backend_install_map = {
-                "duckdb": "pip install 'ibis-framework[duckdb]'",
-                "postgresql": "pip install 'ibis-framework[postgres]'",
-                "postgres": "pip install 'ibis-framework[postgres]'",
-                "mysql": "pip install 'ibis-framework[mysql]'",
-                "sqlite": "pip install 'ibis-framework[sqlite]'",
-                "bigquery": "pip install 'ibis-framework[bigquery]'",
-                "snowflake": "pip install 'ibis-framework[snowflake]'",
-            }
-
-            # Check if this is a missing backend dependency
-            for backend, install_cmd in backend_install_map.items():  # pragma: no cover
-                if backend in error_str and ("not found" in error_str or "no module" in error_str):
-                    raise ConnectionError(
-                        f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
-                        f"  {install_cmd}\n\n"
-                        f"Original error: {e}\n\n"
-                        f"Supported connection string formats:\n"
-                        f"- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
-                        f"- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
-                        f"- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
-                        f"- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
-                        f"- BigQuery: 'bigquery://project/dataset::table_name'\n"
-                        f"- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
-                        f"\nNote: Use '::table_name' to specify the table within the database."
-                    ) from e
-
-            # Generic connection error
-            raise ConnectionError(  # pragma: no cover
-                f"Failed to connect to database using connection string: {connection_string}\n"
-                f"Error: {e}\n\n"
-                f"No table specified. Use the format: {connection_string}::TABLE_NAME"
-            ) from e
+                raise
+            _handle_connection_errors(e, connection_string)
 
     # Split connection string and table name
     try:
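A hedged sketch of the resulting behavior when the `::table_name` suffix is omitted, per the error-message construction shown above (the database path and table names are hypothetical):

```python
import pointblank as pb

# With no `::table_name` suffix, `connect_to_table()` raises a ValueError
# that lists a few of the tables it found in the database
try:
    pb.connect_to_table("duckdb:///analysis.ddb")
except ValueError as err:
    print(err)
    # e.g. "... Use the format: duckdb:///analysis.ddb::TABLE_NAME
    #       Examples:
    #         duckdb:///analysis.ddb::sales ..."
```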
@@ -4099,32 +4122,14 @@ def connect_to_table(connection_string: str) -> Any:
         conn = ibis.connect(base_connection)
         table = conn.table(table_name)
         return table
-
     except Exception as e:
-        # Check for backend-specific errors and provide installation guidance
         error_str = str(e).lower()
-        backend_install_map = {
-            "duckdb": "pip install 'ibis-framework[duckdb]'",
-            "postgresql": "pip install 'ibis-framework[postgres]'",
-            "postgres": "pip install 'ibis-framework[postgres]'",
-            "mysql": "pip install 'ibis-framework[mysql]'",
-            "sqlite": "pip install 'ibis-framework[sqlite]'",
-            "bigquery": "pip install 'ibis-framework[bigquery]'",
-            "snowflake": "pip install 'ibis-framework[snowflake]'",
-        }
-
-        # Check if this is a missing backend dependency
-        for backend, install_cmd in backend_install_map.items():
-            if backend in error_str and ("not found" in error_str or "no module" in error_str):
-                raise ConnectionError(
-                    f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
-                    f"  {install_cmd}\n\n"
-                    f"Original error: {e}"
-                ) from e
 
-        # Check if table
-        if "table" in error_str and (
-
+        # Check if this is a "table not found" error
+        if "table" in error_str and (
+            "not found" in error_str or "does not exist" in error_str or "not exist" in error_str
+        ):
+            # Try to get available tables for a helpful error message
             try:  # pragma: no cover
                 available_tables = conn.list_tables()
                 if available_tables:
@@ -4132,23 +4137,79 @@ def connect_to_table(connection_string: str) -> Any:
                     raise ValueError(
                         f"Table '{table_name}' not found in database.\n\n"
                         f"Available tables:\n{table_list}\n\n"
-                        f"
-                        f"  {base_connection}::CORRECT_TABLE_NAME"
-                    ) from e
-                else:
-                    raise ValueError(
-                        f"Table '{table_name}' not found and no tables available in database."
+                        f"Connection: {base_connection}"
                     ) from e
+            except ValueError:
+                # Re-raise the table-specific ValueError
+                raise
             except Exception:
-                raise
-
-
-
+                # If we can't list tables, just raise a simple error
+                pass
+
+            raise ValueError(
+                f"Table '{table_name}' not found in database.\n"
+                f"Connection: {base_connection}\n\n"
+                f"Original error: {e}"
+            ) from e
+
+        # For other errors, use the generic connection error handler
+        _handle_connection_errors(e, base_connection)
+
+
+def print_database_tables(connection_string: str) -> list[str]:
+    """
+    List all tables in a database from a connection string.
+
+    The `print_database_tables()` function connects to a database and returns a list of all
+    available tables. This is particularly useful for discovering what tables exist in a database
+    before connecting to a specific table with `connect_to_table()`. The function automatically
+    filters out temporary Ibis tables (memtables) to show only user tables. It supports all database
+    backends available through Ibis, including DuckDB, SQLite, PostgreSQL, MySQL, BigQuery, and
+    Snowflake.
+
+    Parameters
+    ----------
+    connection_string
+        A database connection string *without* the `::table_name` suffix. Example:
+        `"duckdb:///path/to/database.ddb"`.
+
+    Returns
+    -------
+    list[str]
+        List of table names, excluding temporary Ibis tables.
+
+    See Also
+    --------
+    connect_to_table : Connect to a database table with full connection string documentation
+    """
+    # Check if connection string includes table specification (which is not allowed)
+    if "::" in connection_string:
+        raise ValueError(
+            "Connection string should not include table specification (::table_name).\n"
+            f"You've supplied: {connection_string}\n"
+            f"Expected format: 'duckdb:///path/to/database.ddb' (without ::table_name)"
+        )
+
+    # Check if Ibis is available
+    if not _is_lib_present(lib_name="ibis"):
+        raise ImportError(
+            "The Ibis library is not installed but is required for database connection strings.\n"
+            "Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
+        )
+
+    import ibis
+
+    try:
+        # Connect to database
+        conn = ibis.connect(connection_string)
+        # Get all tables and filter out temporary Ibis tables
+        all_tables = conn.list_tables()
+        user_tables = [t for t in all_tables if "memtable" not in t]
 
-
-
-
-        )
+        return user_tables
+
+    except Exception as e:
+        _handle_connection_errors(e, connection_string)
 
 
 @dataclass
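The new function pairs with `connect_to_table()` for a discover-then-connect workflow; a brief sketch (the database path and table names are hypothetical):

```python
import pointblank as pb

# Discover which user tables exist (Ibis memtables are filtered out)
tables = pb.print_database_tables("duckdb:///analysis.ddb")
print(tables)  # e.g. ['customers', 'orders']

# Then connect to one of them using the `::table_name` suffix
orders = pb.connect_to_table("duckdb:///analysis.ddb::orders")
```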
@@ -4430,6 +4491,16 @@ class Validate:
         - Vietnamese (`"vi"`)
         - Indonesian (`"id"`)
         - Ukrainian (`"uk"`)
+        - Bulgarian (`"bg"`)
+        - Croatian (`"hr"`)
+        - Estonian (`"et"`)
+        - Hungarian (`"hu"`)
+        - Irish (`"ga"`)
+        - Latvian (`"lv"`)
+        - Lithuanian (`"lt"`)
+        - Maltese (`"mt"`)
+        - Slovak (`"sk"`)
+        - Slovenian (`"sl"`)
         - Hebrew (`"he"`)
         - Thai (`"th"`)
         - Persian (`"fa"`)
@@ -9700,40 +9771,41 @@ class Validate:
 
         return self
 
-    def
+    def col_pct_null(
         self,
-
-
-
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        p: float,
+        tol: Tolerance = 0,
+        thresholds: int | float | None | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
     ) -> Validate:
         """
-        Validate whether
+        Validate whether a column has a specific percentage of Null values.
 
-        The `
-
-
+        The `col_pct_null()` validation method checks whether the percentage of Null values in a
+        column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
+        validation operates at the column level, generating a single validation step per column that
+        passes or fails based on whether the actual percentage of Null values falls within the
+        acceptable range defined by `p ± tol`.
 
         Parameters
         ----------
-
-            A single column or a list of columns to
-
-            columns are supplied
-
-
-
-
-
-
-            (provided as a list). Read the *Segmentation* section for usage information.
+        columns
+            A single column or a list of columns to validate. Can also use
+            [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+            multiple columns are supplied or resolved, there will be a separate validation step
+            generated for each column.
+        p
+            The expected percentage of Null values in the column, expressed as a decimal between
+            `0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
+        tol
+            The tolerance allowed when comparing the actual percentage of Null values to the
+            expected percentage `p=`. The validation passes if the actual percentage falls within
+            the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
+            the *Tolerance* section for details on all supported formats (absolute, relative,
+            symmetric, and asymmetric bounds).
         thresholds
             Set threshold failure levels for reporting and reacting to exceedences of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -9741,7 +9813,7 @@ class Validate:
            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
            section for information on how to set threshold levels.
         actions
-            Optional actions to take when the validation step meets or exceeds any set threshold
+            Optional actions to take when the validation step(s) meets or exceeds any set threshold
             levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
             define the actions.
         brief
@@ -9760,60 +9832,30 @@ class Validate:
         Validate
             The `Validate` object with the added validation step.
 
-
-
-        The `
-
-        table. This is useful for performing any necessary transformations or filtering on the data
-        before the validation step is applied.
-
-        The preprocessing function can be any callable that takes a table as input and returns a
-        modified table. For example, you could use a lambda function to filter the table based on
-        certain criteria or to apply a transformation to the data. Note that you can refer to
-        columns via `columns_subset=` that are expected to be present in the transformed table, but
-        may not exist in the table before preprocessing. Regarding the lifetime of the transformed
-        table, it only exists during the validation step and is not stored in the `Validate` object
-        or used in subsequent validation steps.
-
-        Segmentation
-        ------------
-        The `segments=` argument allows for the segmentation of a validation step into multiple
-        segments. This is useful for applying the same validation step to different subsets of the
-        data. The segmentation can be done based on a single column or specific fields within a
-        column.
-
-        Providing a single column name will result in a separate validation step for each unique
-        value in that column. For example, if you have a column called `"region"` with values
-        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
-        region.
-
-        Alternatively, you can provide a tuple that specifies a column name and its corresponding
-        values to segment on. For example, if you have a column called `"date"` and you want to
-        segment on only specific dates, you can provide a tuple like
-        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
-        (i.e., no validation steps will be created for them).
+        Tolerance
+        ---------
+        The `tol=` parameter accepts several different formats to specify the acceptable deviation
+        from the expected percentage `p=`. The tolerance can be expressed as:
 
-
-
+        1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
+           For example, `tol=2` means the actual count can differ from the expected count by up to 2
+           units in either direction.
 
-
-
-
-        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+        2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
+           count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
+           45 to 55 (50 ± 10% of 50 = 50 ± 5).
 
-
-
-
+        3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
+           bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
+           1 unit below or 3 units above the expected count.
 
-
-
-
-        identify issues within specific segments.
+        4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
+           and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
+           lower bound is 5% below and the upper bound is 15% above the expected count.
 
-
-
-
-        `"segment"` through use of `pre=` and then use that column for segmentation.
+        When using a single value (integer or float), the tolerance is applied symmetrically in both
+        directions. When using a tuple, you can specify asymmetric tolerances where the lower and
+        upper bounds differ.
 
         Thresholds
         ----------
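As an illustration of the four documented formats, the helper below derives absolute lower/upper bounds from an expected count. It is a sketch of the documented semantics only, not the package's internal `_derive_bounds()` (whose behavior the diff only hints at via `bound_finder`):

```python
def derive_bounds(expected: int, tol) -> tuple[float, float]:
    """Illustrative only: resolve a `tol=` spec into (lower, upper) counts."""
    lo, hi = tol if isinstance(tol, tuple) else (tol, tol)
    # Floats are proportions of the expected count; ints are absolute units
    lo = lo * expected if isinstance(lo, float) else lo
    hi = hi * expected if isinstance(hi, float) else hi
    return expected - lo, expected + hi

print(derive_bounds(50, 0.1))           # (45.0, 55.0)  relative, symmetric
print(derive_bounds(50, 2))             # (48, 52)      absolute, symmetric
print(derive_bounds(50, (1, 3)))        # (49, 53)      absolute, asymmetric
print(derive_bounds(50, (0.05, 0.15)))  # (47.5, 57.5)  relative, asymmetric
```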
@@ -9851,8 +9893,8 @@ class Validate:
        import pointblank as pb
        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
        ```
-        For the examples here, we'll use a simple Polars DataFrame with three
-
+        For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
+        and `c`) that have different percentages of Null values. The table is shown below:
 
         ```{python}
         import pointblank as pb
@@ -9860,56 +9902,133 @@ class Validate:
 
         tbl = pl.DataFrame(
             {
-                "
-                "
-                "
+                "a": [1, 2, 3, 4, 5, 6, 7, 8],
+                "b": [1, None, 3, None, 5, None, 7, None],
+                "c": [None, None, None, None, None, None, 1, 2],
             }
         )
 
         pb.preview(tbl)
         ```
 
-        Let's validate that
-        determine if this validation had any failing test units (there are four test units, one for
-        each row). A failing test units means that a given row is not distinct from every other row.
+        Let's validate that column `a` has 0% Null values (i.e., no Null values at all).
 
         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_pct_null(columns="a", p=0.0)
             .interrogate()
         )
 
         validation
         ```
 
-
-        table
+        Printing the `validation` object shows the validation table in an HTML viewing environment.
+        The validation table shows the single entry that corresponds to the validation step created
+        by using `col_pct_null()`. The validation passed since column `a` has no Null values.
 
-
-        using columns `col_2` and `col_3` for the next validation.
+        Now, let's check that column `b` has exactly 50% Null values.
 
         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_pct_null(columns="b", p=0.5)
             .interrogate()
         )
 
         validation
         ```
 
-
-
-
-
-
+        This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
+
+        Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
+        we'll check if it's approximately 70% Null with a tolerance of 10%.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="c", p=0.70, tol=0.10)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This validation passes because the actual percentage (75%) falls within the acceptable
+        range of 60% to 80% (70% ± 10%).
+
+        The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
+        different ways to specify tolerance using column `b`, which has exactly 50% Null values
+        (4 out of 8 values).
+
+        *Using an absolute tolerance (integer)*: Specify the exact number of rows that can
+        deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=1)  # Expect 3 Nulls, allow ±1 (range: 2-4)
+            .interrogate()
+        )
+
+        validation
+        ```
 
+        This passes because column `b` has 4 Null values, which falls within the acceptable range
+        of 2 to 4 (3 ± 1).
+
+        *Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
+        expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=0.25)  # Expect 3 Nulls, allow ±25% (range: 2.25-3.75)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because the 4 Null values fall within the acceptable range (3 ± 0.75 calculates
+        to 2.25 to 3.75, which rounds down to 2 to 3 rows).
+
+        *Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
+        upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
+        to 2 rows above the expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.25, tol=(0, 2))  # Expect 2 Nulls, allow -0/+2 (range: 2-4)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because 4 Null values falls within the acceptable range of 2 to 4.
+
+        *Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
+        bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
+        expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3))  # Expect 3 Nulls, allow -10%/+30%
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because the 4 Null values fall within the acceptable range (3 - 0.3 to 3 + 0.9
+        calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
+        """
         assertion_type = _get_fn_name()
 
-
-        # TODO: add check for segments
-        # _check_segments(segments=segments)
+        _check_column(column=columns)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")
@@ -9918,31 +10037,38 @@ class Validate:
             self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
         )
 
-
-
+        # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+        # resolve the columns
+        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+            columns = col(columns)
 
-        #
+        # If `columns` is Column value or a string, place it in a list for iteration
+        if isinstance(columns, (Column, str)):
+            columns = [columns]
 
         # Determine brief to use (global or local) and transform any shorthands of `brief=`
         brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
 
-
-            assertion_type=assertion_type,
-            column=columns_subset,
-            pre=pre,
-            segments=segments,
-            thresholds=thresholds,
-            actions=actions,
-            brief=brief,
-            active=active,
-        )
-
-        self._add_validation(validation_info=val_info)
+        bound_finder: Callable[[int], AbsoluteBounds] = partial(_derive_bounds, tol=tol)
 
-
-
-
-
+        # Iterate over the columns and create a validation step for each
+        for column in columns:
+            val_info = _ValidationInfo(
+                assertion_type=assertion_type,
+                column=column,
+                values={"p": p, "bound_finder": bound_finder},
+                thresholds=thresholds,
+                actions=actions,
+                brief=brief,
+                active=active,
+            )
+
+            self._add_validation(validation_info=val_info)
+
+        return self
+
+    def rows_distinct(
+        self,
         columns_subset: str | list[str] | None = None,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
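Because one validation step is generated per column, a single `col_pct_null()` call fans out over a column list; a short sketch grounded in the docstring's example data:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame(
    {
        "b": [1, None, 3, None, 5, None, 7, None],        # 50% Null
        "c": [None, None, None, None, None, None, 1, 2],  # 75% Null
    }
)

# One call, two validation steps: one per column in the list
validation = (
    pb.Validate(data=tbl)
    .col_pct_null(columns=["b", "c"], p=0.5, tol=0.25)
    .interrogate()
)
```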
@@ -9952,19 +10078,19 @@ class Validate:
         active: bool = True,
     ) -> Validate:
         """
-        Validate whether
+        Validate whether rows in the table are distinct.
 
-        The `
-
-
-        after any `pre=` mutation has been applied). A subset of columns can be specified for the
-        completeness check. If no subset is provided, all columns in the table will be used.
+        The `rows_distinct()` method checks whether rows in the table are distinct. This validation
+        will operate over the number of test units that is equal to the number of rows in the table
+        (determined after any `pre=` mutation has been applied).
 
         Parameters
         ----------
         columns_subset
-            A single column or a list of columns to use as a subset for the
-            `None
+            A single column or a list of columns to use as a subset for the distinct comparison.
+            If `None`, then all columns in the table will be used for the comparison. If multiple
+            columns are supplied, the distinct comparison will be made over the combination of
+            values in those columns.
         pre
             An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
@@ -10101,48 +10227,48 @@ class Validate:
 
         tbl = pl.DataFrame(
             {
-                "col_1": ["a",
-                "col_2": ["a", "a", "c",
-                "col_3": ["a", "a", "d",
+                "col_1": ["a", "b", "c", "d"],
+                "col_2": ["a", "a", "c", "d"],
+                "col_3": ["a", "a", "d", "e"],
             }
         )
 
         pb.preview(tbl)
         ```
 
-        Let's validate that the rows in the table are
+        Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll
         determine if this validation had any failing test units (there are four test units, one for
-        each row). A failing test units means that a given row is not
-        one missing value).
+        each row). A failing test unit means that a given row is not distinct from every other row.
 
         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .rows_distinct()
             .interrogate()
         )
 
         validation
         ```
 
-        From this validation table we see that there are
-
+        From this validation table we see that there are no failing test units. All rows in the
+        table are distinct from one another.
 
-        We can also use a subset of columns to determine
+        We can also use a subset of columns to determine distinctness. Let's specify the subset
         using columns `col_2` and `col_3` for the next validation.
 
         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .rows_distinct(columns_subset=["col_2", "col_3"])
             .interrogate()
         )
 
         validation
         ```
 
-        The validation table reports
-        values in
+        The validation table reports two failing test units. The first and second rows are
+        duplicated when considering only the values in columns `col_2` and `col_3`. There's only
+        one set of duplicates but there are two failing test units since each row is compared to all
         others.
         """
@@ -10159,8 +10285,8 @@ class Validate:
             self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
         )
 
-        if columns_subset is not None and isinstance(columns_subset, str):
-            columns_subset = [columns_subset]
+        if columns_subset is not None and isinstance(columns_subset, str):
+            columns_subset = [columns_subset]
 
         # TODO: incorporate Column object
@@ -10182,13 +10308,9 @@ class Validate:
 
         return self
 
-    def
+    def rows_complete(
         self,
-        prompt: str,
-        model: str,
         columns_subset: str | list[str] | None = None,
-        batch_size: int = 1000,
-        max_concurrent: int = 3,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -10197,66 +10319,35 @@ class Validate:
         active: bool = True,
     ) -> Validate:
         """
-        Validate
-
-        The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
-        based on natural language criteria. Similar to other Pointblank validation methods, this
-        generates binary test results (pass/fail) that integrate seamlessly with the standard
-        reporting framework.
-
-        Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
-        instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
-        Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
-        specify a subset of columns for evaluation using `columns_subset=`.
-
-        The system automatically combines your validation criteria from the `prompt=` parameter with
-        the necessary technical context, data formatting instructions, and response structure
-        requirements. This is all so you only need to focus on describing your validation logic in
-        plain language.
+        Validate whether row data are complete by having no missing values.
 
-
-
-
-
+        The `rows_complete()` method checks whether rows in the table are complete. Completeness
+        of a row means that there are no missing values within the row. This validation will operate
+        over the number of test units that is equal to the number of rows in the table (determined
+        after any `pre=` mutation has been applied). A subset of columns can be specified for the
+        completeness check. If no subset is provided, all columns in the table will be used.
 
         Parameters
         ----------
-        prompt
-            A natural language description of the validation criteria. This prompt should clearly
-            describe what constitutes valid vs invalid rows. Some examples:
-            `"Each row should contain a valid email address and a realistic person name"`,
-            `"Values should indicate positive sentiment"`,
-            `"The description should mention a country name"`.
         columns_subset
-            A single column or list of columns to
-
-            so try to include only the columns necessary for the validation.
-        model
-            The model to be used. This should be in the form of `provider:model` (e.g.,
-            `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
-            `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
-            the provider. Model names are subject to change so consult the provider's documentation
-            for the most up-to-date model names.
-        batch_size
-            Number of rows to process in each batch. Larger batches are more efficient but may hit
-            API limits. Default is `1000`.
-        max_concurrent
-            Maximum number of concurrent API requests. Higher values speed up processing but may
-            hit rate limits. Default is `3`.
+            A single column or a list of columns to use as a subset for the completeness check. If
+            `None` (the default), then all columns in the table will be used.
         pre
             An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
         segments
             An optional directive on segmentation, which serves to split a validation step into
             multiple (one step per segment). Can be a single column name, a tuple that specifies a
             column name and its corresponding values to segment on, or a combination of both
-            (provided as a list).
+            (provided as a list). Read the *Segmentation* section for usage information.
         thresholds
             Set threshold failure levels for reporting and reacting to exceedences of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
             `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
-            be set locally and global thresholds (if any) will take effect.
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
         actions
             Optional actions to take when the validation step meets or exceeds any set threshold
             levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
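A brief sketch combining `columns_subset=` with a `pre=` filter, per the parameter docs above (the table and the filter predicate are invented for illustration):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame(
    {
        "x": [1, None, 3],
        "y": ["a", "b", None],
        "grp": ["n", "s", "n"],
    }
)

validation = (
    pb.Validate(data=tbl)
    # Completeness checked only over `x` and `y`, after filtering to group "n"
    .rows_complete(
        columns_subset=["x", "y"],
        pre=lambda df: df.filter(pl.col("grp") == "n"),
    )
    .interrogate()
)
```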
@@ -10277,152 +10368,88 @@ class Validate:
         Validate
             The `Validate` object with the added validation step.
 
-
-
-        The `
-
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
 
-
-
-
-
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Note that you can refer to
+        columns via `columns_subset=` that are expected to be present in the transformed table, but
+        may not exist in the table before preprocessing. Regarding the lifetime of the transformed
+        table, it only exists during the validation step and is not stored in the `Validate` object
+        or used in subsequent validation steps.
 
-
-
-
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
 
-
-
-
-
-        method for handling API keys.
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
 
-
-
-
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
 
-
-
-        `OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
-        file. An `.env` file might look like this:
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are both valid:
 
-        ```plaintext
-        ANTHROPIC_API_KEY="your_anthropic_api_key_here"
-        OPENAI_API_KEY="your_openai_api_key_here"
         ```
+        # Segments from all unique values in the `region` column
+        # and specific dates in the `date` column
+        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
 
-
-
+        # Segments from all unique values in the `region` and `date` columns
+        segments=["region", "date"]
+        ```
 
-
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
 
-
-
-
-
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
 
-
-
-        The
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
 
-
-
-
-        3. json conversion: each batch of unique rows is converted to JSON format for the LLM
-        4. prompt construction: the user prompt is embedded in a structured system prompt
-        5. llm processing: each batch is sent to the LLM for analysis
-        6. response parsing: LLM responses are parsed to extract validation results
-        7. result projection: results are mapped back to all original rows using row signatures
-        8. result aggregation: results from all batches are combined
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` to `1`),
+        or, the absolute number of failing test units (as integer that's `1` or greater).
 
-
-        LLM calls. When multiple rows have identical values in the selected columns, only one
-        representative row is validated, and the result is applied to all matching rows. This can
-        dramatically reduce API costs and processing time for datasets with repetitive patterns.
+        Thresholds can be defined using one of these input schemes:
 
-
-
-
-        "rows": [
-            {"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
-            {"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
-        ]
-        }
-        ```
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
 
-
-
-        [
-            {"index": 0, "result": true},
-            {"index": 1, "result": false}
-        ]
-        ```
-
-        Prompt Design Tips
-        ------------------
-        For best results, design prompts that are:
-
-        - boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
-        - specific: clearly define what makes a row valid/invalid
-        - unambiguous: avoid subjective language that could be interpreted differently
-        - context-aware: include relevant business rules or domain knowledge
-        - example-driven: consider providing examples in the prompt when helpful
-
-        **Critical**: Prompts must be designed so the LLM can determine whether each row passes or
-        fails the validation criteria. The system expects binary validation responses, so avoid
-        open-ended questions or prompts that might generate explanatory text instead of clear
-        pass/fail judgments.
-
-        Good prompt examples:
-
-        - "Each row should contain a valid email address in the 'email' column and a non-empty name
-        in the 'name' column"
-        - "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
-        etc.)"
-        - "Product descriptions should mention at least one technical specification"
-
-        Poor prompt examples (avoid these):
-
-        - "What do you think about this data?" (too open-ended)
-        - "Describe the quality of each row" (asks for description, not validation)
-        - "How would you improve this data?" (asks for suggestions, not pass/fail)
-
-        Performance Considerations
-        --------------------------
-        AI validation is significantly slower than traditional validation methods due to API calls
-        to LLM providers. However, performance varies dramatically based on data characteristics:
-
-        **High Memoization Scenarios** (seconds to minutes):
-
-        - data with many duplicate rows in the selected columns
-        - low cardinality data (repeated patterns)
-        - small number of unique row combinations
-
-        **Low Memoization Scenarios** (minutes to hours):
-
-        - high cardinality data with mostly unique rows
-        - large datasets with few repeated patterns
-        - all or most rows requiring individual LLM evaluation
-
-        The row signature memoization optimization can reduce processing time significantly when
-        data has repetitive patterns. For datasets where every row is unique, expect longer
-        processing times similar to validating each row individually.
-
-        **Strategies to Reduce Processing Time**:
-
-        - test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
-        and use `pre=sample_1000` to validate on smaller samples
-        - filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
-        and use `pre=active_only` to focus on a specific subset
-        - optimize column selection: use `columns_subset=` to include only the columns necessary
-        for validation
-        - start with smaller batches: begin with `batch_size=100` for testing, then increase
-        gradually
-        - reduce concurrency: lower `max_concurrent=1` if hitting rate limits
-        - use faster/cheaper models: consider using smaller or more efficient models for initial
-        testing before switching to more capable models
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+        set; you're free to set any combination of them.
 
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
 
         Examples
         --------
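The threshold input schemes enumerated above can be written as any of the following; a sketch with arbitrary values (assuming `pb.Thresholds` accepts `warning=`, `error=`, and `critical=` keywords, as the docstring's level names suggest):

```python
import pointblank as pb

# Equivalent ways to express 'warning' / 'error' / 'critical' levels
thresholds_cls = pb.Thresholds(warning=0.05, error=0.10, critical=0.15)
thresholds_tup = (0.05, 0.10, 0.15)
thresholds_dct = {"warning": 0.05, "error": 0.10, "critical": 0.15}
thresholds_one = 5  # absolute count of failing test units, 'warning' only
```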
@@ -10432,139 +10459,84 @@ class Validate:
         import pointblank as pb
         pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
         ```
-
-
-        custom thresholds and actions.
-
-        **Basic AI validation example:**
-
-        This first example shows a simple validation scenario where we want to check that customer
-        records have both valid email addresses and non-empty names. Notice how we use
-        `columns_subset=` to focus only on the relevant columns, which improves both performance
-        and cost-effectiveness.
+        For the examples here, we'll use a simple Polars DataFrame with three string columns
+        (`col_1`, `col_2`, and `col_3`). The table is shown below:
 
-        ```python
+        ```{python}
         import pointblank as pb
         import polars as pl
 
-
-
-
-
-
-
+        tbl = pl.DataFrame(
+            {
+                "col_1": ["a", None, "c", "d"],
+                "col_2": ["a", "a", "c", None],
+                "col_3": ["a", "a", "d", None],
+            }
+        )
 
-
+        pb.preview(tbl)
+        ```
+
+        Let's validate that the rows in the table are complete with `rows_complete()`. We'll
+        determine if this validation had any failing test units (there are four test units, one for
+        each row). A failing test unit means that a given row is not complete (i.e., has at least
+        one missing value).
+
+        ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
-                prompt="Each row should have a valid email address and a non-empty name",
-                columns_subset=["email", "name"],  # Only check these columns
-                model="openai:gpt-4o-mini",
-            )
+            .rows_complete()
             .interrogate()
         )
 
         validation
         ```
 
-
-
-        empty name field. The validation results will show 2 out of 3 rows failing the criteria.
-
-        **Advanced example with custom thresholds:**
-
-        This more sophisticated example demonstrates how to use AI validation with custom thresholds
-        and actions. Here we're validating phone number formats to ensure they include area codes,
-        which is a common data quality requirement for customer contact information.
+        From this validation table we see that there are two failing test units. This is because
+        two rows in the table have at least one missing value (the second row and the last row).
 
-
-
-        "customer_id": [1, 2, 3, 4, 5],
-        "name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
-        "phone_number": [
-            "(555) 123-4567",  # Valid with area code
-            "555-987-6543",    # Valid with area code
-            "123-4567",        # Missing area code
-            "(800) 555-1234",  # Valid with area code
-            "987-6543"         # Missing area code
-        ]
-        })
+        We can also use a subset of columns to determine completeness. Let's specify the subset
|
|
10499
|
+
using columns `col_2` and `col_3` for the next validation.
|
|
10493
10500
|
|
|
10501
|
+
```{python}
|
|
10494
10502
|
validation = (
|
|
10495
|
-
pb.Validate(data=
|
|
10496
|
-
.
|
|
10497
|
-
prompt="Do all the phone numbers include an area code?",
|
|
10498
|
-
columns_subset="phone_number", # Only check the `phone_number` column
|
|
10499
|
-
model="openai:gpt-4o",
|
|
10500
|
-
batch_size=500,
|
|
10501
|
-
max_concurrent=5,
|
|
10502
|
-
thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
|
|
10503
|
-
actions=pb.Actions(error="Too many phone numbers missing area codes.")
|
|
10504
|
-
)
|
|
10503
|
+
pb.Validate(data=tbl)
|
|
10504
|
+
.rows_complete(columns_subset=["col_2", "col_3"])
|
|
10505
10505
|
.interrogate()
|
|
10506
10506
|
)
|
|
10507
|
+
|
|
10508
|
+
validation
|
|
10507
10509
|
```
|
|
10508
10510
|
|
|
10509
|
-
|
|
10510
|
-
|
|
10511
|
-
|
|
10512
|
-
various phone number formats and determine whether they include area codes.
|
|
10511
|
+
The validation table reports a single failing test units. The last row contains missing
|
|
10512
|
+
values in both the `col_2` and `col_3` columns.
|
|
10513
|
+
others.
|
|
10513
10514
|
"""

  assertion_type = _get_fn_name()

- # Validation of inputs
- if not isinstance(prompt, str) or not prompt.strip():
-     raise ValueError("prompt must be a non-empty string")
-
- # Parse the provider and model name from the `model=` argument
- try:
-     provider, model_name = model.split(sep=":", maxsplit=1)
- except ValueError:
-     raise ValueError(f"Model must be in format 'provider:model_name', got: {model}")
-
- # Error if an unsupported provider is used
- if provider not in MODEL_PROVIDERS:
-     raise ValueError(
-         f"Unsupported provider: {provider}. Supported providers are {MODEL_PROVIDERS}."
-     )
-
- # Ensure that `batch_size` and `max_concurrent` are positive integers
- if not isinstance(batch_size, int) or batch_size < 1:
-     raise ValueError("batch_size must be a positive integer")
- if not isinstance(max_concurrent, int) or max_concurrent < 1:
-     raise ValueError("max_concurrent must be a positive integer")
-
  _check_pre(pre=pre)
+ # TODO: add check for segments
+ # _check_segments(segments=segments)
  _check_thresholds(thresholds=thresholds)
  _check_boolean_input(param=active, param_name="active")

- # Promote a single column given as a string to a list
- if columns_subset is not None and isinstance(columns_subset, str):
-     columns_subset = [columns_subset]
-
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
  thresholds = (
      self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
  )

+ if columns_subset is not None and isinstance(columns_subset, str):  # pragma: no cover
+     columns_subset = [columns_subset]  # pragma: no cover
+
+ # TODO: incorporate Column object
+
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)

- # Package up the AI-specific parameters as a dictionary for later use
- ai_config = {
-     "prompt": prompt,
-     "llm_provider": provider,
-     "llm_model": model_name,
-     "batch_size": batch_size,
-     "max_concurrent": max_concurrent,
- }
-
  val_info = _ValidationInfo(
      assertion_type=assertion_type,
      column=columns_subset,
-     values=ai_config,
      pre=pre,
      segments=segments,
      thresholds=thresholds,
@@ -10577,66 +10549,81 @@ class Validate:

  return self

- def col_schema_match(
+ def prompt(
      self,
-     schema: Schema,
-     complete: bool = True,
-     in_order: bool = True,
-     case_sensitive_colnames: bool = True,
-     case_sensitive_dtypes: bool = True,
-     full_match_dtypes: bool = True,
+     prompt: str,
+     model: str,
+     columns_subset: str | list[str] | None = None,
+     batch_size: int = 1000,
+     max_concurrent: int = 3,
      pre: Callable | None = None,
+     segments: SegmentSpec | None = None,
      thresholds: int | float | bool | tuple | dict | Thresholds = None,
      actions: Actions | None = None,
      brief: str | bool | None = None,
      active: bool = True,
  ) -> Validate:
      """
-     Do columns in the table (and their types) match a predefined schema?
+     Validate rows using AI/LLM-powered analysis.

-     The `col_schema_match()` method works in conjunction with an object generated by the
-     [`Schema`](`pointblank.Schema`) class. That class object is the expectation for the actual
-     schema of the target table. The validation step operates over a single test unit, which is
-     whether the schema matches that of the table (within the constraints enforced by the
-     `complete=`, and `in_order=` options).
+     The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
+     based on natural language criteria. Similar to other Pointblank validation methods, this
+     generates binary test results (pass/fail) that integrate seamlessly with the standard
+     reporting framework.
+
+     Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
+     instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
+     Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
+     specify a subset of columns for evaluation using `columns_subset=`.
+
+     The system automatically combines your validation criteria from the `prompt=` parameter with
+     the necessary technical context, data formatting instructions, and response structure
+     requirements. This is all so you only need to focus on describing your validation logic in
+     plain language.
+
+     Each row becomes a test unit that either passes or fails the validation criteria, producing
+     the familiar True/False results that appear in Pointblank validation reports. This method
+     is particularly useful for complex validation rules that are difficult to express with
+     traditional validation methods, such as semantic checks, context-dependent validation, or
+     subjective quality assessments.

      Parameters
      ----------
-     schema
-         A `Schema` object that represents the expected schema of the table. This object is
-         generated by the [`Schema`](`pointblank.Schema`) class.
-     complete
-         Should the schema match be complete? If `True`, then the target table must have all
-         columns specified in the schema. If `False`, then the table can have additional columns
-         not in the schema (i.e., the schema is a subset of the target table's columns).
-     in_order
-         Should the schema match be in order? If `True`, then the columns in the schema must
-         appear in the same order as they do in the target table. If `False`, then the order of
-         columns in the schema and the target table can differ.
-     case_sensitive_colnames
-         Should the schema match be case-sensitive with regard to column names? If `True`, then
-         the column names in the schema and the target table must match exactly. If `False`, then
-         the column names are compared in a case-insensitive manner.
-     case_sensitive_dtypes
-         Should the schema match be case-sensitive with regard to column data types? If `True`,
-         then the column data types in the schema and the target table must match exactly. If
-         `False`, then the column data types are compared in a case-insensitive manner.
-     full_match_dtypes
-         Should the schema match require a full match of data types? If `True`, then the column
-         data types in the schema and the target table must match exactly. If `False` then
-         substring matches are allowed, so a schema data type of `Int` would match a target table
-         data type of `Int64`.
+     prompt
+         A natural language description of the validation criteria. This prompt should clearly
+         describe what constitutes valid vs invalid rows. Some examples:
+         `"Each row should contain a valid email address and a realistic person name"`,
+         `"Values should indicate positive sentiment"`,
+         `"The description should mention a country name"`.
+     columns_subset
+         A single column or list of columns to include in the validation. If `None`, all columns
+         will be included. Specifying fewer columns can improve performance and reduce API costs,
+         so try to include only the columns necessary for the validation.
+     model
+         The model to be used. This should be in the form of `provider:model` (e.g.,
+         `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
+         `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
+         the provider. Model names are subject to change so consult the provider's documentation
+         for the most up-to-date model names.
+     batch_size
+         Number of rows to process in each batch. Larger batches are more efficient but may hit
+         API limits. Default is `1000`.
+     max_concurrent
+         Maximum number of concurrent API requests. Higher values speed up processing but may
+         hit rate limits. Default is `3`.
      pre
          An optional preprocessing function or lambda to apply to the data table during
          interrogation. This function should take a table as input and return a modified table.
-         Have a look at the *Preprocessing* section for more information on how to use this
-         argument.
+     segments
+         An optional directive on segmentation, which serves to split a validation step into
+         multiple (one step per segment). Can be a single column name, a tuple that specifies a
+         column name and its corresponding values to segment on, or a combination of both
+         (provided as a list).
      thresholds
          Set threshold failure levels for reporting and reacting to exceedances of the levels.
          The thresholds are set at the step level and will override any global thresholds set in
          `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
-         be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
-         section for information on how to set threshold levels.
+         be set locally and global thresholds (if any) will take effect.
      actions
          Optional actions to take when the validation step meets or exceeds any set threshold
          levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
@@ -10657,154 +10644,314 @@ class Validate:
  Validate
      The `Validate` object with the added validation step.

- Preprocessing
- -------------
- The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
- table during interrogation. This function should take a table as input and return a modified
- table. This is useful for performing any necessary transformations or filtering on the data
- before the validation step is applied.
+ Constructing the `model` Argument
+ ---------------------------------
+ The `model=` argument should be constructed using the provider and model name separated by a
+ colon (`provider:model`). The provider text can be any of:

-
-
-
-
+ - `"anthropic"` (Anthropic)
+ - `"openai"` (OpenAI)
+ - `"ollama"` (Ollama)
+ - `"bedrock"` (Amazon Bedrock)

- Thresholds
- ----------
- The `thresholds=` parameter is used to set the failure-condition levels for the validation
- step. If they are set here at the step level, these thresholds will override any thresholds
- set at the global level in `Validate(thresholds=...)`.
+ The model name should be the specific model to be used from the provider. Model names are
+ subject to change so consult the provider's documentation for the most up-to-date model
+ names.
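A small sketch of how a `provider:model` string comes apart (plain string handling, mirroring the `model.split(sep=":", maxsplit=1)` call shown elsewhere in this diff; the Ollama tag is an assumed example):

```python
# the "provider:model" convention splits on the first colon only, so model
# names that themselves contain colons (e.g., an Ollama tag) stay intact
provider, model_name = "ollama:llama3:8b".split(":", maxsplit=1)
print(provider)    # ollama
print(model_name)  # llama3:8b
```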

- There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
- can either be set as a proportion failing of all test units (a value between `0` to `1`),
- or, the absolute number of failing test units (as integer that's `1` or greater).
+ Notes on Authentication
+ -----------------------
+ API keys are automatically loaded from environment variables or `.env` files and are **not**
+ stored in the validation object for security reasons. You should consider using a secure
+ method for handling API keys.

- Thresholds can be defined using one of these input schemes:
+ One way to do this is to load the API key from an environment variable and retrieve it using
+ the `os` module (specifically the `os.getenv()` function). Places to store the API key might
+ include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
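A minimal sketch of that `os.getenv()` approach (the key name comes from the provider list above; failing early when the key is absent is an added convention, not library behavior):

```python
import os

# read a provider key from the environment (assumes the variable was
# exported in e.g. ~/.zshrc) and fail early if it is missing
api_key = os.getenv("ANTHROPIC_API_KEY")
if api_key is None:
    raise RuntimeError("ANTHROPIC_API_KEY is not set in the environment")
```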

- 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
-    thresholds)
- 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
-    the 'error' level, and position `2` is the 'critical' level
- 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
-    'critical'
- 4. a single integer/float value denoting absolute number or fraction of failing test units
-    for the 'warning' level only
+ Another solution is to store one or more model provider API keys in an `.env` file (in the
+ root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
+ `OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
+ file. An `.env` file might look like this:

- If the number of failing test units exceeds set thresholds, the validation step will be
- marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
- set, you're free to set any combination of them.
+ ```plaintext
+ ANTHROPIC_API_KEY="your_anthropic_api_key_here"
+ OPENAI_API_KEY="your_openai_api_key_here"
+ ```

- Aside from reporting failure conditions, thresholds can be used to determine the actions to
- take for each level of failure (using the `actions=` parameter).
+ There's no need to have the `python-dotenv` package installed when using `.env` files in
+ this way.

- Examples
- --------
- ```{python}
- #| echo: false
- #| output: false
- import pointblank as pb
- pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
- ```
+ **Provider-specific setup**:

-
-
+ - **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
+ - **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
+ - **Ollama**: no API key required, just ensure Ollama is running locally
+ - **Bedrock**: configure AWS credentials through standard AWS methods

-
-
-
+ AI Validation Process
+ ---------------------
+ The AI validation process works as follows:

-
-
-
-
-
-
-
+ 1. data batching: the data is split into batches of the specified size
+ 2. row deduplication: duplicate rows (based on selected columns) are identified and only
+    unique combinations are sent to the LLM for analysis
+ 3. json conversion: each batch of unique rows is converted to JSON format for the LLM
+ 4. prompt construction: the user prompt is embedded in a structured system prompt
+ 5. llm processing: each batch is sent to the LLM for analysis
+ 6. response parsing: LLM responses are parsed to extract validation results
+ 7. result projection: results are mapped back to all original rows using row signatures
+ 8. result aggregation: results from all batches are combined

-
-
+ **Performance Optimization**: the process uses row signature memoization to avoid redundant
+ LLM calls. When multiple rows have identical values in the selected columns, only one
+ representative row is validated, and the result is applied to all matching rows. This can
+ dramatically reduce API costs and processing time for datasets with repetitive patterns.
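A rough sketch of what that memoization amounts to (`validate_with_llm` is a hypothetical stand-in for the real batched LLM call):

```python
def validate_with_llm(row: tuple) -> bool:
    # stand-in for the real LLM call: here, "valid" means no missing values
    return all(v is not None for v in row)

rows = [("a", "x"), ("a", "x"), ("b", None), ("a", "x")]

# identical rows collapse to one signature, so one verdict covers them all
memo: dict[tuple, bool] = {}
for row in rows:
    if row not in memo:
        memo[row] = validate_with_llm(row)

results = [memo[row] for row in rows]  # projected back onto every original row
print(results)  # [True, True, False, True]
```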

- Let's validate that the columns in the table match a predefined schema. A schema can be
- defined using the [`Schema`](`pointblank.Schema`) class.
+ The LLM receives data in this JSON format:

- ```
-
-
-
+ ```json
+ {
+     "columns": ["col1", "col2", "col3"],
+     "rows": [
+         {"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
+         {"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
+     ]
+ }
  ```

-
-
-
-
+ The LLM returns validation results in this format:
+ ```json
+ [
+     {"index": 0, "result": true},
+     {"index": 1, "result": false}
+ ]
  ```
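A hedged sketch of consuming that response payload (field names are taken from the example payloads above; the real parsing lives inside the library):

```python
import json

response = '[{"index": 0, "result": true}, {"index": 1, "result": false}]'

# map each row index back to its pass/fail verdict
verdicts = {item["index"]: item["result"] for item in json.loads(response)}
print(verdicts)  # {0: True, 1: False}
```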

-
-
-
+ Prompt Design Tips
+ ------------------
+ For best results, design prompts that are:

-
-
-
-
-
- )
+ - boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
+ - specific: clearly define what makes a row valid/invalid
+ - unambiguous: avoid subjective language that could be interpreted differently
+ - context-aware: include relevant business rules or domain knowledge
+ - example-driven: consider providing examples in the prompt when helpful

-
-
+ **Critical**: Prompts must be designed so the LLM can determine whether each row passes or
+ fails the validation criteria. The system expects binary validation responses, so avoid
+ open-ended questions or prompts that might generate explanatory text instead of clear
+ pass/fail judgments.

- The validation table shows that the schema matches the table. The single test unit passed
- since the table columns and their types match the schema.
- """
+ Good prompt examples:

- assertion_type = _get_fn_name()
+ - "Each row should contain a valid email address in the 'email' column and a non-empty name
+   in the 'name' column"
+ - "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
+   etc.)"
+ - "Product descriptions should mention at least one technical specification"

- _check_pre(pre=pre)
- _check_thresholds(thresholds=thresholds)
- _check_boolean_input(param=active, param_name="active")
- _check_boolean_input(param=complete, param_name="complete")
- _check_boolean_input(param=in_order, param_name="in_order")
- _check_boolean_input(param=case_sensitive_colnames, param_name="case_sensitive_colnames")
- _check_boolean_input(param=case_sensitive_dtypes, param_name="case_sensitive_dtypes")
- _check_boolean_input(param=full_match_dtypes, param_name="full_match_dtypes")
+ Poor prompt examples (avoid these):

- # Determine threshold to use (global or local) and normalize a local `thresholds=` value
- thresholds = (
-     self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
- )
+ - "What do you think about this data?" (too open-ended)
+ - "Describe the quality of each row" (asks for description, not validation)
+ - "How would you improve this data?" (asks for suggestions, not pass/fail)

- # Package up the `schema=` and boolean params into a dictionary for later interrogation
- values = {
-     "schema": schema,
-     "complete": complete,
-     "in_order": in_order,
-     "case_sensitive_colnames": case_sensitive_colnames,
-     "case_sensitive_dtypes": case_sensitive_dtypes,
-     "full_match_dtypes": full_match_dtypes,
- }
+ Performance Considerations
+ --------------------------
+ AI validation is significantly slower than traditional validation methods due to API calls
+ to LLM providers. However, performance varies dramatically based on data characteristics:

- # Determine brief to use (global or local) and transform any shorthands of `brief=`
- brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+ **High Memoization Scenarios** (seconds to minutes):

- val_info = _ValidationInfo(
-     assertion_type=assertion_type,
-     values=values,
-     pre=pre,
-     thresholds=thresholds,
-     actions=actions,
-     brief=brief,
-     active=active,
- )
+ - data with many duplicate rows in the selected columns
+ - low cardinality data (repeated patterns)
+ - small number of unique row combinations

- self._add_validation(validation_info=val_info)
+ **Low Memoization Scenarios** (minutes to hours):

- return self
+ - high cardinality data with mostly unique rows
+ - large datasets with few repeated patterns
+ - all or most rows requiring individual LLM evaluation

- def row_count_match(
-     self,
-     count: int | FrameT | Any,
-     tol: Tolerance = 0,
-     inverse: bool = False,
+ The row signature memoization optimization can reduce processing time significantly when
+ data has repetitive patterns. For datasets where every row is unique, expect longer
+ processing times similar to validating each row individually.
+
+ **Strategies to Reduce Processing Time**:
+
+ - test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
+   and use `pre=sample_1000` to validate on smaller samples
+ - filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
+   and use `pre=active_only` to focus on a specific subset
+ - optimize column selection: use `columns_subset=` to include only the columns necessary
+   for validation
+ - start with smaller batches: begin with `batch_size=100` for testing, then increase
+   gradually
+ - reduce concurrency: lower `max_concurrent=1` if hitting rate limits
+ - use faster/cheaper models: consider using smaller or more efficient models for initial
+   testing before switching to more capable models
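Hedged sketches of the first two strategies from the list above (the `status` column is an assumed example):

```python
import polars as pl

# slice strategy: validate only the first 1000 rows
def sample_1000(df: pl.DataFrame) -> pl.DataFrame:
    return df.head(1000)

# filter strategy: validate only the rows that matter
def active_only(df: pl.DataFrame) -> pl.DataFrame:
    return df.filter(pl.col("status") == "active")  # assumes a "status" column
```

Either function can then be passed as `pre=sample_1000` or `pre=active_only` on a validation step.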
+
+ Examples
+ --------
+ ```{python}
+ #| echo: false
+ #| output: false
+ import pointblank as pb
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+ ```
+ The following examples demonstrate how to use AI validation for different types of data
+ quality checks. These examples show both basic usage and more advanced configurations with
+ custom thresholds and actions.
+
+ **Basic AI validation example:**
+
+ This first example shows a simple validation scenario where we want to check that customer
+ records have both valid email addresses and non-empty names. Notice how we use
+ `columns_subset=` to focus only on the relevant columns, which improves both performance
+ and cost-effectiveness.
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ # Sample data with email and name columns
+ tbl = pl.DataFrame({
+     "email": ["john@example.com", "invalid-email", "jane@test.org"],
+     "name": ["John Doe", "", "Jane Smith"],
+     "age": [25, 30, 35]
+ })
+
+ # Validate using AI
+ validation = (
+     pb.Validate(data=tbl)
+     .prompt(
+         prompt="Each row should have a valid email address and a non-empty name",
+         columns_subset=["email", "name"],  # Only check these columns
+         model="openai:gpt-4o-mini",
+     )
+     .interrogate()
+ )
+
+ validation
+ ```
+
+ In this example, the AI will identify that the second row fails validation because it has
+ both an invalid email format (`"invalid-email"`) and an empty name field. The validation
+ results will show 1 out of 3 rows failing the criteria.
+
+ **Advanced example with custom thresholds:**
+
+ This more sophisticated example demonstrates how to use AI validation with custom thresholds
+ and actions. Here we're validating phone number formats to ensure they include area codes,
+ which is a common data quality requirement for customer contact information.
+
+ ```python
+ customer_data = pl.DataFrame({
+     "customer_id": [1, 2, 3, 4, 5],
+     "name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
+     "phone_number": [
+         "(555) 123-4567",  # Valid with area code
+         "555-987-6543",    # Valid with area code
+         "123-4567",        # Missing area code
+         "(800) 555-1234",  # Valid with area code
+         "987-6543"         # Missing area code
+     ]
+ })
+
+ validation = (
+     pb.Validate(data=customer_data)
+     .prompt(
+         prompt="Do all the phone numbers include an area code?",
+         columns_subset="phone_number",  # Only check the `phone_number` column
+         model="openai:gpt-4o",
+         batch_size=500,
+         max_concurrent=5,
+         thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
+         actions=pb.Actions(error="Too many phone numbers missing area codes.")
+     )
+     .interrogate()
+ )
+ ```
+
+ This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
+ which exceeds all threshold levels. The validation will trigger the specified error action
+ since the failure rate (40%) is above the error threshold (20%). The AI can recognize
+ various phone number formats and determine whether they include area codes.
+ """
+
+ assertion_type = _get_fn_name()
+
+ # Validation of inputs
+ if not isinstance(prompt, str) or not prompt.strip():
+     raise ValueError("prompt must be a non-empty string")
+
+ # Parse the provider and model name from the `model=` argument
+ try:
+     provider, model_name = model.split(sep=":", maxsplit=1)
+ except ValueError:
+     raise ValueError(f"Model must be in format 'provider:model_name', got: {model}")
+
+ # Error if an unsupported provider is used
+ if provider not in MODEL_PROVIDERS:
+     raise ValueError(
+         f"Unsupported provider: {provider}. Supported providers are {MODEL_PROVIDERS}."
+     )
+
+ # Ensure that `batch_size` and `max_concurrent` are positive integers
+ if not isinstance(batch_size, int) or batch_size < 1:
+     raise ValueError("batch_size must be a positive integer")
+ if not isinstance(max_concurrent, int) or max_concurrent < 1:
+     raise ValueError("max_concurrent must be a positive integer")
+
+ _check_pre(pre=pre)
+ _check_thresholds(thresholds=thresholds)
+ _check_boolean_input(param=active, param_name="active")
+
+ # Promote a single column given as a string to a list
+ if columns_subset is not None and isinstance(columns_subset, str):
+     columns_subset = [columns_subset]
+
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+ thresholds = (
+     self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+ )
+
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+ # Package up the AI-specific parameters as a dictionary for later use
+ ai_config = {
+     "prompt": prompt,
+     "llm_provider": provider,
+     "llm_model": model_name,
+     "batch_size": batch_size,
+     "max_concurrent": max_concurrent,
+ }
+
+ val_info = _ValidationInfo(
+     assertion_type=assertion_type,
+     column=columns_subset,
+     values=ai_config,
+     pre=pre,
+     segments=segments,
+     thresholds=thresholds,
+     actions=actions,
+     brief=brief,
+     active=active,
+ )
+
+ self._add_validation(validation_info=val_info)
+
+ return self
+
+ def col_schema_match(
+     self,
+     schema: Schema,
+     complete: bool = True,
+     in_order: bool = True,
+     case_sensitive_colnames: bool = True,
+     case_sensitive_dtypes: bool = True,
+     full_match_dtypes: bool = True,
      pre: Callable | None = None,
      thresholds: int | float | bool | tuple | dict | Thresholds = None,
      actions: Actions | None = None,
@@ -10812,33 +10959,40 @@ class Validate:
  active: bool = True,
  ) -> Validate:
  """
- Validate whether the row count of the table matches a specified count.
-
- The `row_count_match()` method checks whether the row count of the target table matches a
- specified count. This validation will operate over a single test unit, which is whether the
- row count matches the specified count.
+ Do columns in the table (and their types) match a predefined schema?

- We also have the option to invert the validation step by setting `inverse=True`. This will
- make the expectation that the row count of the target table *does not* match the specified
- count.
+ The `col_schema_match()` method works in conjunction with an object generated by the
+ [`Schema`](`pointblank.Schema`) class. That class object is the expectation for the actual
+ schema of the target table. The validation step operates over a single test unit, which is
+ whether the schema matches that of the table (within the constraints enforced by the
+ `complete=`, and `in_order=` options).

  Parameters
  ----------
- count
-     The expected row count of the table. This can be an integer value, a Polars or Pandas
-     DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row
-     count of that object will be used as the expected count.
- tol
-     The tolerance allowable for the row count match. This can be specified as a single
-     numeric value (integer or float) or as a tuple of two integers representing the lower
-     and upper bounds of the tolerance range. If a single integer value (greater than 1) is
-     provided, it represents the absolute bounds of the tolerance, ie. plus or minus the value.
-     If a float value (between 0-1) is provided, it represents the relative tolerance, ie.
-     plus or minus the relative percentage of the target. If a tuple is provided, it represents
-     the lower and upper absolute bounds of the tolerance range. See the examples for more.
- inverse
-     Should the validation step be inverted? If `True`, then the expectation is that the row
-     count of the target table should not match the specified `count=` value.
+ schema
+     A `Schema` object that represents the expected schema of the table. This object is
+     generated by the [`Schema`](`pointblank.Schema`) class.
+ complete
+     Should the schema match be complete? If `True`, then the target table must have all
+     columns specified in the schema. If `False`, then the table can have additional columns
+     not in the schema (i.e., the schema is a subset of the target table's columns).
+ in_order
+     Should the schema match be in order? If `True`, then the columns in the schema must
+     appear in the same order as they do in the target table. If `False`, then the order of
+     columns in the schema and the target table can differ.
+ case_sensitive_colnames
+     Should the schema match be case-sensitive with regard to column names? If `True`, then
+     the column names in the schema and the target table must match exactly. If `False`, then
+     the column names are compared in a case-insensitive manner.
+ case_sensitive_dtypes
+     Should the schema match be case-sensitive with regard to column data types? If `True`,
+     then the column data types in the schema and the target table must match exactly. If
+     `False`, then the column data types are compared in a case-insensitive manner.
+ full_match_dtypes
+     Should the schema match require a full match of data types? If `True`, then the column
+     data types in the schema and the target table must match exactly. If `False` then
+     substring matches are allowed, so a schema data type of `Int` would match a target table
+     data type of `Int64`.
  pre
      An optional preprocessing function or lambda to apply to the data table during
      interrogation. This function should take a table as input and return a modified table.
@@ -10878,10 +11032,9 @@ class Validate:
  before the validation step is applied.

  The preprocessing function can be any callable that takes a table as input and returns a
- modified table. For example, you could use a lambda function to filter the table based on
- certain criteria or to apply a transformation to the data. Regarding the lifetime of the
- transformed table, it only exists during the validation step and is not stored in the
- `Validate` object or used in subsequent validation steps.
+ modified table. Regarding the lifetime of the transformed table, it only exists during the
+ validation step and is not stored in the `Validate` object or used in subsequent validation
+ steps.

  Thresholds
  ----------
@@ -10917,18 +11070,232 @@ class Validate:
  #| echo: false
  #| output: false
  import pointblank as pb
- pb.config(report_incl_header=False, report_incl_footer=False)
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
  ```

- For the examples here, we'll use the built in dataset `"small_table"`. The table can be
- obtained by calling `load_dataset("small_table")`.
+ For the examples here, we'll use a simple Polars DataFrame with three columns (string,
+ integer, and float). The table is shown below:

  ```{python}
  import pointblank as pb
+ import polars as pl

- small_table = pb.load_dataset("small_table")
+ tbl = pl.DataFrame(
+     {
+         "a": ["apple", "banana", "cherry", "date"],
+         "b": [1, 6, 3, 5],
+         "c": [1.1, 2.2, 3.3, 4.4],
+     }
+ )

- pb.preview(small_table)
+ pb.preview(tbl)
+ ```
+
+ Let's validate that the columns in the table match a predefined schema. A schema can be
+ defined using the [`Schema`](`pointblank.Schema`) class.
+
+ ```{python}
+ schema = pb.Schema(
+     columns=[("a", "String"), ("b", "Int64"), ("c", "Float64")]
+ )
+ ```
+
+ You can print the schema object to verify that the expected schema is as intended.
+
+ ```{python}
+ print(schema)
+ ```
+
+ Now, we'll use the `col_schema_match()` method to validate the table against the expected
+ `schema` object. There is a single test unit for this validation step (whether the schema
+ matches the table or not).
+
+ ```{python}
+ validation = (
+     pb.Validate(data=tbl)
+     .col_schema_match(schema=schema)
+     .interrogate()
+ )
+
+ validation
+ ```
+
+ The validation table shows that the schema matches the table. The single test unit passed
+ since the table columns and their types match the schema.
+ """
+
+ assertion_type = _get_fn_name()
+
+ _check_pre(pre=pre)
+ _check_thresholds(thresholds=thresholds)
+ _check_boolean_input(param=active, param_name="active")
+ _check_boolean_input(param=complete, param_name="complete")
+ _check_boolean_input(param=in_order, param_name="in_order")
+ _check_boolean_input(param=case_sensitive_colnames, param_name="case_sensitive_colnames")
+ _check_boolean_input(param=case_sensitive_dtypes, param_name="case_sensitive_dtypes")
+ _check_boolean_input(param=full_match_dtypes, param_name="full_match_dtypes")
+
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+ thresholds = (
+     self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+ )
+
+ # Package up the `schema=` and boolean params into a dictionary for later interrogation
+ values = {
+     "schema": schema,
+     "complete": complete,
+     "in_order": in_order,
+     "case_sensitive_colnames": case_sensitive_colnames,
+     "case_sensitive_dtypes": case_sensitive_dtypes,
+     "full_match_dtypes": full_match_dtypes,
+ }
+
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+ val_info = _ValidationInfo(
+     assertion_type=assertion_type,
+     values=values,
+     pre=pre,
+     thresholds=thresholds,
+     actions=actions,
+     brief=brief,
+     active=active,
+ )
+
+ self._add_validation(validation_info=val_info)
+
+ return self
+
+ def row_count_match(
+     self,
+     count: int | FrameT | Any,
+     tol: Tolerance = 0,
+     inverse: bool = False,
+     pre: Callable | None = None,
+     thresholds: int | float | bool | tuple | dict | Thresholds = None,
+     actions: Actions | None = None,
+     brief: str | bool | None = None,
+     active: bool = True,
+ ) -> Validate:
+     """
+     Validate whether the row count of the table matches a specified count.
+
+     The `row_count_match()` method checks whether the row count of the target table matches a
+     specified count. This validation will operate over a single test unit, which is whether the
+     row count matches the specified count.
+
+     We also have the option to invert the validation step by setting `inverse=True`. This will
+     make the expectation that the row count of the target table *does not* match the specified
+     count.
+
+     Parameters
+     ----------
+     count
+         The expected row count of the table. This can be an integer value, a Polars or Pandas
+         DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row
+         count of that object will be used as the expected count.
+     tol
+         The tolerance allowable for the row count match. This can be specified as a single
+         numeric value (integer or float) or as a tuple of two integers representing the lower
+         and upper bounds of the tolerance range. If a single integer value (greater than 1) is
+         provided, it represents the absolute bounds of the tolerance, i.e., plus or minus the
+         value. If a float value (between 0-1) is provided, it represents the relative tolerance,
+         i.e., plus or minus the relative percentage of the target. If a tuple is provided, it
+         represents the lower and upper absolute bounds of the tolerance range. See the examples
+         for more.
+     inverse
+         Should the validation step be inverted? If `True`, then the expectation is that the row
+         count of the target table should not match the specified `count=` value.
+     pre
+         An optional preprocessing function or lambda to apply to the data table during
+         interrogation. This function should take a table as input and return a modified table.
+         Have a look at the *Preprocessing* section for more information on how to use this
+         argument.
+     thresholds
+         Set threshold failure levels for reporting and reacting to exceedances of the levels.
+         The thresholds are set at the step level and will override any global thresholds set in
+         `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+         be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+         section for information on how to set threshold levels.
+     actions
+         Optional actions to take when the validation step meets or exceeds any set threshold
+         levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+         define the actions.
+     brief
+         An optional brief description of the validation step that will be displayed in the
+         reporting table. You can use templating elements like `"{step}"` to insert
+         the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+         the entire brief will be automatically generated. If `None` (the default) then there
+         won't be a brief.
+     active
+         A boolean value indicating whether the validation step should be active. Using `False`
+         will make the validation step inactive (still reporting its presence and keeping indexes
+         for the steps unchanged).
+
+     Returns
+     -------
+     Validate
+         The `Validate` object with the added validation step.
+
+     Preprocessing
+     -------------
+     The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+     table during interrogation. This function should take a table as input and return a modified
+     table. This is useful for performing any necessary transformations or filtering on the data
+     before the validation step is applied.
+
+     The preprocessing function can be any callable that takes a table as input and returns a
+     modified table. For example, you could use a lambda function to filter the table based on
+     certain criteria or to apply a transformation to the data. Regarding the lifetime of the
+     transformed table, it only exists during the validation step and is not stored in the
+     `Validate` object or used in subsequent validation steps.
+
+     Thresholds
+     ----------
+     The `thresholds=` parameter is used to set the failure-condition levels for the validation
+     step. If they are set here at the step level, these thresholds will override any thresholds
+     set at the global level in `Validate(thresholds=...)`.
+
+     There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+     can either be set as a proportion failing of all test units (a value between `0` and `1`),
+     or, the absolute number of failing test units (as an integer that's `1` or greater).
+
+     Thresholds can be defined using one of these input schemes:
+
+     1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+     2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+     3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+        'critical'
+     4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
+
+     If the number of failing test units exceeds set thresholds, the validation step will be
+     marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+     set, you're free to set any combination of them.
+
+     Aside from reporting failure conditions, thresholds can be used to determine the actions to
+     take for each level of failure (using the `actions=` parameter).
+
+     Examples
+     --------
+     ```{python}
+     #| echo: false
+     #| output: false
+     import pointblank as pb
+     pb.config(report_incl_header=False, report_incl_footer=False)
+     ```
+
+     For the examples here, we'll use the built-in dataset `"small_table"`. The table can be
+     obtained by calling `load_dataset("small_table")`.
+
+     ```{python}
+     import pointblank as pb
+
+     small_table = pb.load_dataset("small_table")
+
+     pb.preview(small_table)
  ```
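A hedged sketch of the `tol=` forms described above (the row count of `small_table` is assumed to be 13, as in the examples that follow):

```python
import pointblank as pb

small_table = pb.load_dataset("small_table")  # assumed to hold 13 rows

pb.Validate(data=small_table).row_count_match(count=13).interrogate()              # exact match
pb.Validate(data=small_table).row_count_match(count=12, tol=2).interrogate()       # 12 ± 2 -> passes
pb.Validate(data=small_table).row_count_match(count=14, tol=0.1).interrogate()     # 14 ± 10% -> passes
pb.Validate(data=small_table).row_count_match(count=15, tol=(1, 0)).interrogate()  # [14, 15] -> fails
```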
|
|
10933
11300
|
|
|
10934
11301
|
Let's validate that the number of rows in the table matches a fixed value. In this case, we
|
|
@@ -12227,12 +12594,19 @@ class Validate:
|
|
|
12227
12594
|
# Generate the autobrief description for the validation step; it's important to perform
|
|
12228
12595
|
# that here since text components like the column and the value(s) have been resolved
|
|
12229
12596
|
# at this point
|
|
12597
|
+
# Get row count for col_pct_null to properly calculate absolute tolerance percentages
|
|
12598
|
+
n_rows = None
|
|
12599
|
+
if assertion_type == "col_pct_null":
|
|
12600
|
+
n_rows = get_row_count(data_tbl)
|
|
12601
|
+
|
|
12230
12602
|
autobrief = _create_autobrief_or_failure_text(
|
|
12231
12603
|
assertion_type=assertion_type,
|
|
12232
12604
|
lang=self.lang,
|
|
12233
12605
|
column=column,
|
|
12234
12606
|
values=value,
|
|
12235
12607
|
for_failure=False,
|
|
12608
|
+
locale=self.locale,
|
|
12609
|
+
n_rows=n_rows,
|
|
12236
12610
|
)
|
|
12237
12611
|
|
|
12238
12612
|
validation.autobrief = autobrief
|
|
@@ -12260,6 +12634,12 @@ class Validate:
|
|
|
12260
12634
|
# This prevents modifications from one validation step affecting others
|
|
12261
12635
|
data_tbl_step = _copy_dataframe(data_tbl)
|
|
12262
12636
|
|
|
12637
|
+
# Capture original table dimensions and columns before preprocessing
|
|
12638
|
+
# (only if preprocessing is present - we'll set these inside the preprocessing block)
|
|
12639
|
+
original_rows = None
|
|
12640
|
+
original_cols = None
|
|
12641
|
+
original_column_names = None
|
|
12642
|
+
|
|
12263
12643
|
# ------------------------------------------------
|
|
12264
12644
|
# Preprocessing stage
|
|
12265
12645
|
# ------------------------------------------------
|
|
@@ -12267,6 +12647,16 @@ class Validate:
|
|
|
12267
12647
|
# Determine whether any preprocessing functions are to be applied to the table
|
|
12268
12648
|
if validation.pre is not None:
|
|
12269
12649
|
try:
|
|
12650
|
+
# Capture original table dimensions before preprocessing
|
|
12651
|
+
# Use get_row_count() instead of len() for compatibility with PySpark, etc.
|
|
12652
|
+
original_rows = get_row_count(data_tbl_step)
|
|
12653
|
+
original_cols = get_column_count(data_tbl_step)
|
|
12654
|
+
original_column_names = set(
|
|
12655
|
+
data_tbl_step.columns
|
|
12656
|
+
if hasattr(data_tbl_step, "columns")
|
|
12657
|
+
else list(data_tbl_step.columns)
|
|
12658
|
+
)
|
|
12659
|
+
|
|
12270
12660
|
# Read the text of the preprocessing function
|
|
12271
12661
|
pre_text = _pre_processing_funcs_to_str(validation.pre)
|
|
12272
12662
|
|
|
@@ -12299,6 +12689,62 @@ class Validate:
|
|
|
12299
12689
|
elif isinstance(validation.pre, Callable):
|
|
12300
12690
|
data_tbl_step = validation.pre(data_tbl_step)
|
|
12301
12691
|
|
|
12692
|
+
# After successful preprocessing, check dimensions and create notes
|
|
12693
|
+
# Use get_row_count() and get_column_count() for compatibility
|
|
12694
|
+
processed_rows = get_row_count(data_tbl_step)
|
|
12695
|
+
processed_cols = get_column_count(data_tbl_step)
|
|
12696
|
+
|
|
12697
|
+
# Always add a note when preprocessing is applied
|
|
12698
|
+
if original_rows != processed_rows or original_cols != processed_cols:
|
|
12699
|
+
# Dimensions changed - show the change
|
|
12700
|
+
note_html = _create_preprocessing_note_html(
|
|
12701
|
+
original_rows=original_rows,
|
|
12702
|
+
original_cols=original_cols,
|
|
12703
|
+
processed_rows=processed_rows,
|
|
12704
|
+
processed_cols=processed_cols,
|
|
12705
|
+
locale=self.locale,
|
|
12706
|
+
)
|
|
12707
|
+
note_text = _create_preprocessing_note_text(
|
|
12708
|
+
original_rows=original_rows,
|
|
12709
|
+
original_cols=original_cols,
|
|
12710
|
+
processed_rows=processed_rows,
|
|
12711
|
+
processed_cols=processed_cols,
|
|
12712
|
+
)
|
|
12713
|
+
else:
|
|
12714
|
+
# No dimension change - just indicate preprocessing was applied
|
|
12715
|
+
note_html = _create_preprocessing_no_change_note_html(locale=self.locale)
|
|
12716
|
+
note_text = _create_preprocessing_no_change_note_text()
|
|
12717
|
+
|
|
12718
|
+
validation._add_note(
|
|
12719
|
+
key="pre_applied",
|
|
12720
|
+
markdown=note_html,
|
|
12721
|
+
text=note_text,
|
|
12722
|
+
)
|
|
12723
|
+
|
|
12724
|
+
# Check if target column is synthetic (exists in processed but not original)
|
|
12725
|
+
# Only check for single column names (not lists used in rows_distinct, etc.)
|
|
12726
|
+
if column is not None and isinstance(column, str):
|
|
12727
|
+
processed_column_names = set(
|
|
12728
|
+
data_tbl_step.columns
|
|
12729
|
+
if hasattr(data_tbl_step, "columns")
|
|
12730
|
+
else list(data_tbl_step.columns)
|
|
12731
|
+
)
|
|
12732
|
+
|
|
12733
|
+
# Check if the target column is in the processed table but not in original
|
|
12734
|
+
if column in processed_column_names and column not in original_column_names:
|
|
12735
|
+
note_html = _create_synthetic_target_column_note_html(
|
|
12736
|
+
column_name=column,
|
|
12737
|
+
locale=self.locale,
|
|
12738
|
+
)
|
|
12739
|
+
note_text = _create_synthetic_target_column_note_text(
|
|
12740
|
+
column_name=column,
|
|
12741
|
+
)
|
|
12742
|
+
validation._add_note(
|
|
12743
|
+
key="syn_target_col",
|
|
12744
|
+
markdown=note_html,
|
|
12745
|
+
text=note_text,
|
|
12746
|
+
)
|
|
12747
|
+
|
|
12302
12748
|
except Exception:
|
|
12303
12749
|
# If preprocessing fails, mark the validation as having an eval_error
|
|
12304
12750
|
validation.eval_error = True
|
|
@@ -12488,6 +12934,21 @@ class Validate:
|
|
|
12488
12934
|
tbl=tbl, column=column, values=value, na_pass=na_pass
|
|
12489
12935
|
)
|
|
12490
12936
|
|
|
12937
|
+
elif assertion_type == "col_pct_null":
|
|
12938
|
+
result_bool = col_pct_null(
|
|
12939
|
+
data_tbl=data_tbl_step,
|
|
12940
|
+
column=column,
|
|
12941
|
+
p=value["p"],
|
|
12942
|
+
bound_finder=value["bound_finder"],
|
|
12943
|
+
)
|
|
12944
|
+
|
|
12945
|
+
validation.all_passed = result_bool
|
|
12946
|
+
validation.n = 1
|
|
12947
|
+
validation.n_passed = int(result_bool)
|
|
12948
|
+
validation.n_failed = 1 - int(result_bool)
|
|
12949
|
+
|
|
12950
|
+
results_tbl = None
|
|
12951
|
+
|
|
12491
12952
|
elif assertion_type == "col_vals_expr":
|
|
12492
12953
|
results_tbl = col_vals_expr(
|
|
12493
12954
|
data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
|
|
@@ -12547,10 +13008,21 @@ class Validate:
                 # Add the schema validation info to the validation object
                 validation.val_info = schema_validation_info

+                # Add a note with the schema expectation and results
+                schema_note_html = _create_col_schema_match_note_html(
+                    schema_info=schema_validation_info, locale=self.locale
+                )
+                schema_note_text = _create_col_schema_match_note_text(
+                    schema_info=schema_validation_info
+                )
+                validation._add_note(
+                    key="schema_check", markdown=schema_note_html, text=schema_note_text
+                )
+
                 validation.all_passed = result_bool
                 validation.n = 1
                 validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
+                validation.n_failed = 1 - int(result_bool)

                 results_tbl = None

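Judging from the `_add_note(key=..., markdown=..., text=...)` calls here and the way `_create_notes_html()` later reads `content["markdown"]` and `content["text"]`, each step appears to keep a keyed dict of dual-rendered notes. A sketch of that assumed structure (a stand-in, not pointblank's `_ValidationInfo` API):

    notes: dict[str, dict[str, str]] = {}

    def add_note(key: str, markdown: str, text: str) -> None:
        # Each key holds both an HTML/markdown rendering (for reports)
        # and a plain-text rendering (for logs or console output).
        notes[key] = {"markdown": markdown, "text": text}

    add_note("schema_check", markdown="<em>Schema matched</em>", text="Schema matched")
    assert notes["schema_check"]["text"] == "Schema matched"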
@@ -12565,7 +13037,7 @@ class Validate:
                 validation.all_passed = result_bool
                 validation.n = 1
                 validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
+                validation.n_failed = 1 - int(result_bool)

                 results_tbl = None

@@ -12577,7 +13049,7 @@ class Validate:
                 validation.all_passed = result_bool
                 validation.n = 1
                 validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
+                validation.n_failed = 1 - int(result_bool)

                 results_tbl = None

@@ -12596,7 +13068,7 @@ class Validate:
                 validation.all_passed = result_bool
                 validation.n = 1
                 validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
+                validation.n_failed = 1 - int(result_bool)

                 results_tbl = None

@@ -12614,8 +13086,9 @@ class Validate:
                 )  # pragma: no cover

             except Exception as e:
-                #
+                # Catch data quality errors and column not found errors
                 error_msg = str(e).lower()
+
                 is_comparison_error = (
                     "boolean value of na is ambiguous" in error_msg
                     or "cannot compare" in error_msg
@@ -12626,20 +13099,101 @@ class Validate:
                     or ("dtype" in error_msg and "compare" in error_msg)
                 )

-
-
+                is_column_not_found = "column" in error_msg and "not found" in error_msg
+
+                is_comparison_column_not_found = (
+                    "unable to find column" in error_msg and "valid columns" in error_msg
+                )
+
+                if (
+                    is_comparison_error or is_column_not_found or is_comparison_column_not_found
+                ):  # pragma: no cover
+                    # If data quality comparison fails or column not found, mark as eval_error
                     validation.eval_error = True  # pragma: no cover
+
+                    # Add a note for column not found errors (target column)
+                    if is_column_not_found:
+                        note_html = _create_column_not_found_note_html(
+                            column_name=column,
+                            available_columns=list(data_tbl_step.columns)
+                            if hasattr(data_tbl_step, "columns")
+                            else [],
+                            locale=self.locale,
+                        )
+                        note_text = _create_column_not_found_note_text(
+                            column_name=column,
+                            available_columns=list(data_tbl_step.columns)
+                            if hasattr(data_tbl_step, "columns")
+                            else [],
+                        )
+                        validation._add_note(
+                            key="column_not_found",
+                            markdown=note_html,
+                            text=note_text,
+                        )
+
+                    # Add a note for comparison column not found errors
+                    elif is_comparison_column_not_found:
+                        # Extract column name from error message
+                        # Error format: 'unable to find column "col_name"; valid columns: ...'
+                        match = re.search(r'unable to find column "([^"]+)"', str(e))
+
+                        if match:
+                            missing_col_name = match.group(1)
+
+                            # Determine position for between/outside validations
+                            position = None
+                            if assertion_type in ["col_vals_between", "col_vals_outside"]:
+                                # Check if missing column is in left or right position
+                                from pointblank.column import Column
+
+                                if (
+                                    isinstance(value[0], Column)
+                                    and value[0].exprs == missing_col_name
+                                ):
+                                    position = "left"
+                                elif (
+                                    isinstance(value[1], Column)
+                                    and value[1].exprs == missing_col_name
+                                ):
+                                    position = "right"
+
+                            note_html = _create_comparison_column_not_found_note_html(
+                                column_name=missing_col_name,
+                                position=position,
+                                available_columns=list(data_tbl_step.columns)
+                                if hasattr(data_tbl_step, "columns")
+                                else [],
+                                locale=self.locale,
+                            )
+                            note_text = _create_comparison_column_not_found_note_text(
+                                column_name=missing_col_name,
+                                position=position,
+                                available_columns=list(data_tbl_step.columns)
+                                if hasattr(data_tbl_step, "columns")
+                                else [],
+                            )
+                            validation._add_note(
+                                key="comparison_column_not_found",
+                                markdown=note_html,
+                                text=note_text,
+                            )
+
                     end_time = datetime.datetime.now(datetime.timezone.utc)  # pragma: no cover
+
                     validation.proc_duration_s = (
                         end_time - start_time
                     ).total_seconds()  # pragma: no cover
+
                     validation.time_processed = end_time.isoformat(
                         timespec="milliseconds"
                     )  # pragma: no cover
+
                     validation.active = False  # pragma: no cover
+
                     continue  # pragma: no cover
                 else:
-                    # For other errors
+                    # For other unexpected errors, let them propagate
                     raise

             else:
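The regex added above parses backend error strings of the form 'unable to find column "col_name"; valid columns: ...'. A quick check of that same pattern on a representative message (the message text is illustrative):

    import re

    err = 'unable to find column "revenue_2024"; valid columns: "a", "b"'
    match = re.search(r'unable to find column "([^"]+)"', err)
    assert match is not None and match.group(1) == "revenue_2024"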
@@ -12722,6 +13276,34 @@ class Validate:
                 ),
             )

+            # Add note for local thresholds (if they differ from global thresholds)
+            if threshold != self.thresholds:
+                if threshold != Thresholds():
+                    # Local thresholds are set - generate threshold note
+                    threshold_note_html = _create_local_threshold_note_html(
+                        thresholds=threshold, locale=self.locale
+                    )
+                    threshold_note_text = _create_local_threshold_note_text(thresholds=threshold)
+
+                    # Add the note to the validation step
+                    validation._add_note(
+                        key="local_thresholds",
+                        markdown=threshold_note_html,
+                        text=threshold_note_text,
+                    )
+
+                elif self.thresholds != Thresholds():
+                    # Thresholds explicitly reset to empty when global thresholds exist
+                    reset_note_html = _create_threshold_reset_note_html(locale=self.locale)
+                    reset_note_text = _create_threshold_reset_note_text()
+
+                    # Add the note to the validation step
+                    validation._add_note(
+                        key="local_threshold_reset",
+                        markdown=reset_note_html,
+                        text=reset_note_text,
+                    )
+
             # If there is any threshold level that has been exceeded, then produce and
             # set the general failure text for the validation step
             if validation.warning or validation.error or validation.critical:
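The branching above leans on dataclass equality: a default-constructed `Thresholds()` acts as the "nothing set" sentinel, so a step whose thresholds differ from the global ones but equal the empty sentinel means "globals explicitly not used". A sketch with an illustrative stand-in dataclass (not pointblank's `Thresholds` itself):

    from dataclasses import dataclass

    @dataclass
    class T:
        warning: float | None = None
        error: float | None = None
        critical: float | None = None

    global_t = T(warning=0.1)   # global thresholds are set
    local_t = T()               # explicitly reset at the step level

    # Mirrors the branch structure: differing from the global thresholds AND
    # equaling the empty sentinel, while globals are non-empty, triggers the reset note.
    if local_t != global_t and local_t == T() and global_t != T():
        print("add 'local_threshold_reset' note")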
@@ -12732,6 +13314,8 @@ class Validate:
                     column=column,
                     values=value,
                     for_failure=True,
+                    locale=self.locale,
+                    n_rows=n_rows,
                 )

                 # Set the failure text in the validation step
@@ -14217,11 +14801,15 @@ class Validate:
         - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
         - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
         - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
+        - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
+        - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
         - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
         - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
         - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+        - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
         - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
         - [`conjointly()`](`pointblank.Validate.conjointly`)
+        - [`prompt()`](`pointblank.Validate.prompt`)

         An extracted row for these validation methods means that a test unit failed for that row in
         the validation step.
@@ -14806,7 +15394,12 @@ class Validate:
         return None

     def get_tabular_report(
-        self,
+        self,
+        title: str | None = ":default:",
+        incl_header: bool = None,
+        incl_footer: bool = None,
+        incl_footer_timings: bool = None,
+        incl_footer_notes: bool = None,
     ) -> GT:
         """
         Validation report as a GT table.
@@ -14829,6 +15422,20 @@ class Validate:
             name of the table as the title for the report. If no title is wanted, then `":none:"`
             can be used. Aside from keyword options, text can be provided for the title. This will
             be interpreted as Markdown text and transformed internally to HTML.
+        incl_header
+            Controls whether the header section should be displayed. If `None`, uses the global
+            configuration setting. The header contains the table name, label, and threshold
+            information.
+        incl_footer
+            Controls whether the footer section should be displayed. If `None`, uses the global
+            configuration setting. The footer can contain validation timing information and notes.
+        incl_footer_timings
+            Controls whether validation timing information (start time, duration, end time) should
+            be displayed in the footer. If `None`, uses the global configuration setting. Only
+            applies when `incl_footer=True`.
+        incl_footer_notes
+            Controls whether notes from validation steps should be displayed in the footer. If
+            `None`, uses the global configuration setting. Only applies when `incl_footer=True`.

         Returns
         -------
@@ -14888,6 +15495,10 @@ class Validate:
             incl_header = global_config.report_incl_header
         if incl_footer is None:
             incl_footer = global_config.report_incl_footer
+        if incl_footer_timings is None:
+            incl_footer_timings = global_config.report_incl_footer_timings
+        if incl_footer_notes is None:
+            incl_footer_notes = global_config.report_incl_footer_notes

         # Do we have a DataFrame library to work with?
         _check_any_df_lib(method_used="get_tabular_report")
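With the None-to-global fallback above, a caller can set footer behavior once and still override it per report. A usage sketch, assuming `pointblank.config()` updates the module-level configuration that this fallback reads (table construction elided):

    import pointblank as pb

    pb.config(report_incl_footer_timings=False)  # set once, globally
    # validation.get_tabular_report()                           # inherits the global setting
    # validation.get_tabular_report(incl_footer_timings=True)   # or overrides it per call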
@@ -15126,30 +15737,53 @@ class Validate:
         columns_upd = []

         columns = validation_info_dict["column"]
+        notes = validation_info_dict["notes"]

         assertion_type = validation_info_dict["assertion_type"]

         # Iterate over the values in the `column` entry
         for i, column in enumerate(columns):
+            # Check if this validation has a synthetic target column note
+            has_synthetic_column = (
+                notes[i] is not None and isinstance(notes[i], dict) and "syn_target_col" in notes[i]
+            )
+
+            column_text = None
+
             if assertion_type[i] in [
                 "col_schema_match",
                 "row_count_match",
                 "col_count_match",
                 "col_vals_expr",
             ]:
-
+                column_text = "—"
             elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
                 if not column:
                     # If there is no column subset, then all columns are used
-
+                    column_text = "ALL COLUMNS"
                 else:
                     # With a column subset list, format with commas between the column names
-
-
+                    column_text = ", ".join(column)
             elif assertion_type[i] in ["conjointly", "specially"]:
-
+                column_text = ""
             else:
-
+                column_text = str(column)
+
+            # Apply underline styling for synthetic columns (using the purple color from the icon)
+            # Only apply styling if column_text is not empty and not a special marker
+            if (
+                has_synthetic_column
+                and column_text
+                and column_text not in ["—", "ALL COLUMNS", ""]
+            ):
+                column_text = (
+                    f'<span style="text-decoration: underline; '
+                    f"text-decoration-color: #9A7CB4; text-decoration-thickness: 1px; "
+                    f'text-underline-offset: 3px;">'
+                    f"{column_text}</span>"
+                )
+
+            columns_upd.append(column_text)

         # Add the `columns_upd` entry to the dictionary
         validation_info_dict["columns_upd"] = columns_upd
@@ -15205,6 +15839,15 @@ class Validate:
             ]:
                 values_upd.append("—")

+            elif assertion_type[i] in ["col_pct_null"]:
+                # Extract p and tol from the values dict for nice formatting
+                p_value = value["p"]
+
+                # Extract tol from the bound_finder partial function
+                bound_finder = value.get("bound_finder")
+                tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
+                values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
+
             elif assertion_type[i] in ["col_schema_match"]:
                 values_upd.append("SCHEMA")

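The `bound_finder` stored in the values dict appears to be a `functools.partial`, whose `.keywords` attribute exposes the keyword arguments it was bound with; that is how the report recovers `tol` for display. A sketch with a hypothetical `find_bounds` stand-in:

    from functools import partial

    def find_bounds(p: float, tol: float = 0) -> tuple[float, float]:
        # Widen the target proportion p by the tolerance on both sides
        return (max(p - tol, 0.0), min(p + tol, 1.0))

    bound_finder = partial(find_bounds, tol=0.05)
    print(bound_finder.keywords.get("tol", 0))  # 0.05
    print(bound_finder(p=0.25))                 # (0.2, 0.3)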
@@ -15680,13 +16323,15 @@ class Validate:
         gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))

     if incl_footer:
-        # Add table time as HTML source note
-
+        # Add table time as HTML source note if enabled
+        if incl_footer_timings:
+            gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))

-        # Create notes markdown from validation steps and add as separate source note
-
-
-
+        # Create notes markdown from validation steps and add as separate source note if enabled
+        if incl_footer_notes:
+            notes_markdown = _create_notes_html(self.validation_info)
+            if notes_markdown:
+                gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))

     # If the interrogation has not been performed, then style the table columns dealing with
     # interrogation data as grayed out
@@ -15795,11 +16440,15 @@ class Validate:
         - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
         - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
         - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
+        - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
+        - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
         - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
         - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
         - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+        - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
         - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
         - [`conjointly()`](`pointblank.Validate.conjointly`)
+        - [`prompt()`](`pointblank.Validate.prompt`)
         - [`rows_complete()`](`pointblank.Validate.rows_complete`)

         The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
@@ -16099,6 +16748,12 @@ class Validate:

             except Exception:  # pragma: no cover
                 validation.eval_error = True
+                columns_resolved = []
+            # Store columns list for note generation
+            try:
+                columns = list(table.columns) if "table" in locals() else []
+            except Exception:
+                columns = []

             # If no columns were resolved, then create a patched validation step with the
             # `eval_error` and `column` attributes set
@@ -16106,6 +16761,22 @@ class Validate:
                 validation.eval_error = True
                 validation.column = str(column_expr)

+                # Add a helpful note explaining that no columns were resolved
+                note_html = _create_no_columns_resolved_note_html(
+                    column_expr=str(column_expr),
+                    available_columns=columns,
+                    locale=self.locale,
+                )
+                note_text = _create_no_columns_resolved_note_text(
+                    column_expr=str(column_expr),
+                    available_columns=columns,
+                )
+                validation._add_note(
+                    key="no_columns_resolved",
+                    markdown=note_html,
+                    text=note_text,
+                )
+
                 expanded_validation_info.append(validation)
                 continue

@@ -16664,7 +17335,13 @@ def _process_action_str(


 def _create_autobrief_or_failure_text(
-    assertion_type: str,
+    assertion_type: str,
+    lang: str,
+    column: str | None,
+    values: str | None,
+    for_failure: bool,
+    locale: str | None = None,
+    n_rows: int | None = None,
 ) -> str:
     if assertion_type in [
         "col_vals_gt",
@@ -16788,6 +17465,16 @@ def _create_autobrief_or_failure_text(
             for_failure=for_failure,
         )

+    if assertion_type == "col_pct_null":
+        return _create_text_col_pct_null(
+            lang=lang,
+            column=column,
+            value=values,
+            for_failure=for_failure,
+            locale=locale if locale else lang,
+            n_rows=n_rows,
+        )
+
     if assertion_type == "conjointly":
         return _create_text_conjointly(lang=lang, for_failure=for_failure)

@@ -17010,6 +17697,115 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
     return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)


+def _create_text_col_pct_null(
+    lang: str,
+    column: str | None,
+    value: dict,
+    for_failure: bool = False,
+    locale: str | None = None,
+    n_rows: int | None = None,
+) -> str:
+    """Create text for col_pct_null validation with tolerance handling."""
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    column_text = _prep_column_text(column=column)
+
+    # Use locale for number formatting, defaulting to lang if not provided
+    fmt_locale = locale if locale else lang
+
+    # Extract p and tol from the values dict
+    p_value = value.get("p", 0) * 100  # Convert to percentage
+    p_value_original = value.get("p", 0)  # Keep original value for deviation format
+
+    # Extract tol from the bound_finder partial function
+    bound_finder = value.get("bound_finder")
+    tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
+
+    # Handle different tolerance types
+    has_tolerance = False
+    is_asymmetric = False
+
+    if isinstance(tol_value, tuple):
+        # Tuple tolerance: can be (lower, upper) in absolute or relative terms
+        tol_lower, tol_upper = tol_value
+
+        # Check if we have any non-zero tolerance
+        has_tolerance = tol_lower != 0 or tol_upper != 0
+        is_asymmetric = tol_lower != tol_upper
+
+        # For relative tolerances (floats < 1), we can compute exact percentage bounds
+        # For absolute tolerances (ints >= 1), calculate based on actual row count if available
+        if tol_lower < 1:
+            # Relative tolerance (float)
+            lower_pct_delta = tol_lower * 100
+        else:
+            # Absolute tolerance (int); uses actual row count if available
+            if n_rows is not None and n_rows > 0:
+                lower_pct_delta = (tol_lower / n_rows) * 100
+            else:
+                lower_pct_delta = tol_lower  # Fallback approximation
+
+        if tol_upper < 1:
+            # Relative tolerance (float)
+            upper_pct_delta = tol_upper * 100
+        else:
+            # Absolute tolerance (int); uses actual row count if available
+            if n_rows is not None and n_rows > 0:
+                upper_pct_delta = (tol_upper / n_rows) * 100
+            else:
+                upper_pct_delta = tol_upper  # Fallback approximation
+    else:
+        # Single value tolerance: symmetric
+        has_tolerance = tol_value != 0
+
+        if tol_value < 1:
+            # Relative tolerance (float)
+            tol_pct = tol_value * 100
+        else:
+            # Absolute tolerance (int) - use actual row count if available
+            if n_rows is not None and n_rows > 0:
+                tol_pct = (tol_value / n_rows) * 100
+            else:
+                tol_pct = tol_value  # Fallback approximation
+
+        lower_pct_delta = tol_pct
+        upper_pct_delta = tol_pct
+
+    # Format numbers with locale-aware formatting
+    p_formatted = _format_number_safe(p_value, decimals=1, locale=fmt_locale)
+    p_original_formatted = _format_number_safe(p_value_original, decimals=2, locale=fmt_locale)
+
+    # Choose the appropriate translation key based on tolerance
+    if not has_tolerance:
+        # No tolerance - use simple text
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text"][lang].format(
+            column_text=column_text,
+            p=p_formatted,
+        )
+    elif is_asymmetric or isinstance(tol_value, tuple):
+        # Use deviation format for tuple tolerances (including symmetric ones)
+        # Format the deviation values with signs (using proper minus sign U+2212)
+        lower_dev = f"−{_format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)}%"
+        upper_dev = f"+{_format_number_safe(upper_pct_delta, decimals=1, locale=fmt_locale)}%"
+
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol_deviation"][lang].format(
+            column_text=column_text,
+            lower_dev=lower_dev,
+            upper_dev=upper_dev,
+            p=p_original_formatted,
+        )
+    else:
+        # Single value tolerance - use the symmetric ± format
+        tol_formatted = _format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol"][lang].format(
+            column_text=column_text,
+            p=p_formatted,
+            tol=tol_formatted,
+        )
+
+    return text
+
+
 def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
     type_ = _expect_failure_type(for_failure=for_failure)

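A worked example of the tolerance arithmetic above: a relative tolerance (float < 1) scales directly to percentage points, while an absolute tolerance (int >= 1) is converted using the row count when one is known.

    p = 0.10          # target null proportion -> formatted as 10.0 after * 100
    n_rows = 500

    rel_tol = 0.02    # relative: 0.02 -> 2.0 percentage points
    abs_tol = 25      # absolute: 25 rows of 500 -> 5.0 percentage points

    print(p * 100)                   # 10.0
    print(rel_tol * 100)             # 2.0
    print((abs_tol / n_rows) * 100)  # 5.0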
@@ -17408,6 +18204,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:

 def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
     # For each icon, get the assertion icon SVG test from SVG_ICONS_FOR_ASSERTION_TYPES dictionary
+    # TODO: No point in using `get` if we can't handle missing keys anyways
     icon_svg = [SVG_ICONS_FOR_ASSERTION_TYPES.get(icon) for icon in icon]

     # Replace the width and height in the SVG string
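The TODO added above points at a real hazard: `dict.get()` returns `None` for a missing key, so a later string operation on the SVG would raise `AttributeError` anyway; indexing fails earlier and with a clearer message. A small sketch (the dict contents here are illustrative):

    icons = {"col_vals_gt": "<svg>...</svg>"}

    svg = icons.get("unknown_type")     # None; the failure is merely deferred
    # svg.replace("height=", "...")     # would raise AttributeError: 'NoneType' ...
    svg = icons["col_vals_gt"]          # a KeyError would surface at the lookup site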
@@ -17866,267 +18663,1078 @@ def _create_table_time_html(
|
|
|
17866
18663
|
)
|
|
17867
18664
|
|
|
17868
18665
|
|
|
17869
|
-
def _create_notes_html(validation_info: list) -> str:
|
|
18666
|
+
def _create_notes_html(validation_info: list) -> str:
|
|
18667
|
+
"""
|
|
18668
|
+
Create markdown text for validation notes/footnotes.
|
|
18669
|
+
|
|
18670
|
+
This function collects notes from all validation steps and formats them as footnotes
|
|
18671
|
+
for display in the report footer. Each note is prefixed with the step number in
|
|
18672
|
+
uppercase small caps bold formatting, and the note content is rendered as markdown.
|
|
18673
|
+
|
|
18674
|
+
Parameters
|
|
18675
|
+
----------
|
|
18676
|
+
validation_info
|
|
18677
|
+
List of _ValidationInfo objects from which to extract notes.
|
|
18678
|
+
|
|
18679
|
+
Returns
|
|
18680
|
+
-------
|
|
18681
|
+
str
|
|
18682
|
+
Markdown string containing formatted footnotes, or empty string if no notes exist.
|
|
18683
|
+
"""
|
|
18684
|
+
# Collect all notes from validation steps
|
|
18685
|
+
all_notes = []
|
|
18686
|
+
for step in validation_info:
|
|
18687
|
+
if step.notes:
|
|
18688
|
+
for key, content in step.notes.items():
|
|
18689
|
+
# Store note with step number for context
|
|
18690
|
+
all_notes.append(
|
|
18691
|
+
{
|
|
18692
|
+
"step": step.i,
|
|
18693
|
+
"key": key,
|
|
18694
|
+
"markdown": content["markdown"],
|
|
18695
|
+
"text": content["text"],
|
|
18696
|
+
}
|
|
18697
|
+
)
|
|
18698
|
+
|
|
18699
|
+
# If no notes, return empty string
|
|
18700
|
+
if not all_notes:
|
|
18701
|
+
return ""
|
|
18702
|
+
|
|
18703
|
+
# Build markdown for notes section
|
|
18704
|
+
# Start with a styled horizontal rule and bold "Notes" header
|
|
18705
|
+
notes_parts = [
|
|
18706
|
+
(
|
|
18707
|
+
"<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
|
|
18708
|
+
"border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
|
|
18709
|
+
),
|
|
18710
|
+
"<strong>Notes</strong>",
|
|
18711
|
+
"",
|
|
18712
|
+
]
|
|
18713
|
+
|
|
18714
|
+
previous_step = None
|
|
18715
|
+
for note in all_notes:
|
|
18716
|
+
# Determine if this is the first note for this step
|
|
18717
|
+
is_first_for_step = note["step"] != previous_step
|
|
18718
|
+
previous_step = note["step"]
|
|
18719
|
+
|
|
18720
|
+
# Format step label with HTML for uppercase small caps bold
|
|
18721
|
+
# Use lighter color for subsequent notes of the same step
|
|
18722
|
+
step_color = "#333333" if is_first_for_step else "#999999"
|
|
18723
|
+
step_label = (
|
|
18724
|
+
f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
|
|
18725
|
+
f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
|
|
18726
|
+
)
|
|
18727
|
+
|
|
18728
|
+
# Format note key in monospaced font with smaller size
|
|
18729
|
+
note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"
|
|
18730
|
+
|
|
18731
|
+
# Combine step label, note key, and markdown content
|
|
18732
|
+
note_text = f"{step_label} {note_key} {note['markdown']}"
|
|
18733
|
+
notes_parts.append(note_text)
|
|
18734
|
+
notes_parts.append("") # Add blank line between notes
|
|
18735
|
+
|
|
18736
|
+
# Remove trailing blank line
|
|
18737
|
+
if notes_parts[-1] == "":
|
|
18738
|
+
notes_parts.pop()
|
|
18739
|
+
|
|
18740
|
+
# Join with newlines to create markdown text
|
|
18741
|
+
notes_markdown = "\n".join(notes_parts)
|
|
18742
|
+
|
|
18743
|
+
return notes_markdown
|
|
18744
|
+
|
|
18745
|
+
|
|
18746
|
+
def _create_label_html(label: str | None, start_time: str) -> str:
|
|
18747
|
+
if label is None:
|
|
18748
|
+
# Remove the decimal and everything beyond that
|
|
18749
|
+
start_time = str(start_time).split(".")[0]
|
|
18750
|
+
|
|
18751
|
+
# Replace the space character with a pipe character
|
|
18752
|
+
start_time = start_time.replace(" ", "|")
|
|
18753
|
+
|
|
18754
|
+
label = start_time
|
|
18755
|
+
|
|
18756
|
+
return (
|
|
18757
|
+
f"<span style='text-decoration-style: solid; text-decoration-color: #ADD8E6; "
|
|
18758
|
+
f"text-decoration-line: underline; text-underline-position: under; color: #333333; "
|
|
18759
|
+
f"font-variant-numeric: tabular-nums; padding-left: 4px; margin-right: 5px; "
|
|
18760
|
+
f"padding-right: 2px;'>{label}</span>"
|
|
18761
|
+
)
|
|
18762
|
+
|
|
18763
|
+
|
|
18764
|
+
def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) -> str:
|
|
18765
|
+
"""Format a single integer using Great Tables GT object to avoid pandas dependency."""
|
|
18766
|
+
if df_lib is None:
|
|
18767
|
+
# Use library detection to select appropriate DataFrame library
|
|
18768
|
+
if _is_lib_present("polars"):
|
|
18769
|
+
import polars as pl
|
|
18770
|
+
|
|
18771
|
+
df_lib = pl
|
|
18772
|
+
elif _is_lib_present("pandas"): # pragma: no cover
|
|
18773
|
+
import pandas as pd # pragma: no cover
|
|
18774
|
+
|
|
18775
|
+
df_lib = pd # pragma: no cover
|
|
18776
|
+
else: # pragma: no cover
|
|
18777
|
+
raise ImportError(
|
|
18778
|
+
"Neither Polars nor Pandas is available for formatting"
|
|
18779
|
+
) # pragma: no cover
|
|
18780
|
+
|
|
18781
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
18782
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
18783
|
+
|
|
18784
|
+
# Create GT object and format the column
|
|
18785
|
+
gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
|
|
18786
|
+
|
|
18787
|
+
# Extract the formatted value using _get_column_of_values
|
|
18788
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
18789
|
+
|
|
18790
|
+
return formatted_values[0] # Return the single formatted value
|
|
18791
|
+
|
|
18792
|
+
|
|
18793
|
+
def _format_single_float_with_gt_custom(
|
|
18794
|
+
value: float,
|
|
18795
|
+
decimals: int = 2,
|
|
18796
|
+
drop_trailing_zeros: bool = False,
|
|
18797
|
+
locale: str = "en",
|
|
18798
|
+
df_lib=None,
|
|
18799
|
+
) -> str:
|
|
18800
|
+
"""Format a single float with custom options using Great Tables GT object to avoid pandas dependency."""
|
|
18801
|
+
if df_lib is None:
|
|
18802
|
+
# Use library detection to select appropriate DataFrame library
|
|
18803
|
+
if _is_lib_present("polars"):
|
|
18804
|
+
import polars as pl
|
|
18805
|
+
|
|
18806
|
+
df_lib = pl
|
|
18807
|
+
elif _is_lib_present("pandas"): # pragma: no cover
|
|
18808
|
+
import pandas as pd # pragma: no cover
|
|
18809
|
+
|
|
18810
|
+
df_lib = pd # pragma: no cover
|
|
18811
|
+
else: # pragma: no cover
|
|
18812
|
+
raise ImportError(
|
|
18813
|
+
"Neither Polars nor Pandas is available for formatting"
|
|
18814
|
+
) # pragma: no cover
|
|
18815
|
+
|
|
18816
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
18817
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
18818
|
+
|
|
18819
|
+
# Create GT object and format the column
|
|
18820
|
+
gt_obj = GT(df).fmt_number(
|
|
18821
|
+
columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
|
|
18822
|
+
)
|
|
18823
|
+
|
|
18824
|
+
# Extract the formatted value using _get_column_of_values
|
|
18825
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
18826
|
+
|
|
18827
|
+
return formatted_values[0] # Return the single formatted value
|
|
18828
|
+
|
|
18829
|
+
|
|
18830
|
+
def _format_number_safe(
|
|
18831
|
+
value: float, decimals: int, drop_trailing_zeros: bool = False, locale: str = "en", df_lib=None
|
|
18832
|
+
) -> str:
|
|
18833
|
+
"""
|
|
18834
|
+
Safely format a float value with locale support.
|
|
18835
|
+
|
|
18836
|
+
Uses GT-based formatting when a DataFrame library is available, otherwise falls back to
|
|
18837
|
+
vals.fmt_number. This helper is used by threshold formatting functions.
|
|
18838
|
+
"""
|
|
18839
|
+
if df_lib is not None and value is not None:
|
|
18840
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
18841
|
+
return _format_single_float_with_gt_custom(
|
|
18842
|
+
value,
|
|
18843
|
+
decimals=decimals,
|
|
18844
|
+
drop_trailing_zeros=drop_trailing_zeros,
|
|
18845
|
+
locale=locale,
|
|
18846
|
+
df_lib=df_lib,
|
|
18847
|
+
)
|
|
18848
|
+
else:
|
|
18849
|
+
# Fallback to the original behavior
|
|
18850
|
+
return fmt_number(
|
|
18851
|
+
value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
|
|
18852
|
+
)[0] # pragma: no cover
|
|
18853
|
+
|
|
18854
|
+
|
|
18855
|
+
def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
|
|
18856
|
+
"""
|
|
18857
|
+
Safely format an integer value with locale support.
|
|
18858
|
+
|
|
18859
|
+
Uses GT-based formatting when a DataFrame library is available, otherwise falls back to
|
|
18860
|
+
vals.fmt_integer. This helper is used by threshold formatting functions.
|
|
18861
|
+
"""
|
|
18862
|
+
if df_lib is not None and value is not None:
|
|
18863
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
18864
|
+
return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
|
|
18865
|
+
else:
|
|
18866
|
+
# Fallback to the original behavior
|
|
18867
|
+
return fmt_integer(value, locale=locale)[0]
|
|
18868
|
+
|
|
18869
|
+
|
|
18870
|
+
def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
|
|
18871
|
+
if thresholds == Thresholds():
|
|
18872
|
+
return ""
|
|
18873
|
+
|
|
18874
|
+
warning = (
|
|
18875
|
+
_format_number_safe(
|
|
18876
|
+
thresholds.warning_fraction,
|
|
18877
|
+
decimals=3,
|
|
18878
|
+
drop_trailing_zeros=True,
|
|
18879
|
+
locale=locale,
|
|
18880
|
+
df_lib=df_lib,
|
|
18881
|
+
)
|
|
18882
|
+
if thresholds.warning_fraction is not None
|
|
18883
|
+
else (
|
|
18884
|
+
_format_integer_safe(thresholds.warning_count, locale=locale, df_lib=df_lib)
|
|
18885
|
+
if thresholds.warning_count is not None
|
|
18886
|
+
else "—"
|
|
18887
|
+
)
|
|
18888
|
+
)
|
|
18889
|
+
|
|
18890
|
+
error = (
|
|
18891
|
+
_format_number_safe(
|
|
18892
|
+
thresholds.error_fraction,
|
|
18893
|
+
decimals=3,
|
|
18894
|
+
drop_trailing_zeros=True,
|
|
18895
|
+
locale=locale,
|
|
18896
|
+
df_lib=df_lib,
|
|
18897
|
+
)
|
|
18898
|
+
if thresholds.error_fraction is not None
|
|
18899
|
+
else (
|
|
18900
|
+
_format_integer_safe(thresholds.error_count, locale=locale, df_lib=df_lib)
|
|
18901
|
+
if thresholds.error_count is not None
|
|
18902
|
+
else "—"
|
|
18903
|
+
)
|
|
18904
|
+
)
|
|
18905
|
+
|
|
18906
|
+
critical = (
|
|
18907
|
+
_format_number_safe(
|
|
18908
|
+
thresholds.critical_fraction,
|
|
18909
|
+
decimals=3,
|
|
18910
|
+
drop_trailing_zeros=True,
|
|
18911
|
+
locale=locale,
|
|
18912
|
+
df_lib=df_lib,
|
|
18913
|
+
)
|
|
18914
|
+
if thresholds.critical_fraction is not None
|
|
18915
|
+
else (
|
|
18916
|
+
_format_integer_safe(thresholds.critical_count, locale=locale, df_lib=df_lib)
|
|
18917
|
+
if thresholds.critical_count is not None
|
|
18918
|
+
else "—"
|
|
18919
|
+
)
|
|
18920
|
+
)
|
|
18921
|
+
|
|
18922
|
+
warning_color = SEVERITY_LEVEL_COLORS["warning"]
|
|
18923
|
+
error_color = SEVERITY_LEVEL_COLORS["error"]
|
|
18924
|
+
critical_color = SEVERITY_LEVEL_COLORS["critical"]
|
|
18925
|
+
|
|
18926
|
+
return (
|
|
18927
|
+
"<span>"
|
|
18928
|
+
f'<span style="background-color: {warning_color}; color: white; '
|
|
18929
|
+
"padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
|
|
18930
|
+
f"margin: 5px 0px 5px 5px; border: solid 1px {warning_color}; "
|
|
18931
|
+
'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">WARNING</span>'
|
|
18932
|
+
'<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
|
|
18933
|
+
"position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
|
|
18934
|
+
f"border: solid 1px {warning_color}; padding: 2px 15px 2px 15px; "
|
|
18935
|
+
'font-size: smaller; margin-right: 5px;">'
|
|
18936
|
+
f"{warning}"
|
|
18937
|
+
"</span>"
|
|
18938
|
+
f'<span style="background-color: {error_color}; color: white; '
|
|
18939
|
+
"padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
|
|
18940
|
+
f"margin: 5px 0px 5px 1px; border: solid 1px {error_color}; "
|
|
18941
|
+
'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">ERROR</span>'
|
|
18942
|
+
'<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
|
|
18943
|
+
"position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
|
|
18944
|
+
f"border: solid 1px {error_color}; padding: 2px 15px 2px 15px; "
|
|
18945
|
+
'font-size: smaller; margin-right: 5px;">'
|
|
18946
|
+
f"{error}"
|
|
18947
|
+
"</span>"
|
|
18948
|
+
f'<span style="background-color: {critical_color}; color: white; '
|
|
18949
|
+
"padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
|
|
18950
|
+
f"margin: 5px 0px 5px 1px; border: solid 1px {critical_color}; "
|
|
18951
|
+
'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">CRITICAL</span>'
|
|
18952
|
+
'<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
|
|
18953
|
+
"position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
|
|
18954
|
+
f"border: solid 1px {critical_color}; padding: 2px 15px 2px 15px; "
|
|
18955
|
+
'font-size: smaller;">'
|
|
18956
|
+
f"{critical}"
|
|
18957
|
+
"</span>"
|
|
18958
|
+
"</span>"
|
|
18959
|
+
)
|
|
18960
|
+
|
|
18961
|
+
|
|
18962
|
+
def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en") -> str:
|
|
18963
|
+
"""
|
|
18964
|
+
Create a miniature HTML representation of local thresholds for display in notes.
|
|
18965
|
+
|
|
18966
|
+
This function generates a compact HTML representation of threshold values that is suitable for
|
|
18967
|
+
display in validation step notes/footnotes. It follows a similar visual style to the global
|
|
18968
|
+
thresholds shown in the header, but with a more compact format.
|
|
18969
|
+
|
|
18970
|
+
Parameters
|
|
18971
|
+
----------
|
|
18972
|
+
thresholds
|
|
18973
|
+
The Thresholds object containing the local threshold values.
|
|
18974
|
+
locale
|
|
18975
|
+
The locale to use for formatting numbers (default: "en").
|
|
18976
|
+
|
|
18977
|
+
Returns
|
|
18978
|
+
-------
|
|
18979
|
+
str
|
|
18980
|
+
HTML string containing the formatted threshold information.
|
|
18981
|
+
"""
|
|
18982
|
+
if thresholds == Thresholds():
|
|
18983
|
+
return ""
|
|
18984
|
+
|
|
18985
|
+
# Get df_lib for formatting
|
|
18986
|
+
df_lib = None
|
|
18987
|
+
if _is_lib_present("polars"):
|
|
18988
|
+
import polars as pl
|
|
18989
|
+
|
|
18990
|
+
df_lib = pl
|
|
18991
|
+
elif _is_lib_present("pandas"):
|
|
18992
|
+
import pandas as pd
|
|
18993
|
+
|
|
18994
|
+
df_lib = pd
|
|
18995
|
+
|
|
18996
|
+
# Helper function to format threshold values using the shared formatting functions
|
|
18997
|
+
def _format_threshold_value(fraction: float | None, count: int | None) -> str:
|
|
18998
|
+
if fraction is not None:
|
|
18999
|
+
# Format as fraction/percentage with locale formatting
|
|
19000
|
+
if fraction == 0:
|
|
19001
|
+
return "0"
|
|
19002
|
+
elif fraction < 0.01:
|
|
19003
|
+
# For very small fractions, show "<0.01" with locale formatting
|
|
19004
|
+
formatted = _format_number_safe(0.01, decimals=2, locale=locale, df_lib=df_lib)
|
|
19005
|
+
return f"<{formatted}"
|
|
19006
|
+
else:
|
|
19007
|
+
# Use shared formatting function with drop_trailing_zeros
|
|
19008
|
+
formatted = _format_number_safe(
|
|
19009
|
+
fraction, decimals=2, drop_trailing_zeros=True, locale=locale, df_lib=df_lib
|
|
19010
|
+
)
|
|
19011
|
+
return formatted
|
|
19012
|
+
elif count is not None:
|
|
19013
|
+
# Format integer count using shared formatting function
|
|
19014
|
+
return _format_integer_safe(count, locale=locale, df_lib=df_lib)
|
|
19015
|
+
else:
|
|
19016
|
+
return "—"
|
|
19017
|
+
|
|
19018
|
+
warning = _format_threshold_value(thresholds.warning_fraction, thresholds.warning_count)
|
|
19019
|
+
error = _format_threshold_value(thresholds.error_fraction, thresholds.error_count)
|
|
19020
|
+
critical = _format_threshold_value(thresholds.critical_fraction, thresholds.critical_count)
|
|
19021
|
+
|
|
19022
|
+
warning_color = SEVERITY_LEVEL_COLORS["warning"]
|
|
19023
|
+
error_color = SEVERITY_LEVEL_COLORS["error"]
|
|
19024
|
+
critical_color = SEVERITY_LEVEL_COLORS["critical"]
|
|
19025
|
+
|
|
19026
|
+
# Build threshold parts with colored letters in monospace font
|
|
19027
|
+
threshold_parts = []
|
|
19028
|
+
|
|
19029
|
+
# Add warning threshold if set
|
|
19030
|
+
if thresholds.warning is not None:
|
|
19031
|
+
threshold_parts.append(
|
|
19032
|
+
f'<span style="color: {warning_color}; font-weight: bold;">W</span>:{warning}'
|
|
19033
|
+
)
|
|
19034
|
+
|
|
19035
|
+
# Add error threshold if set
|
|
19036
|
+
if thresholds.error is not None:
|
|
19037
|
+
threshold_parts.append(
|
|
19038
|
+
f'<span style="color: {error_color}; font-weight: bold;">E</span>:{error}'
|
|
19039
|
+
)
|
|
19040
|
+
|
|
19041
|
+
# Add critical threshold if set
|
|
19042
|
+
if thresholds.critical is not None:
|
|
19043
|
+
threshold_parts.append(
|
|
19044
|
+
f'<span style="color: {critical_color}; font-weight: bold;">C</span>:{critical}'
|
|
19045
|
+
)
|
|
19046
|
+
|
|
19047
|
+
# Join with "|" separator (only between multiple thresholds)
|
|
19048
|
+
thresholds_html = f'<span style="font-family: monospace;">{"|".join(threshold_parts)}</span>'
|
|
19049
|
+
|
|
19050
|
+
# Get localized text and format with threshold HTML
|
|
19051
|
+
localized_text = NOTES_TEXT["local_threshold"].get(locale, NOTES_TEXT["local_threshold"]["en"])
|
|
19052
|
+
note_html = localized_text.replace("{thresholds}", thresholds_html)
|
|
19053
|
+
|
|
19054
|
+
return note_html
|
|
19055
|
+
|
|
19056
|
+
|
|
19057
|
+
def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
|
|
19058
|
+
"""
|
|
19059
|
+
Create a plain text representation of local thresholds for display in logs.
|
|
19060
|
+
|
|
19061
|
+
This function generates a plain text representation of threshold values that is
|
|
19062
|
+
suitable for display in text-based output such as logs or console output.
|
|
19063
|
+
|
|
19064
|
+
Parameters
|
|
19065
|
+
----------
|
|
19066
|
+
thresholds
|
|
19067
|
+
The Thresholds object containing the local threshold values.
|
|
19068
|
+
|
|
19069
|
+
Returns
|
|
19070
|
+
-------
|
|
19071
|
+
str
|
|
19072
|
+
Plain text string containing the formatted threshold information.
|
|
19073
|
+
"""
|
|
19074
|
+
if thresholds == Thresholds():
|
|
19075
|
+
return ""
|
|
19076
|
+
|
|
19077
|
+
# Helper function to format threshold values
|
|
19078
|
+
def _format_threshold_value(fraction: float | None, count: int | None) -> str:
|
|
19079
|
+
if fraction is not None:
|
|
19080
|
+
if fraction == 0:
|
|
19081
|
+
return "0"
|
|
19082
|
+
elif fraction < 0.01:
|
|
19083
|
+
return "<0.01"
|
|
19084
|
+
else:
|
|
19085
|
+
return f"{fraction:.2f}".rstrip("0").rstrip(".")
|
|
19086
|
+
elif count is not None:
|
|
19087
|
+
return str(count)
|
|
19088
|
+
else:
|
|
19089
|
+
return "—"
|
|
19090
|
+
|
|
19091
|
+
parts = []
|
|
19092
|
+
|
|
19093
|
+
if thresholds.warning is not None:
|
|
19094
|
+
warning = _format_threshold_value(thresholds.warning_fraction, thresholds.warning_count)
|
|
19095
|
+
parts.append(f"W: {warning}")
|
|
19096
|
+
|
|
19097
|
+
if thresholds.error is not None:
|
|
19098
|
+
error = _format_threshold_value(thresholds.error_fraction, thresholds.error_count)
|
|
19099
|
+
parts.append(f"E: {error}")
|
|
19100
|
+
|
|
19101
|
+
if thresholds.critical is not None:
|
|
19102
|
+
critical = _format_threshold_value(thresholds.critical_fraction, thresholds.critical_count)
|
|
19103
|
+
parts.append(f"C: {critical}")
|
|
19104
|
+
|
|
19105
|
+
if parts:
|
|
19106
|
+
return "Step-specific thresholds set: " + ", ".join(parts)
|
|
19107
|
+
else:
|
|
19108
|
+
return ""
|
|
19109
|
+
|
|
19110
|
+
|
|
19111
|
+
def _create_threshold_reset_note_html(locale: str = "en") -> str:
|
|
19112
|
+
"""
|
|
19113
|
+
Create an HTML note for when thresholds are explicitly reset to empty.
|
|
19114
|
+
|
|
19115
|
+
Parameters
|
|
19116
|
+
----------
|
|
19117
|
+
locale
|
|
19118
|
+
The locale string (e.g., 'en', 'fr').
|
|
19119
|
+
|
|
19120
|
+
Returns
|
|
19121
|
+
-------
|
|
19122
|
+
str
|
|
19123
|
+
HTML-formatted note text.
|
|
19124
|
+
"""
|
|
19125
|
+
text = NOTES_TEXT.get("local_threshold_reset", {}).get(
|
|
19126
|
+
locale, NOTES_TEXT.get("local_threshold_reset", {}).get("en", "")
|
|
19127
|
+
)
|
|
19128
|
+
return text
|
|
19129
|
+
|
|
19130
|
+
|
|
19131
|
+
def _create_threshold_reset_note_text() -> str:
|
|
19132
|
+
"""
|
|
19133
|
+
Create a plain text note for when thresholds are explicitly reset to empty.
|
|
19134
|
+
|
|
19135
|
+
Returns
|
|
19136
|
+
-------
|
|
19137
|
+
str
|
|
19138
|
+
Plain text note.
|
|
19139
|
+
"""
|
|
19140
|
+
return "Global thresholds explicitly not used for this step."
|
|
19141
|
+
|
|
19142
|
+
|
|
19143
|
+
def _create_no_columns_resolved_note_html(
|
|
19144
|
+
column_expr: str, available_columns: list[str], locale: str = "en"
|
|
19145
|
+
) -> str:
|
|
19146
|
+
"""
|
|
19147
|
+
Create an HTML note explaining that a column expression resolved to no columns.
|
|
19148
|
+
|
|
19149
|
+
Parameters
|
|
19150
|
+
----------
|
|
19151
|
+
column_expr
|
|
19152
|
+
The column expression that failed to resolve columns (as a string).
|
|
19153
|
+
available_columns
|
|
19154
|
+
List of available column names in the table.
|
|
19155
|
+
locale
|
|
19156
|
+
The locale string (e.g., 'en', 'fr').
|
|
19157
|
+
|
|
19158
|
+
Returns
|
|
19159
|
+
-------
|
|
19160
|
+
str
|
|
19161
|
+
HTML-formatted note text.
|
|
19162
|
+
"""
|
|
19163
|
+
# Get translated strings
|
|
19164
|
+
intro = NOTES_TEXT.get("column_not_found_intro", {}).get(
|
|
19165
|
+
locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression")
|
|
19166
|
+
)
|
|
19167
|
+
no_resolve = NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
|
|
19168
|
+
locale,
|
|
19169
|
+
NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
|
|
19170
|
+
"en", "does not resolve to any columns"
|
|
19171
|
+
),
|
|
19172
|
+
)
|
|
19173
|
+
|
|
19174
|
+
# Format the column expression with monospace font
|
|
19175
|
+
col_expr_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_expr}</code>"
|
|
19176
|
+
|
|
19177
|
+
# Build the HTML note
|
|
19178
|
+
html = f"{intro} {col_expr_html} {no_resolve}."
|
|
19179
|
+
|
|
19180
|
+
return html
|
|
19181
|
+
|
|
19182
|
+
|
|
19183
|
+
def _create_no_columns_resolved_note_text(column_expr: str, available_columns: list[str]) -> str:
|
|
19184
|
+
"""
|
|
19185
|
+
Create a plain text note explaining that a column expression resolved to no columns.
|
|
19186
|
+
|
|
19187
|
+
Parameters
|
|
19188
|
+
----------
|
|
19189
|
+
column_expr
|
|
19190
|
+
The column expression that failed to resolve columns (as a string).
|
|
19191
|
+
available_columns
|
|
19192
|
+
List of available column names in the table.
|
|
19193
|
+
|
|
19194
|
+
Returns
|
|
19195
|
+
-------
|
|
19196
|
+
str
|
|
19197
|
+
Plain text note.
|
|
19198
|
+
"""
|
|
19199
|
+
return f"The column expression `{column_expr}` does not resolve to any columns."
|
|
19200
|
+
|
|
19201
|
+
|
|
19202
|
+
def _create_column_not_found_note_html(
|
|
19203
|
+
column_name: str, available_columns: list[str], locale: str = "en"
|
|
19204
|
+
) -> str:
|
|
19205
|
+
"""
|
|
19206
|
+
Create an HTML note explaining that a specific column was not found.
|
|
19207
|
+
|
|
19208
|
+
Parameters
|
|
19209
|
+
----------
|
|
19210
|
+
column_name
|
|
19211
|
+
The column name that was not found.
|
|
19212
|
+
available_columns
|
|
19213
|
+
List of available column names in the table.
|
|
19214
|
+
locale
|
|
19215
|
+
The locale string (e.g., 'en', 'fr').
|
|
19216
|
+
|
|
19217
|
+
Returns
|
|
19218
|
+
-------
|
|
19219
|
+
str
|
|
19220
|
+
HTML-formatted note text.
|
|
19221
|
+
"""
|
|
19222
|
+
# Get translated strings
|
|
19223
|
+
intro = NOTES_TEXT.get("target_column_provided", {}).get(
|
|
19224
|
+
locale, NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided")
|
|
19225
|
+
)
|
|
19226
|
+
not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
|
|
19227
|
+
locale,
|
|
19228
|
+
NOTES_TEXT.get("does_not_match_any_columns", {}).get(
|
|
19229
|
+
"en", "does not match any columns in the table"
|
|
19230
|
+
),
|
|
19231
|
+
)
|
|
19232
|
+
|
|
19233
|
+
# Format the column name with monospace font
|
|
19234
|
+
col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
|
|
19235
|
+
|
|
19236
|
+
# Build the HTML note
|
|
19237
|
+
html = f"{intro} ({col_name_html}) {not_found}."
|
|
19238
|
+
|
|
19239
|
+
return html
|
|
19240
|
+
|
|
19241
|
+
|
|
19242
|
+
def _create_column_not_found_note_text(column_name: str, available_columns: list[str]) -> str:
|
|
19243
|
+
"""
|
|
19244
|
+
Create a plain text note explaining that a specific column was not found.
|
|
19245
|
+
|
|
19246
|
+
Parameters
|
|
19247
|
+
----------
|
|
19248
|
+
column_name
|
|
19249
|
+
The column name that was not found.
|
|
19250
|
+
available_columns
|
|
19251
|
+
List of available column names in the table.
|
|
19252
|
+
|
|
19253
|
+
Returns
|
|
19254
|
+
-------
|
|
19255
|
+
str
|
|
19256
|
+
Plain text note.
|
|
19257
|
+
"""
|
|
19258
|
+
return f"The target column provided ({column_name}) does not match any columns in the table."
|
|
19259
|
+
|
|
19260
|
+
|
|
19261
|
+
def _create_comparison_column_not_found_note_html(
|
|
19262
|
+
column_name: str, position: str | None, available_columns: list[str], locale: str = "en"
|
|
19263
|
+
) -> str:
|
|
19264
|
+
"""
|
|
19265
|
+
Create an HTML note explaining that a comparison column was not found.
|
|
19266
|
+
|
|
19267
|
+
Parameters
|
|
19268
|
+
----------
|
|
19269
|
+
column_name
|
|
19270
|
+
The comparison column name that was not found.
|
|
19271
|
+
position
|
|
19272
|
+
Optional position indicator ("left", "right") for between/outside validations.
|
|
19273
|
+
available_columns
|
|
19274
|
+
List of available column names in the table.
|
|
19275
|
+
locale
|
|
19276
|
+
The locale string (e.g., 'en', 'fr').
|
|
19277
|
+
|
|
19278
|
+
Returns
|
|
19279
|
+
-------
|
|
19280
|
+
str
|
|
19281
|
+
HTML-formatted note text.
|
|
19282
|
+
"""
|
|
19283
|
+
# Get translated strings
|
|
19284
|
+
intro = NOTES_TEXT.get("comparison_column_provided", {}).get(
|
|
19285
|
+
locale,
|
|
19286
|
+
NOTES_TEXT.get("comparison_column_provided", {}).get(
|
|
19287
|
+
"en", "The comparison column provided"
|
|
19288
|
+
),
|
|
19289
|
+
)
|
|
19290
|
+
intro_with_for = NOTES_TEXT.get("comparison_column_for", {}).get(
|
|
19291
|
+
locale,
|
|
19292
|
+
NOTES_TEXT.get("comparison_column_for", {}).get("en", "The comparison column provided for"),
|
|
19293
|
+
)
|
|
19294
|
+
not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
|
|
19295
|
+
locale,
|
|
19296
|
+
NOTES_TEXT.get("does_not_match_any_columns", {}).get(
|
|
19297
|
+
"en", "does not match any columns in the table"
|
|
19298
|
+
),
|
|
19299
|
+
)
|
|
19300
|
+
|
|
19301
|
+
# Format the column name with monospace font
|
|
19302
|
+
col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
|
|
19303
|
+
|
|
19304
|
+
# Add position if provided (for between/outside validations)
|
|
19305
|
+
if position:
|
|
19306
|
+
# Format position parameter with monospace font (e.g., "left=", "right=")
|
|
19307
|
+
position_param = (
|
|
19308
|
+
f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{position}=</code>"
|
|
19309
|
+
)
|
|
19310
|
+
# Use the "for" version of the intro text
|
|
19311
|
+
html = f"{intro_with_for} {position_param} ({col_name_html}) {not_found}."
|
|
19312
|
+
else:
|
|
19313
|
+
# Use the standard intro text without "for"
|
|
19314
|
+
html = f"{intro} ({col_name_html}) {not_found}."
|
|
19315
|
+
|
|
19316
|
+
return html
|
|
19317
|
+
|
|
19318
|
+
|
|
19319
|
+
def _create_comparison_column_not_found_note_text(
|
|
19320
|
+
column_name: str, position: str | None, available_columns: list[str]
|
|
19321
|
+
) -> str:
|
|
19322
|
+
"""
|
|
19323
|
+
Create a plain text note explaining that a comparison column was not found.
|
|
19324
|
+
|
|
19325
|
+
Parameters
|
|
19326
|
+
----------
|
|
19327
|
+
column_name
|
|
19328
|
+
The comparison column name that was not found.
|
|
19329
|
+
position
|
|
19330
|
+
Optional position indicator ("left", "right") for between/outside validations.
|
|
19331
|
+
available_columns
|
|
19332
|
+
List of available column names in the table.
|
|
19333
|
+
|
|
19334
|
+
Returns
|
|
19335
|
+
-------
|
|
19336
|
+
str
|
|
19337
|
+
Plain text note.
|
|
19338
|
+
"""
|
|
19339
|
+
if position:
|
|
19340
|
+
position_text = f" for {position}="
|
|
19341
|
+
else:
|
|
19342
|
+
position_text = ""
|
|
19343
|
+
|
|
19344
|
+
return (
|
|
19345
|
+
f"The comparison column provided{position_text} ({column_name}) "
|
|
19346
|
+
f"does not match any columns in the table."
|
|
19347
|
+
)
|
|
19348
|
+
|
|
19349
|
+
|
|
19350
|
+
+def _create_preprocessing_note_html(
+    original_rows: int,
+    original_cols: int,
+    processed_rows: int,
+    processed_cols: int,
+    locale: str = "en",
+) -> str:
+    """
+    Create an HTML note showing table dimension changes from preprocessing.
+
+    Parameters
+    ----------
+    original_rows
+        Number of rows in the original table.
+    original_cols
+        Number of columns in the original table.
+    processed_rows
+        Number of rows after preprocessing.
+    processed_cols
+        Number of columns after preprocessing.
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    # Get translated strings
+    precondition_text = NOTES_TEXT.get("precondition_applied", {}).get(
+        locale, NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied")
+    )
+    table_dims_text = NOTES_TEXT.get("table_dimensions", {}).get(
+        locale, NOTES_TEXT.get("table_dimensions", {}).get("en", "table dimensions")
+    )
+
+    # Helper function to get singular or plural form
+    def get_row_text(count: int) -> str:
+        if count == 1:
+            return NOTES_TEXT.get("row", {}).get(locale, NOTES_TEXT.get("row", {}).get("en", "row"))
+        return NOTES_TEXT.get("rows", {}).get(locale, NOTES_TEXT.get("rows", {}).get("en", "rows"))
+
+    def get_col_text(count: int) -> str:
+        if count == 1:
+            return NOTES_TEXT.get("column", {}).get(
+                locale, NOTES_TEXT.get("column", {}).get("en", "column")
+            )
+        return NOTES_TEXT.get("columns", {}).get(
+            locale, NOTES_TEXT.get("columns", {}).get("en", "columns")
+        )
+
+    # Determine which dimensions changed
+    rows_changed = original_rows != processed_rows
+    cols_changed = original_cols != processed_cols
+
+    # Format original dimensions
+    original_rows_text = get_row_text(original_rows)
+    original_cols_text = get_col_text(original_cols)
+    original_dim = (
+        f'<span style="font-family: monospace;">'
+        f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}]"
+        f"</span>"
+    )
+
+    # Format processed dimensions with bold for changed values
+    processed_rows_text = get_row_text(processed_rows)
+    processed_cols_text = get_col_text(processed_cols)
+
+    if rows_changed:
+        rows_display = f"<strong>{processed_rows:,}</strong> {processed_rows_text}"
+    else:
+        rows_display = f"{processed_rows:,} {processed_rows_text}"
+
+    if cols_changed:
+        cols_display = f"<strong>{processed_cols}</strong> {processed_cols_text}"
+    else:
+        cols_display = f"{processed_cols} {processed_cols_text}"
+
+    processed_dim = f'<span style="font-family: monospace;">[{rows_display}, {cols_display}]</span>'
+
+    # Build the HTML note
+    html = f"{precondition_text}: {table_dims_text} {original_dim} → {processed_dim}."
+
+    return html
+
+
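
A sketch of the HTML note the helper above produces, assuming the "en" entries of NOTES_TEXT match the inline fallback strings (the dimensions are hypothetical; only the dimension that changed is wrapped in <strong>):

>>> from pointblank.validate import _create_preprocessing_note_html
>>> _create_preprocessing_note_html(1000, 5, 500, 5)
'Precondition applied: table dimensions <span style="font-family: monospace;">[1,000 rows, 5 columns]</span> → <span style="font-family: monospace;">[<strong>500</strong> rows, 5 columns]</span>.'
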
+def _create_preprocessing_note_text(
+    original_rows: int,
+    original_cols: int,
+    processed_rows: int,
+    processed_cols: int,
+) -> str:
+    """
+    Create a plain text note showing table dimension changes from preprocessing.
+
+    Parameters
+    ----------
+    original_rows
+        Number of rows in the original table.
+    original_cols
+        Number of columns in the original table.
+    processed_rows
+        Number of rows after preprocessing.
+    processed_cols
+        Number of columns after preprocessing.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    # Get singular or plural forms
+    original_rows_text = "row" if original_rows == 1 else "rows"
+    original_cols_text = "column" if original_cols == 1 else "columns"
+    processed_rows_text = "row" if processed_rows == 1 else "rows"
+    processed_cols_text = "column" if processed_cols == 1 else "columns"
+
+    return (
+        f"Precondition applied: table dimensions "
+        f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}] → "
+        f"[{processed_rows:,} {processed_rows_text}, {processed_cols} {processed_cols_text}]."
+    )
+
+
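
A doctest-style sketch of the plain-text variant above, with hypothetical dimensions; the second call shows the singular form when a count is exactly 1:

>>> from pointblank.validate import _create_preprocessing_note_text
>>> _create_preprocessing_note_text(1000, 5, 500, 5)
'Precondition applied: table dimensions [1,000 rows, 5 columns] → [500 rows, 5 columns].'
>>> _create_preprocessing_note_text(2, 3, 1, 3)
'Precondition applied: table dimensions [2 rows, 3 columns] → [1 row, 3 columns].'
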
+def _create_preprocessing_no_change_note_html(locale: str = "en") -> str:
+    """
+    Create an HTML note indicating preprocessing was applied with no dimension change.
+
+    Parameters
+    ----------
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    # Get translated string
+    note_text = NOTES_TEXT.get("precondition_applied_no_change", {}).get(
+        locale,
+        NOTES_TEXT.get("precondition_applied_no_change", {}).get(
+            "en", "Precondition applied: no table dimension change"
+        ),
+    )
+
+    return f"{note_text}."
+
+
+def _create_preprocessing_no_change_note_text() -> str:
     """
-    Create
+    Create a plain text note indicating preprocessing was applied with no dimension change.

-
-
-
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    return "Precondition applied: no table dimension change."
+
+
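
The no-change variants take no table data at all; the plain-text one returns a fixed sentence, so a minimal sketch is:

>>> from pointblank.validate import _create_preprocessing_no_change_note_text
>>> _create_preprocessing_no_change_note_text()
'Precondition applied: no table dimension change.'
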
+def _create_synthetic_target_column_note_html(column_name: str, locale: str = "en") -> str:
+    """
+    Create an HTML note indicating that the target column was created via preprocessing.

     Parameters
     ----------
-
-
+    column_name
+        The name of the synthetic target column.
+    locale
+        The locale string (e.g., 'en', 'fr').

     Returns
     -------
     str
-
+        HTML-formatted note text.
     """
-    #
-
-
-
-
-
-
-
-                "step": step.i,
-                "key": key,
-                "markdown": content["markdown"],
-                "text": content["text"],
-            }
-        )
-
-    # If no notes, return empty string
-    if not all_notes:
-        return ""
+    # Get translated strings
+    synthetic_text = NOTES_TEXT.get("synthetic_target_column", {}).get(
+        locale, NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column")
+    )
+    created_via_text = NOTES_TEXT.get("created_via_preprocessing", {}).get(
+        locale,
+        NOTES_TEXT.get("created_via_preprocessing", {}).get("en", "created via preprocessing"),
+    )

-    #
-
-    notes_parts = [
-        (
-            "<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
-            "border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
-        ),
-        "<strong>Notes</strong>",
-        "",
-    ]
+    # Format the column name with monospace font
+    col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"

-
-
-        # Determine if this is the first note for this step
-        is_first_for_step = note["step"] != previous_step
-        previous_step = note["step"]
+    # Build the HTML note
+    html = f"{synthetic_text} {col_name_html} {created_via_text}."

-
-        # Use lighter color for subsequent notes of the same step
-        step_color = "#333333" if is_first_for_step else "#999999"
-        step_label = (
-            f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
-            f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
-        )
+    return html

-        # Format note key in monospaced font with smaller size
-        note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"

-
-
-
-        notes_parts.append("")  # Add blank line between notes
+def _create_synthetic_target_column_note_text(column_name: str) -> str:
+    """
+    Create a plain text note indicating that the target column was created via preprocessing.

-
-
-
+    Parameters
+    ----------
+    column_name
+        The name of the synthetic target column.

-
-
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    return f"Synthetic target column ({column_name}) created via preprocessing."

-    return notes_markdown

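
A doctest-style sketch of the plain-text helper above; "ratio" is a hypothetical column name:

>>> from pointblank.validate import _create_synthetic_target_column_note_text
>>> _create_synthetic_target_column_note_text("ratio")
'Synthetic target column (ratio) created via preprocessing.'
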
+def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> str:
+    """
+    Create an HTML note with collapsible schema expectation and results.

-
-
-
-    start_time = str(start_time).split(".")[0]
+    This generates a disclosure-style note showing:
+    1. A summary of what failed (if anything)
+    2. The full step report table (collapsible)

-
-
+    Parameters
+    ----------
+    schema_info
+        The schema validation information dictionary from interrogation.
+    locale
+        The locale string (e.g., 'en', 'fr').

-
+    Returns
+    -------
+    str
+        HTML-formatted note with collapsible schema details.
+    """
+    passed = schema_info["passed"]
+    expect_schema = schema_info["expect_schema"]
+    target_schema = schema_info["target_schema"]
+    params = schema_info["params"]
+    columns_dict = schema_info["columns"]
+    in_order = params["in_order"]

-
-
-
-
-
+    # Get translations for the locale
+    passed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_passed"].get(
+        locale, VALIDATION_REPORT_TEXT["note_schema_comparison_passed"]["en"]
+    )
+    failed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_failed"].get(
+        locale, VALIDATION_REPORT_TEXT["note_schema_comparison_failed"]["en"]
+    )
+    disclosure_text = VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"].get(
+        locale, VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"]["en"]
+    )
+    settings_title_text = VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"].get(
+        locale, VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"]
     )

+    # Build summary message
+    if passed:
+        summary = f'<span style="color:#4CA64C;">✓</span> {passed_text}.'
+    else:
+        # Analyze what failed
+        failures = []

-
-
-
-
-
-
-
-
-    elif _is_lib_present("pandas"):  # pragma: no cover
-        import pandas as pd  # pragma: no cover
-
-        df_lib = pd  # pragma: no cover
-    else:  # pragma: no cover
-        raise ImportError(
-            "Neither Polars nor Pandas is available for formatting"
-        )  # pragma: no cover
-
-    # Create a single-row, single-column DataFrame using the specified library
-    df = df_lib.DataFrame({"value": [value]})
-
-    # Create GT object and format the column
-    gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
+        # Check column count mismatch
+        n_expect = len(expect_schema)
+        n_target = len(target_schema)
+        if n_expect != n_target:
+            count_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"].get(
+                locale, VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"]
+            )
+            failures.append(count_mismatch_text.format(n_expect=n_expect, n_target=n_target))

-
-
+        # Check for unmatched columns
+        unmatched_cols = [col for col, info in columns_dict.items() if not info["colname_matched"]]
+        if unmatched_cols:
+            unmatched_text = VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"].get(
+                locale, VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"]["en"]
+            )
+            failures.append(unmatched_text.format(n=len(unmatched_cols)))
+
+        # Check for wrong order (if in_order=True)
+        if params["in_order"]:
+            wrong_order = [
+                col
+                for col, info in columns_dict.items()
+                if info["colname_matched"] and not info["index_matched"]
+            ]
+            if wrong_order:
+                wrong_order_text = VALIDATION_REPORT_TEXT["note_schema_wrong_order"].get(
+                    locale, VALIDATION_REPORT_TEXT["note_schema_wrong_order"]["en"]
+                )
+                failures.append(wrong_order_text.format(n=len(wrong_order)))

-
+        # Check for dtype mismatches
+        dtype_mismatches = [
+            col
+            for col, info in columns_dict.items()
+            if info["colname_matched"] and info["dtype_present"] and not info["dtype_matched"]
+        ]
+        if dtype_mismatches:
+            dtype_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"].get(
+                locale, VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"]["en"]
+            )
+            failures.append(dtype_mismatch_text.format(n=len(dtype_mismatches)))

+        if failures:
+            summary = (
+                f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
+            )
+        else:
+            summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.'

-
-
-
-
-
-
-    )
-
-
-
-
-
+    # Generate the step report table using the existing function
+    # We'll call either _step_report_schema_in_order or _step_report_schema_any_order
+    # depending on the in_order parameter
+    if in_order:
+        step_report_gt = _step_report_schema_in_order(
+            step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
+        )
+    else:
+        step_report_gt = _step_report_schema_any_order(
+            step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
+        )
+
+    # Generate the settings HTML using the existing function
+    settings_html = _create_col_schema_match_params_html(
+        lang=locale,
+        complete=params["complete"],
+        in_order=params["in_order"],
+        case_sensitive_colnames=params["case_sensitive_colnames"],
+        case_sensitive_dtypes=params["case_sensitive_dtypes"],
+        full_match_dtypes=params["full_match_dtypes"],
+    )

-
-
-        import pandas as pd  # pragma: no cover
+    # Remove the inner div containing column_schema_match_str
+    settings_html = re.sub(r'<div style="margin-right: 5px;">.*?</div>', "", settings_html, count=1)

-
-
-        raise ImportError(
-            "Neither Polars nor Pandas is available for formatting"
-        )  # pragma: no cover
+    # Change padding-top from 7px to 2px
+    settings_html = settings_html.replace("padding-top: 7px;", "padding-top: 2px;")

-    # Create
-
+    # Create new source note HTML that includes both settings and schema
+    source_note_html = f"""
+    <div style='padding-bottom: 2px;'>{settings_title_text}</div>
+    <div style='padding-bottom: 4px;'>{settings_html}</div>
+    """

-    #
-
-        columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
-    )
+    # Add the settings as an additional source note to the step report
+    step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html))

-    # Extract the
-
+    # Extract the HTML from the GT object
+    step_report_html = step_report_gt._repr_html_()

-
+    # Create collapsible section with the step report
+    note_html = f"""
+    {summary}

+    <details style="margin-top: 2px; margin-bottom: 8px; font-size: 12px; text-indent: 12px;">
+    <summary style="cursor: pointer; font-weight: bold; color: #555; margin-bottom: -5px;">{disclosure_text}</summary>
+    <div style="margin-top: 6px; padding-left: 15px; padding-right: 15px;">

-
-    if thresholds == Thresholds():
-        return ""
+    {step_report_html}

-
-
-
-        # Use GT-based formatting to avoid Pandas dependency completely
-        return _format_single_float_with_gt_custom(
-            value,
-            decimals=decimals,
-            drop_trailing_zeros=drop_trailing_zeros,
-            locale=locale,
-            df_lib=df_lib,
-        )
-    else:
-        # Fallback to the original behavior
-        return fmt_number(
-            value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
-        )[0]  # pragma: no cover
+    </div>
+    </details>
+    """

-
-    if df_lib is not None and value is not None:
-        # Use GT-based formatting to avoid Pandas dependency completely
-        return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
-    else:
-        # Fallback to the original behavior
-        return fmt_integer(value, locale=locale)[0]
+    return note_html.strip()

-    warning = (
-        _format_number_safe(thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True)
-        if thresholds.warning_fraction is not None
-        else (
-            _format_integer_safe(thresholds.warning_count)
-            if thresholds.warning_count is not None
-            else "—"
-        )
-    )

-
-
-
-        else (
-            _format_integer_safe(thresholds.error_count)
-            if thresholds.error_count is not None
-            else "—"
-        )
-    )
+def _create_col_schema_match_note_text(schema_info: dict) -> str:
+    """
+    Create a plain text note for schema validation.

-
-
-
-
-            _format_integer_safe(thresholds.critical_count)
-            if thresholds.critical_count is not None
-            else "—"
-        )
-    )
+    Parameters
+    ----------
+    schema_info
+        The schema validation information dictionary from interrogation.

-
-
-
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    passed = schema_info["passed"]
+    expect_schema = schema_info["expect_schema"]
+    target_schema = schema_info["target_schema"]

-
-        "
-
-        "
-        f"margin: 5px 0px 5px 5px; border: solid 1px {warning_color}; "
-        'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">WARNING</span>'
-        '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
-        "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
-        f"border: solid 1px {warning_color}; padding: 2px 15px 2px 15px; "
-        'font-size: smaller; margin-right: 5px;">'
-        f"{warning}"
-        "</span>"
-        f'<span style="background-color: {error_color}; color: white; '
-        "padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
-        f"margin: 5px 0px 5px 1px; border: solid 1px {error_color}; "
-        'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">ERROR</span>'
-        '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
-        "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
-        f"border: solid 1px {error_color}; padding: 2px 15px 2px 15px; "
-        'font-size: smaller; margin-right: 5px;">'
-        f"{error}"
-        "</span>"
-        f'<span style="background-color: {critical_color}; color: white; '
-        "padding: 0.5em 0.5em; position: inherit; text-transform: uppercase; "
-        f"margin: 5px 0px 5px 1px; border: solid 1px {critical_color}; "
-        'font-weight: bold; padding: 2px 15px 2px 15px; font-size: smaller;">CRITICAL</span>'
-        '<span style="background-color: none; color: #333333; padding: 0.5em 0.5em; '
-        "position: inherit; margin: 5px 0px 5px -4px; font-weight: bold; "
-        f"border: solid 1px {critical_color}; padding: 2px 15px 2px 15px; "
-        'font-size: smaller;">'
-        f"{critical}"
-        "</span>"
-        "</span>"
-    )
+    if passed:
+        return f"Schema validation passed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
+    else:
+        return f"Schema validation failed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."


 def _step_report_row_based(
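
A sketch of the plain-text schema note added above. Only the "passed", "expect_schema", and "target_schema" keys are read by this helper, so a minimal hypothetical dict suffices:

>>> from pointblank.validate import _create_col_schema_match_note_text
>>> info = {
...     "passed": True,
...     "expect_schema": [("a", "Int64"), ("b", "String")],
...     "target_schema": [("a", "Int64"), ("b", "String")],
... }
>>> _create_col_schema_match_note_text(info)
'Schema validation passed. Expected 2 column(s), found 2.'
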
@@ -18576,16 +20184,33 @@ def _step_report_schema_in_order(
     dtype_exp = []
     dtype_exp_correct = []

-    for i in range(len(
+    for i in range(len(expect_schema)):
         #
         # `col_name_exp` values
         #

-        #
-
-        col_name_exp.append(
+        # Get the column name from expect_schema (which can have duplicates)
+        column_name_exp_i = expect_schema[i][0]
+        col_name_exp.append(column_name_exp_i)
+
+        # Check if this column exists in exp_columns_dict (it might not if it's a duplicate)
+        # For duplicates, we need to handle them specially
+        if column_name_exp_i not in exp_columns_dict:
+            # This is a duplicate or invalid column, mark it as incorrect
+            col_exp_correct.append(CROSS_MARK_SPAN)
+
+            # For dtype, check if there's a dtype specified in the schema
+            if len(expect_schema[i]) > 1:
+                dtype_value = expect_schema[i][1]
+                if isinstance(dtype_value, list):
+                    dtype_exp.append(" | ".join(dtype_value))
+                else:
+                    dtype_exp.append(str(dtype_value))
+            else:
+                dtype_exp.append("—")

-
+            dtype_exp_correct.append("—")
+            continue

         #
         # `col_exp_correct` values
|