pointblank 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +207 -6
- pointblank/_constants_translations.py +1302 -0
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +216 -139
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +41 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2957 -50
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +2280 -410
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +15 -8
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/METADATA +7 -2
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/RECORD +30 -28
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/WHEEL +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.18.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -12,9 +12,10 @@ import tempfile
 import threading
 from dataclasses import dataclass
 from enum import Enum
+from functools import partial
 from importlib.metadata import version
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Literal
+from typing import TYPE_CHECKING, Any, Callable, Literal, NoReturn, ParamSpec, TypeVar
 from zipfile import ZipFile

 import commonmark
@@ -23,8 +24,8 @@ from great_tables import GT, from_column, google_font, html, loc, md, style, val
 from great_tables.gt import _get_column_of_values
 from great_tables.vals import fmt_integer, fmt_number
 from importlib_resources import files
-from narwhals.typing import FrameT

+from pointblank._agg import is_valid_agg, load_validation_method_grid, resolve_agg_registries
 from pointblank._constants import (
     ASSERTION_TYPE_METHOD_MAP,
     CHECK_MARK_SPAN,
@@ -54,6 +55,7 @@ from pointblank._interrogation import (
     SpeciallyValidation,
     col_count_match,
     col_exists,
+    col_pct_null,
     col_schema_match,
     col_vals_expr,
     conjointly_validation,
@@ -90,6 +92,8 @@ from pointblank._utils import (
     _is_lib_present,
     _is_narwhals_table,
     _is_value_a_df,
+    _PBUnresolvedColumn,
+    _resolve_columns,
     _select_df_lib,
 )
 from pointblank._utils_check_args import (
@@ -100,7 +104,14 @@ from pointblank._utils_check_args import (
     _check_thresholds,
 )
 from pointblank._utils_html import _create_table_dims_html, _create_table_type_html
-from pointblank.column import
+from pointblank.column import (
+    Column,
+    ColumnLiteral,
+    ColumnSelector,
+    ColumnSelectorNarwhals,
+    ReferenceColumn,
+    col,
+)
 from pointblank.schema import Schema, _get_schema_validation_info
 from pointblank.segments import Segment
 from pointblank.thresholds import (
@@ -111,10 +122,18 @@ from pointblank.thresholds import (
     _normalize_thresholds_creation,
 )

+P = ParamSpec("P")
+R = TypeVar("R")
+
 if TYPE_CHECKING:
     from collections.abc import Collection
+    from typing import Any
+
+    import polars as pl
+    from narwhals.typing import IntoDataFrame, IntoFrame
+
+    from pointblank._typing import AbsoluteBounds, Tolerance, _CompliantValue, _CompliantValues

-    from pointblank._typing import AbsoluteBounds, Tolerance

 __all__ = [
     "Validate",
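The new `P`/`R` pair is the standard recipe for typing decorators so that a wrapper preserves the wrapped function's exact signature. A minimal sketch of that pattern (the decorator itself is hypothetical, not taken from this diff):

```python
from functools import wraps
from typing import Callable, ParamSpec, TypeVar

P = ParamSpec("P")
R = TypeVar("R")

def passthrough(fn: Callable[P, R]) -> Callable[P, R]:
    # Hypothetical decorator: type checkers see the same parameters
    # and return type as the wrapped function.
    @wraps(fn)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        return fn(*args, **kwargs)
    return wrapper
```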
@@ -133,6 +152,7 @@ __all__ = [
     "get_validation_summary",
 ]

+
 # Create a thread-local storage for the metadata
 _action_context = threading.local()

@@ -363,12 +383,16 @@ class PointblankConfig:

     report_incl_header: bool = True
     report_incl_footer: bool = True
+    report_incl_footer_timings: bool = True
+    report_incl_footer_notes: bool = True
     preview_incl_header: bool = True

     def __repr__(self):
         return (
             f"PointblankConfig(report_incl_header={self.report_incl_header}, "
             f"report_incl_footer={self.report_incl_footer}, "
+            f"report_incl_footer_timings={self.report_incl_footer_timings}, "
+            f"report_incl_footer_notes={self.report_incl_footer_notes}, "
             f"preview_incl_header={self.preview_incl_header})"
         )

@@ -380,6 +404,8 @@ global_config = PointblankConfig()
 def config(
     report_incl_header: bool = True,
     report_incl_footer: bool = True,
+    report_incl_footer_timings: bool = True,
+    report_incl_footer_notes: bool = True,
     preview_incl_header: bool = True,
 ) -> PointblankConfig:
     """
@@ -393,7 +419,13 @@ def config(
         threshold levels (if set).
     report_incl_footer
         Should the footer of the validation table report be displayed? The footer contains the
-        starting and ending times of the interrogation.
+        starting and ending times of the interrogation and any notes added to validation steps.
+    report_incl_footer_timings
+        Controls whether the validation timing information (start time, duration, and end time)
+        should be displayed in the footer. Only applies when `report_incl_footer=True`.
+    report_incl_footer_notes
+        Controls whether the notes from validation steps should be displayed in the footer. Only
+        applies when `report_incl_footer=True`.
     preview_incl_header
         Whether the header should be present in any preview table (generated via the
         [`preview()`](`pointblank.preview`) function).
@@ -407,13 +439,16 @@
     global global_config
     global_config.report_incl_header = report_incl_header  # pragma: no cover
     global_config.report_incl_footer = report_incl_footer  # pragma: no cover
+    global_config.report_incl_footer_timings = report_incl_footer_timings  # pragma: no cover
+    global_config.report_incl_footer_notes = report_incl_footer_notes  # pragma: no cover
     global_config.preview_incl_header = preview_incl_header  # pragma: no cover
+    return global_config  # pragma: no cover


 def load_dataset(
     dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
     tbl_type: Literal["polars", "pandas", "duckdb"] = "polars",
-) ->
+) -> Any:
     """
     Load a dataset hosted in the library as specified table type.

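A quick sketch of the new footer toggles in use (values here are illustrative):

```python
import pointblank as pb

# Keep the report footer, but show only the notes (hide the timings)
pb.config(
    report_incl_footer=True,
    report_incl_footer_timings=False,
    report_incl_footer_notes=True,
)
```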
@@ -434,7 +469,7 @@ def load_dataset(

     Returns
     -------
-
+    Any
         The dataset for the `Validate` object. This could be a Polars DataFrame, a Pandas DataFrame,
         or a DuckDB table as an Ibis table.

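The loader now advertises an `Any` return because the concrete type depends on `tbl_type=`; for reference:

```python
import pointblank as pb

small = pb.load_dataset("small_table", tbl_type="polars")     # Polars DataFrame
revenue = pb.load_dataset("game_revenue", tbl_type="pandas")  # Pandas DataFrame
flights = pb.load_dataset("nycflights", tbl_type="duckdb")    # DuckDB table via Ibis
```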
@@ -1507,7 +1542,7 @@ def get_data_path(
     return tmp_file.name


-def _process_data(data: FrameT | Any) -> FrameT | Any:
+def _process_data(data: Any) -> Any:
    """
    Centralized data processing pipeline that handles all supported input types.

@@ -1524,7 +1559,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:

     Parameters
     ----------
-    data
+    data
         The input data which could be:
         - a DataFrame object (Polars, Pandas, Ibis, etc.)
         - a GitHub URL pointing to a CSV or Parquet file
@@ -1535,7 +1570,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:

     Returns
     -------
-
+    Any
         Processed data as a DataFrame if input was a supported data source type,
         otherwise the original data unchanged.
     """
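Given the helpers that follow in this diff, the centralized pipeline plausibly chains pass-through processors, each returning unsupported inputs unchanged; a sketch of that shape (not the verbatim body, and the ordering is an assumption):

```python
def _process_data(data):
    # Each stage recognizes its own input kind (GitHub URL, connection
    # string, CSV path, Parquet path) and passes everything else through.
    data = _process_github_url(data)
    data = _process_connection_string(data)
    data = _process_csv_input(data)
    data = _process_parquet_input(data)
    return data
```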
@@ -1554,7 +1589,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:
     return data


-def _process_github_url(data: FrameT | Any) -> FrameT | Any:
+def _process_github_url(data: Any) -> Any:
    """
    Process data parameter to handle GitHub URLs pointing to CSV or Parquet files.

@@ -1569,12 +1604,12 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:

     Parameters
     ----------
-    data
+    data
         The data parameter which may be a GitHub URL string or any other data type.

     Returns
     -------
-
+    Any
         If the input is a supported GitHub URL, returns a DataFrame loaded from the downloaded file.
         Otherwise, returns the original data unchanged.

@@ -1659,7 +1694,7 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:
     return data


-def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
+def _process_connection_string(data: Any) -> Any:
    """
    Process data parameter to handle database connection strings.

@@ -1686,7 +1721,7 @@ def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
     return connect_to_table(data)


-def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
+def _process_csv_input(data: Any) -> Any:
    """
    Process data parameter to handle CSV file inputs.

@@ -1744,7 +1779,7 @@ def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
     )


-def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
+def _process_parquet_input(data: Any) -> Any:
    """
    Process data parameter to handle Parquet file inputs.

@@ -1887,7 +1922,7 @@ def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:


 def preview(
-    data:
+    data: Any,
     columns_subset: str | list[str] | Column | None = None,
     n_head: int = 5,
     n_tail: int = 5,
@@ -1895,7 +1930,7 @@ def preview(
     show_row_numbers: bool = True,
     max_col_width: int = 250,
     min_tbl_width: int = 500,
-    incl_header: bool = None,
+    incl_header: bool | None = None,
 ) -> GT:
     """
     Display a table preview that shows some rows from the top, some from the bottom.
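A minimal `preview()` call against a bundled dataset, using the defaults shown in the signature above:

```python
import pointblank as pb

tbl = pb.load_dataset("small_table", tbl_type="polars")
pb.preview(tbl, n_head=3, n_tail=3, show_row_numbers=True)
```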
@@ -2153,7 +2188,7 @@ def preview(


 def _generate_display_table(
-    data:
+    data: Any,
     columns_subset: str | list[str] | Column | None = None,
     n_head: int = 5,
     n_tail: int = 5,
@@ -2161,7 +2196,7 @@ def _generate_display_table(
     show_row_numbers: bool = True,
     max_col_width: int = 250,
     min_tbl_width: int = 500,
-    incl_header: bool = None,
+    incl_header: bool | None = None,
     mark_missing_values: bool = True,
     row_number_list: list[int] | None = None,
 ) -> GT:
@@ -2258,7 +2293,8 @@ def _generate_display_table(
         tbl_schema = Schema(tbl=data)

         # Get the row count for the table
-        ibis_rows = data.count()
+        # Note: ibis tables have count(), to_polars(), to_pandas() methods
+        ibis_rows = data.count()  # type: ignore[union-attr]
         n_rows = ibis_rows.to_polars() if df_lib_name_gt == "polars" else int(ibis_rows.to_pandas())

         # If n_head + n_tail is greater than the row count, display the entire table
@@ -2267,11 +2303,11 @@ def _generate_display_table(
             data_subset = data

             if row_number_list is None:
-                row_number_list = range(1, n_rows + 1)
+                row_number_list = list(range(1, n_rows + 1))
         else:
             # Get the first n and last n rows of the table
-            data_head = data.head(n_head)
-            data_tail = data.filter(
+            data_head = data.head(n_head)  # type: ignore[union-attr]
+            data_tail = data.filter(  # type: ignore[union-attr]
                 [ibis.row_number() >= (n_rows - n_tail), ibis.row_number() <= n_rows]
             )
             data_subset = data_head.union(data_tail)
@@ -2283,9 +2319,9 @@ def _generate_display_table(

         # Convert either to Polars or Pandas depending on the available library
         if df_lib_name_gt == "polars":
-            data = data_subset.to_polars()
+            data = data_subset.to_polars()  # type: ignore[union-attr]
         else:
-            data = data_subset.to_pandas()
+            data = data_subset.to_pandas()  # type: ignore[union-attr]

         # From a DataFrame:
         # - get the row count
@@ -2296,17 +2332,18 @@ def _generate_display_table(
     tbl_schema = Schema(tbl=data)

     if tbl_type == "polars":
-        n_rows = int(data.height)
+        # Note: polars DataFrames have height, head(), tail() attributes
+        n_rows = int(data.height)  # type: ignore[union-attr]

         # If n_head + n_tail is greater than the row count, display the entire table
         if n_head + n_tail >= n_rows:
             full_dataset = True

             if row_number_list is None:
-                row_number_list = range(1, n_rows + 1)
+                row_number_list = list(range(1, n_rows + 1))

         else:
-            data = pl.concat([data.head(n=n_head), data.tail(n=n_tail)])
+            data = pl.concat([data.head(n=n_head), data.tail(n=n_tail)])  # type: ignore[union-attr]

             if row_number_list is None:
                 row_number_list = list(range(1, n_head + 1)) + list(
@@ -2314,40 +2351,42 @@ def _generate_display_table(
                 )

     if tbl_type == "pandas":
-        n_rows = data.shape[0]
+        # Note: pandas DataFrames have shape, head(), tail() attributes
+        n_rows = data.shape[0]  # type: ignore[union-attr]

         # If n_head + n_tail is greater than the row count, display the entire table
         if n_head + n_tail >= n_rows:
             full_dataset = True
             data_subset = data

-            row_number_list = range(1, n_rows + 1)
+            row_number_list = list(range(1, n_rows + 1))
         else:
-            data = pd.concat([data.head(n=n_head), data.tail(n=n_tail)])
+            data = pd.concat([data.head(n=n_head), data.tail(n=n_tail)])  # type: ignore[union-attr]

             row_number_list = list(range(1, n_head + 1)) + list(
                 range(n_rows - n_tail + 1, n_rows + 1)
             )

     if tbl_type == "pyspark":
-        n_rows = data.count()
+        # Note: pyspark DataFrames have count(), toPandas(), limit(), tail(), sparkSession
+        n_rows = data.count()  # type: ignore[union-attr]

         # If n_head + n_tail is greater than the row count, display the entire table
         if n_head + n_tail >= n_rows:
             full_dataset = True
             # Convert to pandas for Great Tables compatibility
-            data = data.toPandas()
+            data = data.toPandas()  # type: ignore[union-attr]

-            row_number_list = range(1, n_rows + 1)
+            row_number_list = list(range(1, n_rows + 1))
         else:
             # Get head and tail samples, then convert to pandas
-            head_data = data.limit(n_head).toPandas()
+            head_data = data.limit(n_head).toPandas()  # type: ignore[union-attr]

             # PySpark tail() returns a list of Row objects, need to convert to DataFrame
-            tail_rows = data.tail(n_tail)
+            tail_rows = data.tail(n_tail)  # type: ignore[union-attr]
             if tail_rows:
                 # Convert list of Row objects back to DataFrame, then to pandas
-                tail_df = data.sparkSession.createDataFrame(tail_rows, data.schema)
+                tail_df = data.sparkSession.createDataFrame(tail_rows, data.schema)  # type: ignore[union-attr]
                 tail_data = tail_df.toPandas()
             else:
                 # If no tail data, create empty DataFrame with same schema
@@ -2375,14 +2414,14 @@ def _generate_display_table(
     tbl_schema = Schema(tbl=data)

     # From the table schema, get a list of tuples containing column names and data types
-
+    col_dtype_list = tbl_schema.columns or []

     # Extract the column names from the list of tuples (first element of each tuple)
-    col_names = [col[0] for col in
+    col_names = [col[0] for col in col_dtype_list]

     # Iterate over the list of tuples and create a new dictionary with the
     # column names and data types
-    col_dtype_dict = {k: v for k, v in
+    col_dtype_dict = {k: v for k, v in col_dtype_list}

     # Create short versions of the data types by omitting any text in parentheses
     col_dtype_dict_short = {
@@ -2481,21 +2520,21 @@ def _generate_display_table(
     # Prepend a column that contains the row numbers if `show_row_numbers=True`
     if show_row_numbers or has_leading_row_num_col:
         if has_leading_row_num_col:
-            row_number_list = data["_row_num_"].to_list()
+            row_number_list = data["_row_num_"].to_list()  # type: ignore[union-attr]

         else:
             if df_lib_name_gt == "polars":
                 import polars as pl

                 row_number_series = pl.Series("_row_num_", row_number_list)
-                data = data.insert_column(0, row_number_series)
+                data = data.insert_column(0, row_number_series)  # type: ignore[union-attr]

             if df_lib_name_gt == "pandas":
-                data.insert(0, "_row_num_", row_number_list)
+                data.insert(0, "_row_num_", row_number_list)  # type: ignore[union-attr]

             if df_lib_name_gt == "pyspark":
                 # For PySpark converted to pandas, use pandas method
-                data.insert(0, "_row_num_", row_number_list)
+                data.insert(0, "_row_num_", row_number_list)  # type: ignore[union-attr]

     # Get the highest number in the `row_number_list` and calculate a width that will
     # safely fit a number of that magnitude
@@ -2604,7 +2643,7 @@ def _generate_display_table(
     return gt_tbl


-def missing_vals_tbl(data:
+def missing_vals_tbl(data: Any) -> GT:
    """
    Display a table that shows the missing values in the input table.

@@ -3205,7 +3244,7 @@ def _get_column_names_safe(data: Any) -> list[str]:
     return list(data.columns)  # pragma: no cover


-def _get_column_names(data:
+def _get_column_names(data: Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
    if ibis_tbl:
        return data.columns if df_lib_name_gt == "polars" else list(data.columns)

@@ -3229,12 +3268,10 @@ def _validate_columns_subset(
         )
         return columns_subset

-    return columns_subset.resolve(columns=col_names)
+    return columns_subset.resolve(columns=col_names)  # type: ignore[union-attr]


-def _select_columns(
-    data: FrameT | Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str
-) -> FrameT | Any:
+def _select_columns(data: Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str) -> Any:
     if ibis_tbl:
         return data[resolved_columns]
     if tbl_type == "polars":
@@ -3242,7 +3279,7 @@ def _select_columns(
     return data[resolved_columns]


-def get_column_count(data:
+def get_column_count(data: Any) -> int:
    """
    Get the number of columns in a table.

@@ -3454,7 +3491,7 @@ def _extract_enum_values(set_values: Any) -> list[Any]:
     return [set_values]


-def get_row_count(data:
+def get_row_count(data: Any) -> int:
    """
    Get the number of rows in a table.

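Both counters are public, table-agnostic helpers; usage is simply:

```python
import pointblank as pb

tbl = pb.load_dataset("small_table")
n_rows = pb.get_row_count(tbl)
n_cols = pb.get_column_count(tbl)
```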
@@ -3707,18 +3744,46 @@ class _ValidationInfo:
     insertion order, ensuring notes appear in a consistent sequence in reports and logs.
     """

+    @classmethod
+    def from_agg_validator(
+        cls,
+        assertion_type: str,
+        columns: _PBUnresolvedColumn,
+        value: float | Column | ReferenceColumn,
+        tol: Tolerance = 0,
+        thresholds: float | bool | tuple | dict | Thresholds | None = None,
+        brief: str | bool = False,
+        actions: Actions | None = None,
+        active: bool = True,
+    ) -> _ValidationInfo:
+        # This factory method creates a `_ValidationInfo` instance for aggregate
+        # methods. The reason this is created, is because all agg methods share the same
+        # signature so instead of instantiating the class directly each time, this method
+        # can be used to reduce redundancy, boilerplate and mistakes :)
+        _check_thresholds(thresholds=thresholds)
+
+        return cls(
+            assertion_type=assertion_type,
+            column=_resolve_columns(columns),
+            values={"value": value, "tol": tol},
+            thresholds=_normalize_thresholds_creation(thresholds),
+            brief=_transform_auto_brief(brief=brief),
+            actions=actions,
+            active=active,
+        )
+
     # Validation plan
     i: int | None = None
     i_o: int | None = None
     step_id: str | None = None
     sha1: str | None = None
     assertion_type: str | None = None
-    column:
-    values:
+    column: Any | None = None
+    values: Any | list[Any] | tuple | None = None
     inclusive: tuple[bool, bool] | None = None
     na_pass: bool | None = None
     pre: Callable | None = None
-    segments:
+    segments: Any | None = None
     thresholds: Thresholds | None = None
     actions: Actions | None = None
     label: str | None = None
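To see the factory in context, an aggregate method such as `col_sum_eq()` can delegate its shared plumbing to `_add_agg_validation()`, which in turn calls `_ValidationInfo.from_agg_validator()`. A sketch of that delegation (the actual method bodies are not shown in this diff):

```python
def col_sum_eq(self, columns, value=None, tol=0, thresholds=None,
               actions=None, brief=False, active=True):
    # Sketch: every aggregate validator shares this exact shape, so the
    # per-column step creation lives in _add_agg_validation().
    return self._add_agg_validation(
        assertion_type="col_sum_eq",
        columns=columns,
        value=value,
        tol=tol,
        thresholds=thresholds,
        brief=brief,
        actions=actions,
        active=active,
    )
```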
@@ -3737,14 +3802,14 @@ class _ValidationInfo:
     error: bool | None = None
     critical: bool | None = None
     failure_text: str | None = None
-    tbl_checked:
-    extract:
-    val_info: dict[str,
+    tbl_checked: Any = None
+    extract: Any = None
+    val_info: dict[str, Any] | None = None
     time_processed: str | None = None
     proc_duration_s: float | None = None
     notes: dict[str, dict[str, str]] | None = None

-    def get_val_info(self) -> dict[str,
+    def get_val_info(self) -> dict[str, Any] | None:
         return self.val_info

     def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
@@ -3920,7 +3985,7 @@ class _ValidationInfo:
         return self.notes is not None and len(self.notes) > 0


-def _handle_connection_errors(e: Exception, connection_string: str) ->
+def _handle_connection_errors(e: Exception, connection_string: str) -> NoReturn:
    """
    Shared error handling for database connection failures.

@@ -4761,7 +4826,8 @@ class Validate:
     when table specifications are missing or backend dependencies are not installed.
     """

-    data:
+    data: IntoDataFrame
+    reference: IntoFrame | None = None
     tbl_name: str | None = None
     label: str | None = None
     thresholds: int | float | bool | tuple | dict | Thresholds | None = None
@@ -4775,6 +4841,10 @@ class Validate:
         # Process data through the centralized data processing pipeline
         self.data = _process_data(self.data)

+        # Process reference data if provided
+        if self.reference is not None:
+            self.reference = _process_data(self.reference)
+
         # Check input of the `thresholds=` argument
         _check_thresholds(thresholds=self.thresholds)

@@ -4819,9 +4889,107 @@ class Validate:

         self.validation_info = []

+    def _add_agg_validation(
+        self,
+        *,
+        assertion_type: str,
+        columns: str | Collection[str],
+        value,
+        tol=0,
+        thresholds=None,
+        brief=False,
+        actions=None,
+        active=True,
+    ):
+        """
+        Add an aggregation-based validation step to the validation plan.
+
+        This internal method is used by all aggregation-based column validation methods
+        (e.g., `col_sum_eq`, `col_avg_gt`, `col_sd_le`) to create and register validation
+        steps. It relies heavily on the `_ValidationInfo.from_agg_validator()` class method.
+
+        Automatic Reference Inference
+        -----------------------------
+        When `value` is None and reference data has been set on the Validate object,
+        this method automatically creates a `ReferenceColumn` pointing to the same
+        column name in the reference data. This enables a convenient shorthand:
+
+        .. code-block:: python
+
+            # Instead of writing:
+            Validate(data=df, reference=ref_df).col_sum_eq("a", ref("a"))
+
+            # You can simply write:
+            Validate(data=df, reference=ref_df).col_sum_eq("a")
+
+        If `value` is None and no reference data is set, a `ValueError` is raised
+        immediately to provide clear feedback to the user.
+
+        Parameters
+        ----------
+        assertion_type
+            The type of assertion (e.g., "col_sum_eq", "col_avg_gt").
+        columns
+            Column name or collection of column names to validate.
+        value
+            The target value to compare against. Can be:
+            - A numeric literal (int or float)
+            - A `Column` object for cross-column comparison
+            - A `ReferenceColumn` object for reference data comparison
+            - None to automatically use `ref(column)` when reference data is set
+        tol
+            Tolerance for the comparison. Defaults to 0.
+        thresholds
+            Custom thresholds for the validation step.
+        brief
+            Brief description or auto-generate flag.
+        actions
+            Actions to take based on validation results.
+        active
+            Whether this validation step is active.
+
+        Returns
+        -------
+        Validate
+            The Validate instance for method chaining.
+
+        Raises
+        ------
+        ValueError
+            If `value` is None and no reference data is set on the Validate object.
+        """
+        if isinstance(columns, str):
+            columns = [columns]
+        for column in columns:
+            # If value is None, default to referencing the same column from reference data
+            resolved_value = value
+            if value is None:
+                if self.reference is None:
+                    raise ValueError(
+                        f"The 'value' parameter is required for {assertion_type}() "
+                        "when no reference data is set. Either provide a value, or "
+                        "set reference data on the Validate object using "
+                        "Validate(data=..., reference=...)."
+                    )
+                resolved_value = ReferenceColumn(column_name=column)
+
+            val_info = _ValidationInfo.from_agg_validator(
+                assertion_type=assertion_type,
+                columns=column,
+                value=resolved_value,
+                tol=tol,
+                thresholds=self.thresholds if thresholds is None else thresholds,
+                actions=self.actions if actions is None else actions,
+                brief=self.brief if brief is None else brief,
+                active=active,
+            )
+            self._add_validation(validation_info=val_info)
+
+        return self
+
     def set_tbl(
         self,
-        tbl:
+        tbl: Any,
         tbl_name: str | None = None,
         label: str | None = None,
     ) -> Validate:
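An end-to-end sketch of the reference shorthand documented in the docstring above (this assumes Polars is installed and that the aggregate methods are exposed in 0.18.0 as these hunks suggest):

```python
import polars as pl
import pointblank as pb

df = pl.DataFrame({"a": [1, 2, 3]})
ref_df = pl.DataFrame({"a": [3, 2, 1]})

# With reference data attached, `value=` can be omitted: the step then
# compares column "a" against the same column in `ref_df`.
validation = (
    pb.Validate(data=df, reference=ref_df)
    .col_sum_eq("a")
    .interrogate()
)
```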
@@ -4964,7 +5132,7 @@ class Validate:
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -5198,7 +5366,6 @@ class Validate:
         - Row 1: `c` is `1` and `b` is `2`.
         - Row 3: `c` is `2` and `b` is `2`.
         """
-
         assertion_type = _get_fn_name()

         _check_column(column=columns)
@@ -5218,14 +5385,7 @@ class Validate:
             self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
         )

-
-        # resolve the columns
-        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
-            columns = col(columns)
-
-        # If `columns` is Column value or a string, place it in a list for iteration
-        if isinstance(columns, (Column, str)):
-            columns = [columns]
+        columns = _resolve_columns(columns)

         # Determine brief to use (global or local) and transform any shorthands of `brief=`
         brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
@@ -5256,7 +5416,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -5547,7 +5707,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -5838,7 +5998,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -6127,7 +6287,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -6419,7 +6579,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -6713,7 +6873,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -7033,7 +7193,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -7350,7 +7510,7 @@
         set: Collection[Any],
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -7667,7 +7827,7 @@
         set: Collection[Any],
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -7958,7 +8118,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -8146,7 +8306,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -8331,7 +8491,7 @@
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -8574,7 +8734,7 @@
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -8820,7 +8980,7 @@
         inverse: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -9083,7 +9243,7 @@
         na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -9363,10 +9523,10 @@

     def col_vals_expr(
         self,
-        expr:
+        expr: Any,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -9584,7 +9744,7 @@
     def col_exists(
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
@@ -9755,40 +9915,41 @@ class Validate:

         return self

-    def
+    def col_pct_null(
         self,
-
-
-
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        p: float,
+        tol: Tolerance = 0,
+        thresholds: int | float | None | bool | tuple | dict | Thresholds = None,
         actions: Actions | None = None,
         brief: str | bool | None = None,
         active: bool = True,
     ) -> Validate:
         """
-        Validate whether
+        Validate whether a column has a specific percentage of Null values.

-        The `
-
-
+        The `col_pct_null()` validation method checks whether the percentage of Null values in a
+        column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
+        validation operates at the column level, generating a single validation step per column that
+        passes or fails based on whether the actual percentage of Null values falls within the
+        acceptable range defined by `p ± tol`.

         Parameters
         ----------
-
-            A single column or a list of columns to
-
-            columns are supplied
-
-
-
-
-
-
-
-
-            (provided as a list). Read the *Segmentation* section for usage information.
+        columns
+            A single column or a list of columns to validate. Can also use
+            [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+            multiple columns are supplied or resolved, there will be a separate validation step
+            generated for each column.
+        p
+            The expected percentage of Null values in the column, expressed as a decimal between
+            `0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
+        tol
+            The tolerance allowed when comparing the actual percentage of Null values to the
+            expected percentage `p=`. The validation passes if the actual percentage falls within
+            the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
+            the *Tolerance* section for details on all supported formats (absolute, relative,
+            symmetric, and asymmetric bounds).
         thresholds
             Set threshold failure levels for reporting and reacting to exceedences of the levels.
             The thresholds are set at the step level and will override any global thresholds set in
@@ -9796,7 +9957,7 @@ class Validate:
             be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
             section for information on how to set threshold levels.
         actions
-            Optional actions to take when the validation step meets or exceeds any set threshold
+            Optional actions to take when the validation step(s) meets or exceeds any set threshold
             levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
             define the actions.
         brief
@@ -9815,60 +9976,30 @@ class Validate:
         Validate
             The `Validate` object with the added validation step.

-
-
-        The `
-
-        table. This is useful for performing any necessary transformations or filtering on the data
-        before the validation step is applied.
-
-        The preprocessing function can be any callable that takes a table as input and returns a
-        modified table. For example, you could use a lambda function to filter the table based on
-        certain criteria or to apply a transformation to the data. Note that you can refer to
-        columns via `columns_subset=` that are expected to be present in the transformed table, but
-        may not exist in the table before preprocessing. Regarding the lifetime of the transformed
-        table, it only exists during the validation step and is not stored in the `Validate` object
-        or used in subsequent validation steps.
-
-        Segmentation
-        ------------
-        The `segments=` argument allows for the segmentation of a validation step into multiple
-        segments. This is useful for applying the same validation step to different subsets of the
-        data. The segmentation can be done based on a single column or specific fields within a
-        column.
-
-        Providing a single column name will result in a separate validation step for each unique
-        value in that column. For example, if you have a column called `"region"` with values
-        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
-        region.
-
-        Alternatively, you can provide a tuple that specifies a column name and its corresponding
-        values to segment on. For example, if you have a column called `"date"` and you want to
-        segment on only specific dates, you can provide a tuple like
-        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
-        (i.e., no validation steps will be created for them).
+        Tolerance
+        ---------
+        The `tol=` parameter accepts several different formats to specify the acceptable deviation
+        from the expected percentage `p=`. The tolerance can be expressed as:

-
-
+        1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
+        For example, `tol=2` means the actual count can differ from the expected count by up to 2
+        units in either direction.

-
-
-
-        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+        2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
+        count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
+        45 to 55 (50 ± 10% of 50 = 50 ± 5).

-
-
+        3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
+        bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
+        1 unit below or 3 units above the expected count.

-
-
-
-        identify issues within specific segments.
+        4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
+        and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
+        lower bound is 5% below and the upper bound is 15% above the expected count.

-
-
-
-        `"segment"` through use of `pre=` and then use that column for segmentation.
+        When using a single value (integer or float), the tolerance is applied symmetrically in both
+        directions. When using a tuple, you can specify asymmetric tolerances where the lower and
+        upper bounds differ.

         Thresholds
         ----------
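All four `tol=` formats collapse into one pair of absolute bounds around the expected count; the diff later binds this via `partial(_derive_bounds, tol=tol)`. A hypothetical re-derivation of the documented semantics (not the library's actual implementation):

```python
def derive_bounds(expected: int, tol) -> tuple[int, int]:
    # Normalize: a single value means symmetric bounds, a tuple means
    # explicit (lower, upper) deviations.
    lo, hi = tol if isinstance(tol, tuple) else (tol, tol)

    def to_abs(t):
        # Floats between 0 and 1 scale relative to the expected count
        return t * expected if isinstance(t, float) else t

    # Bounds round down, matching the examples in the docstring
    return int(expected - to_abs(lo)), int(expected + to_abs(hi))

derive_bounds(50, 0.1)    # (45, 55): 50 ± 10% of 50
derive_bounds(3, 1)       # (2, 4): 3 ± 1
derive_bounds(2, (0, 2))  # (2, 4): no slack below, 2 rows above
```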
@@ -9906,8 +10037,8 @@ class Validate:
         import pointblank as pb
         pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
         ```
-        For the examples here, we'll use a simple Polars DataFrame with three
-
+        For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
+        and `c`) that have different percentages of Null values. The table is shown below:

         ```{python}
         import pointblank as pb
@@ -9915,56 +10046,133 @@ class Validate:

         tbl = pl.DataFrame(
             {
-                "
-                "
-                "
+                "a": [1, 2, 3, 4, 5, 6, 7, 8],
+                "b": [1, None, 3, None, 5, None, 7, None],
+                "c": [None, None, None, None, None, None, 1, 2],
             }
         )

         pb.preview(tbl)
         ```

-        Let's validate that
-        determine if this validation had any failing test units (there are four test units, one for
-        each row). A failing test units means that a given row is not distinct from every other row.
+        Let's validate that column `a` has 0% Null values (i.e., no Null values at all).

         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_pct_null(columns="a", p=0.0)
             .interrogate()
         )

         validation
         ```

-
-        table
+        Printing the `validation` object shows the validation table in an HTML viewing environment.
+        The validation table shows the single entry that corresponds to the validation step created
+        by using `col_pct_null()`. The validation passed since column `a` has no Null values.

-
-        using columns `col_2` and `col_3` for the next validation.
+        Now, let's check that column `b` has exactly 50% Null values.

         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_pct_null(columns="b", p=0.5)
             .interrogate()
         )

         validation
         ```

-
-
-
-
-
+        This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
+
+        Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
+        we'll check if it's approximately 70% Null with a tolerance of 10%.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="c", p=0.70, tol=0.10)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This validation passes because the actual percentage (75%) falls within the acceptable
+        range of 60% to 80% (70% ± 10%).
+
+        The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
+        different ways to specify tolerance using column `b`, which has exactly 50% Null values
+        (4 out of 8 values).
+
+        *Using an absolute tolerance (integer)*: Specify the exact number of rows that can
+        deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=1)  # Expect 3 nulls, allow ±1 (range: 2-4)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because column `b` has 4 Null values, which falls within the acceptable range
+        of 2 to 4 (3 ± 1).
+
+        *Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
+        expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=0.25)  # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because 4 Null values falls within the acceptable range (3 ± 0.75 calculates
+        to 2.25 to 3.75, which rounds down to 2 to 3 rows).
+
+        *Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
+        upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
+        to 2 rows above the expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.25, tol=(0, 2))  # Expect 2 Nulls, allow +0/-2 (range: 2-4)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because 4 Null values falls within the acceptable range of 2 to 4.
+
+        *Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
+        bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
+        expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3))  # Expect 3 Nulls, allow -10%/+30%
+            .interrogate()
+        )
+
+        validation
+        ```

+        This passes because 4 Null values falls within the acceptable range (3 - 0.3 to 3 + 0.9
+        calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
+        """
         assertion_type = _get_fn_name()

-
-        # TODO: add check for segments
-        # _check_segments(segments=segments)
+        _check_column(column=columns)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")

@@ -9973,26 +10181,274 @@ class Validate:
|
|
|
9973
10181
|
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
9974
10182
|
)
|
|
9975
10183
|
|
|
9976
|
-
|
|
9977
|
-
|
|
10184
|
+
# If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
|
|
10185
|
+
# resolve the columns
|
|
10186
|
+
if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
|
|
10187
|
+
columns = col(columns)
|
|
9978
10188
|
|
|
9979
|
-
#
|
|
10189
|
+
# If `columns` is Column value or a string, place it in a list for iteration
|
|
10190
|
+
if isinstance(columns, (Column, str)):
|
|
10191
|
+
columns = [columns]
|
|
9980
10192
|
|
|
9981
10193
|
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
9982
10194
|
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
9983
10195
|
|
|
9984
|
-
|
|
9985
|
-
|
|
9986
|
-
|
|
9987
|
-
|
|
9988
|
-
|
|
9989
|
-
|
|
9990
|
-
|
|
9991
|
-
|
|
9992
|
-
|
|
9993
|
-
|
|
9994
|
-
|
|
9995
|
-
|
|
10196
|
+
bound_finder: Callable[[int], AbsoluteBounds] = partial(_derive_bounds, tol=tol)
|
|
10197
|
+
|
|
10198
|
+
# Iterate over the columns and create a validation step for each
|
|
10199
|
+
for column in columns:
|
|
10200
|
+
val_info = _ValidationInfo(
|
|
10201
|
+
assertion_type=assertion_type,
|
|
10202
|
+
column=column,
|
|
10203
|
+
values={"p": p, "bound_finder": bound_finder},
|
|
10204
|
+
thresholds=thresholds,
|
|
10205
|
+
actions=actions,
|
|
10206
|
+
brief=brief,
|
|
10207
|
+
active=active,
|
|
10208
|
+
)
|
|
10209
|
+
|
|
10210
|
+
self._add_validation(validation_info=val_info)
|
|
10211
|
+
|
|
10212
|
+
return self
|
|
10213
|
+
|
|
10214
|
+
def rows_distinct(
|
|
10215
|
+
self,
|
|
10216
|
+
columns_subset: str | list[str] | None = None,
|
|
10217
|
+
pre: Callable | None = None,
|
|
10218
|
+
segments: SegmentSpec | None = None,
|
|
10219
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
10220
|
+
actions: Actions | None = None,
|
|
10221
|
+
brief: str | bool | None = None,
|
|
10222
|
+
active: bool = True,
|
|
10223
|
+
) -> Validate:
|
|
10224
|
+
"""
|
|
10225
|
+
Validate whether rows in the table are distinct.
|
|
10226
|
+
|
|
10227
|
+
The `rows_distinct()` method checks whether rows in the table are distinct. This validation
|
|
10228
|
+
will operate over the number of test units that is equal to the number of rows in the table
|
|
10229
|
+
(determined after any `pre=` mutation has been applied).
|
|
10230
|
+
|
|
10231
|
+
Parameters
|
|
10232
|
+
----------
|
|
10233
|
+
columns_subset
|
|
10234
|
+
A single column or a list of columns to use as a subset for the distinct comparison.
|
|
10235
|
+
If `None`, then all columns in the table will be used for the comparison. If multiple
|
|
10236
|
+
columns are supplied, the distinct comparison will be made over the combination of
|
|
10237
|
+
values in those columns.
|
|
10238
|
+
pre
|
|
10239
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
10240
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
10241
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
10242
|
+
argument.
|
|
10243
|
+
segments
|
|
10244
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
10245
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
10246
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
10247
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
10248
|
+
thresholds
|
|
10249
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
10250
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
10251
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
10252
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
10253
|
+
section for information on how to set threshold levels.
|
|
10254
|
+
actions
|
|
10255
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
10256
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
10257
|
+
define the actions.
|
|
10258
|
+
brief
|
|
10259
|
+
An optional brief description of the validation step that will be displayed in the
|
|
10260
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
10261
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
10262
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
10263
|
+
won't be a brief.
|
|
10264
|
+
active
|
|
10265
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
10266
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
10267
|
+
for the steps unchanged).
|
|
10268
|
+
|
|
10269
|
+
Returns
|
|
10270
|
+
-------
|
|
10271
|
+
Validate
|
|
10272
|
+
The `Validate` object with the added validation step.
|
|
10273
|
+
|
|
10274
|
+
Preprocessing
|
|
10275
|
+
-------------
|
|
10276
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
10277
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
10278
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
10279
|
+
before the validation step is applied.
|
|
10280
|
+
|
|
10281
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
10282
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
10283
|
+
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
10284
|
+
columns via `columns_subset=` that are expected to be present in the transformed table, but
|
|
10285
|
+
may not exist in the table before preprocessing. Regarding the lifetime of the transformed
|
|
10286
|
+
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
10287
|
+
or used in subsequent validation steps.
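As a concrete sketch of that point about `columns_subset=` and preprocessing, the subset column need only exist in the transformed table (the frame and the derived `col_pair` column here are invented for illustration):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"col_2": ["a", "a", "c"], "col_3": ["a", "a", "d"]})

validation = (
    pb.Validate(data=tbl)
    .rows_distinct(
        # `col_pair` is not in `tbl`; it is created by `pre=` during interrogation
        columns_subset="col_pair",
        pre=lambda df: df.with_columns(col_pair=pl.col("col_2") + pl.col("col_3")),
    )
    .interrogate()
)
```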
|
|
10288
|
+
|
|
10289
|
+
Segmentation
|
|
10290
|
+
------------
|
|
10291
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
10292
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
10293
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
10294
|
+
column.
|
|
10295
|
+
|
|
10296
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
10297
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
10298
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
10299
|
+
region.
|
|
10300
|
+
|
|
10301
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
10302
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
10303
|
+
segment on only specific dates, you can provide a tuple like
|
|
10304
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
10305
|
+
(i.e., no validation steps will be created for them).
|
|
10306
|
+
|
|
10307
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
10308
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
10309
|
+
|
|
10310
|
+
```
|
|
10311
|
+
# Segments from all unique values in the `region` column
|
|
10312
|
+
# and specific dates in the `date` column
|
|
10313
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
10314
|
+
|
|
10315
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
10316
|
+
segments=["region", "date"]
|
|
10317
|
+
```
|
|
10318
|
+
|
|
10319
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
10320
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
10321
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
10322
|
+
identify issues within specific segments.
|
|
10323
|
+
|
|
10324
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
10325
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
10326
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
10327
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
10328
|
+
|
|
10329
|
+
Thresholds
|
|
10330
|
+
----------
|
|
10331
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
10332
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
10333
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
10334
|
+
|
|
10335
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
10336
|
+
can either be set as a proportion of failing test units (a value between `0` and `1`)
|
|
10337
|
+
or as the absolute number of failing test units (an integer that's `1` or greater).
|
|
10338
|
+
|
|
10339
|
+
Thresholds can be defined using one of these input schemes:
|
|
10340
|
+
|
|
10341
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
10342
|
+
thresholds)
|
|
10343
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
10344
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
10345
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
10346
|
+
'critical'
|
|
10347
|
+
4. a single integer/float value denoting the absolute number or fraction of failing test units
|
|
10348
|
+
for the 'warning' level only
|
|
10349
|
+
|
|
10350
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
10351
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
10352
|
+
set; you're free to set any combination of them.
|
|
10353
|
+
|
|
10354
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
10355
|
+
take for each level of failure (using the `actions=` parameter).
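For reference, the four input schemes above might be written like this (a sketch; each expression is meant as a valid value for `thresholds=`):

```python
import pointblank as pb

t1 = pb.Thresholds(warning=0.1, error=0.25, critical=0.35)  # scheme 1: Thresholds class
t2 = (0.1, 0.25, 0.35)                   # scheme 2: tuple of warning/error/critical
t3 = {"warning": 0.1, "critical": 0.35}  # scheme 3: dict with any subset of levels
t4 = 0.1                                 # scheme 4: 'warning' level only
```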
|
|
10356
|
+
|
|
10357
|
+
Examples
|
|
10358
|
+
--------
|
|
10359
|
+
```{python}
|
|
10360
|
+
#| echo: false
|
|
10361
|
+
#| output: false
|
|
10362
|
+
import pointblank as pb
|
|
10363
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
10364
|
+
```
|
|
10365
|
+
For the examples here, we'll use a simple Polars DataFrame with three string columns
|
|
10366
|
+
(`col_1`, `col_2`, and `col_3`). The table is shown below:
|
|
10367
|
+
|
|
10368
|
+
```{python}
|
|
10369
|
+
import pointblank as pb
|
|
10370
|
+
import polars as pl
|
|
10371
|
+
|
|
10372
|
+
tbl = pl.DataFrame(
|
|
10373
|
+
{
|
|
10374
|
+
"col_1": ["a", "b", "c", "d"],
|
|
10375
|
+
"col_2": ["a", "a", "c", "d"],
|
|
10376
|
+
"col_3": ["a", "a", "d", "e"],
|
|
10377
|
+
}
|
|
10378
|
+
)
|
|
10379
|
+
|
|
10380
|
+
pb.preview(tbl)
|
|
10381
|
+
```
|
|
10382
|
+
|
|
10383
|
+
Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll
|
|
10384
|
+
determine if this validation had any failing test units (there are four test units, one for
|
|
10385
|
+
each row). A failing test unit means that a given row is not distinct from every other row.
|
|
10386
|
+
|
|
10387
|
+
```{python}
|
|
10388
|
+
validation = (
|
|
10389
|
+
pb.Validate(data=tbl)
|
|
10390
|
+
.rows_distinct()
|
|
10391
|
+
.interrogate()
|
|
10392
|
+
)
|
|
10393
|
+
|
|
10394
|
+
validation
|
|
10395
|
+
```
|
|
10396
|
+
|
|
10397
|
+
From this validation table we see that there are no failing test units. All rows in the
|
|
10398
|
+
table are distinct from one another.
|
|
10399
|
+
|
|
10400
|
+
We can also use a subset of columns to determine distinctness. Let's specify the subset
|
|
10401
|
+
using columns `col_2` and `col_3` for the next validation.
|
|
10402
|
+
|
|
10403
|
+
```{python}
|
|
10404
|
+
validation = (
|
|
10405
|
+
pb.Validate(data=tbl)
|
|
10406
|
+
.rows_distinct(columns_subset=["col_2", "col_3"])
|
|
10407
|
+
.interrogate()
|
|
10408
|
+
)
|
|
10409
|
+
|
|
10410
|
+
validation
|
|
10411
|
+
```
|
|
10412
|
+
|
|
10413
|
+
The validation table reports two failing test units. The first and second rows are
|
|
10414
|
+
duplicated when considering only the values in columns `col_2` and `col_3`. There's only
|
|
10415
|
+
one set of duplicates but there are two failing test units since each row is compared to all
|
|
10416
|
+
others.
|
|
10417
|
+
"""
|
|
10418
|
+
|
|
10419
|
+
assertion_type = _get_fn_name()
|
|
10420
|
+
|
|
10421
|
+
_check_pre(pre=pre)
|
|
10422
|
+
# TODO: add check for segments
|
|
10423
|
+
# _check_segments(segments=segments)
|
|
10424
|
+
_check_thresholds(thresholds=thresholds)
|
|
10425
|
+
_check_boolean_input(param=active, param_name="active")
|
|
10426
|
+
|
|
10427
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
10428
|
+
thresholds = (
|
|
10429
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
10430
|
+
)
|
|
10431
|
+
|
|
10432
|
+
if columns_subset is not None and isinstance(columns_subset, str):
|
|
10433
|
+
columns_subset = [columns_subset]
|
|
10434
|
+
|
|
10435
|
+
# TODO: incorporate Column object
|
|
10436
|
+
|
|
10437
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
10438
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
10439
|
+
|
|
10440
|
+
val_info = _ValidationInfo(
|
|
10441
|
+
assertion_type=assertion_type,
|
|
10442
|
+
column=columns_subset,
|
|
10443
|
+
pre=pre,
|
|
10444
|
+
segments=segments,
|
|
10445
|
+
thresholds=thresholds,
|
|
10446
|
+
actions=actions,
|
|
10447
|
+
brief=brief,
|
|
10448
|
+
active=active,
|
|
10449
|
+
)
|
|
10450
|
+
|
|
10451
|
+
self._add_validation(validation_info=val_info)
|
|
9996
10452
|
|
|
9997
10453
|
return self
|
|
9998
10454
|
|
|
@@ -10001,7 +10457,7 @@ class Validate:
|
|
|
10001
10457
|
columns_subset: str | list[str] | None = None,
|
|
10002
10458
|
pre: Callable | None = None,
|
|
10003
10459
|
segments: SegmentSpec | None = None,
|
|
10004
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
10460
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
10005
10461
|
actions: Actions | None = None,
|
|
10006
10462
|
brief: str | bool | None = None,
|
|
10007
10463
|
active: bool = True,
|
|
@@ -10246,7 +10702,7 @@ class Validate:
|
|
|
10246
10702
|
max_concurrent: int = 3,
|
|
10247
10703
|
pre: Callable | None = None,
|
|
10248
10704
|
segments: SegmentSpec | None = None,
|
|
10249
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
10705
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
10250
10706
|
actions: Actions | None = None,
|
|
10251
10707
|
brief: str | bool | None = None,
|
|
10252
10708
|
active: bool = True,
|
|
@@ -10641,7 +11097,7 @@ class Validate:
|
|
|
10641
11097
|
case_sensitive_dtypes: bool = True,
|
|
10642
11098
|
full_match_dtypes: bool = True,
|
|
10643
11099
|
pre: Callable | None = None,
|
|
10644
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
11100
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
10645
11101
|
actions: Actions | None = None,
|
|
10646
11102
|
brief: str | bool | None = None,
|
|
10647
11103
|
active: bool = True,
|
|
@@ -10857,11 +11313,11 @@ class Validate:
|
|
|
10857
11313
|
|
|
10858
11314
|
def row_count_match(
|
|
10859
11315
|
self,
|
|
10860
|
-
count: int |
|
|
11316
|
+
count: int | Any,
|
|
10861
11317
|
tol: Tolerance = 0,
|
|
10862
11318
|
inverse: bool = False,
|
|
10863
11319
|
pre: Callable | None = None,
|
|
10864
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
11320
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
10865
11321
|
actions: Actions | None = None,
|
|
10866
11322
|
brief: str | bool | None = None,
|
|
10867
11323
|
active: bool = True,
|
|
@@ -11076,10 +11532,10 @@ class Validate:
|
|
|
11076
11532
|
|
|
11077
11533
|
def col_count_match(
|
|
11078
11534
|
self,
|
|
11079
|
-
count: int |
|
|
11535
|
+
count: int | Any,
|
|
11080
11536
|
inverse: bool = False,
|
|
11081
11537
|
pre: Callable | None = None,
|
|
11082
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
11538
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11083
11539
|
actions: Actions | None = None,
|
|
11084
11540
|
brief: str | bool | None = None,
|
|
11085
11541
|
active: bool = True,
|
|
@@ -11252,9 +11708,9 @@ class Validate:
|
|
|
11252
11708
|
|
|
11253
11709
|
def tbl_match(
|
|
11254
11710
|
self,
|
|
11255
|
-
tbl_compare:
|
|
11711
|
+
tbl_compare: Any,
|
|
11256
11712
|
pre: Callable | None = None,
|
|
11257
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
11713
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11258
11714
|
actions: Actions | None = None,
|
|
11259
11715
|
brief: str | bool | None = None,
|
|
11260
11716
|
active: bool = True,
|
|
@@ -11523,7 +11979,7 @@ class Validate:
|
|
|
11523
11979
|
self,
|
|
11524
11980
|
*exprs: Callable,
|
|
11525
11981
|
pre: Callable | None = None,
|
|
11526
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
11982
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11527
11983
|
actions: Actions | None = None,
|
|
11528
11984
|
brief: str | bool | None = None,
|
|
11529
11985
|
active: bool = True,
|
|
@@ -11771,7 +12227,7 @@ class Validate:
|
|
|
11771
12227
|
self,
|
|
11772
12228
|
expr: Callable,
|
|
11773
12229
|
pre: Callable | None = None,
|
|
11774
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
12230
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11775
12231
|
actions: Actions | None = None,
|
|
11776
12232
|
brief: str | bool | None = None,
|
|
11777
12233
|
active: bool = True,
|
|
@@ -12265,7 +12721,7 @@ class Validate:
|
|
|
12265
12721
|
segment = validation.segments
|
|
12266
12722
|
|
|
12267
12723
|
# Get compatible data types for this assertion type
|
|
12268
|
-
assertion_method = ASSERTION_TYPE_METHOD_MAP
|
|
12724
|
+
assertion_method = ASSERTION_TYPE_METHOD_MAP.get(assertion_type, assertion_type)
|
|
12269
12725
|
compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
|
|
12270
12726
|
|
|
12271
12727
|
# Process the `brief` text for the validation step by including template variables to
|
|
@@ -12282,12 +12738,19 @@ class Validate:
|
|
|
12282
12738
|
# Generate the autobrief description for the validation step; it's important to perform
|
|
12283
12739
|
# that here since text components like the column and the value(s) have been resolved
|
|
12284
12740
|
# at this point
|
|
12741
|
+
# Get row count for col_pct_null to properly calculate absolute tolerance percentages
|
|
12742
|
+
n_rows = None
|
|
12743
|
+
if assertion_type == "col_pct_null":
|
|
12744
|
+
n_rows = get_row_count(data_tbl)
|
|
12745
|
+
|
|
12285
12746
|
autobrief = _create_autobrief_or_failure_text(
|
|
12286
12747
|
assertion_type=assertion_type,
|
|
12287
12748
|
lang=self.lang,
|
|
12288
12749
|
column=column,
|
|
12289
12750
|
values=value,
|
|
12290
12751
|
for_failure=False,
|
|
12752
|
+
locale=self.locale,
|
|
12753
|
+
n_rows=n_rows,
|
|
12291
12754
|
)
|
|
12292
12755
|
|
|
12293
12756
|
validation.autobrief = autobrief
|
|
@@ -12313,7 +12776,17 @@ class Validate:
|
|
|
12313
12776
|
|
|
12314
12777
|
# Make a deep copy of the table for this step to ensure proper isolation
|
|
12315
12778
|
# This prevents modifications from one validation step affecting others
|
|
12316
|
-
|
|
12779
|
+
try:
|
|
12780
|
+
# TODO: This copying should be scrutinized further
|
|
12781
|
+
data_tbl_step: IntoDataFrame = _copy_dataframe(data_tbl)
|
|
12782
|
+
except Exception: # pragma: no cover
|
|
12783
|
+
data_tbl_step: IntoDataFrame = data_tbl # pragma: no cover
|
|
12784
|
+
|
|
12785
|
+
# Capture original table dimensions and columns before preprocessing
|
|
12786
|
+
# (only if preprocessing is present - we'll set these inside the preprocessing block)
|
|
12787
|
+
original_rows = None
|
|
12788
|
+
original_cols = None
|
|
12789
|
+
original_column_names = None
|
|
12317
12790
|
|
|
12318
12791
|
# ------------------------------------------------
|
|
12319
12792
|
# Preprocessing stage
|
|
@@ -12322,6 +12795,16 @@ class Validate:
|
|
|
12322
12795
|
# Determine whether any preprocessing functions are to be applied to the table
|
|
12323
12796
|
if validation.pre is not None:
|
|
12324
12797
|
try:
|
|
12798
|
+
# Capture original table dimensions before preprocessing
|
|
12799
|
+
# Use get_row_count() instead of len() for compatibility with PySpark, etc.
|
|
12800
|
+
original_rows = get_row_count(data_tbl_step)
|
|
12801
|
+
original_cols = get_column_count(data_tbl_step)
|
|
12802
|
+
original_column_names = set(
|
|
12803
|
+
data_tbl_step.columns
|
|
12804
|
+
if hasattr(data_tbl_step, "columns")
|
|
12805
|
+
else list(data_tbl_step.columns)
|
|
12806
|
+
)
|
|
12807
|
+
|
|
12325
12808
|
# Read the text of the preprocessing function
|
|
12326
12809
|
pre_text = _pre_processing_funcs_to_str(validation.pre)
|
|
12327
12810
|
|
|
@@ -12354,6 +12837,62 @@ class Validate:
|
|
|
12354
12837
|
elif isinstance(validation.pre, Callable):
|
|
12355
12838
|
data_tbl_step = validation.pre(data_tbl_step)
|
|
12356
12839
|
|
|
12840
|
+
# After successful preprocessing, check dimensions and create notes
|
|
12841
|
+
# Use get_row_count() and get_column_count() for compatibility
|
|
12842
|
+
processed_rows = get_row_count(data_tbl_step)
|
|
12843
|
+
processed_cols = get_column_count(data_tbl_step)
|
|
12844
|
+
|
|
12845
|
+
# Always add a note when preprocessing is applied
|
|
12846
|
+
if original_rows != processed_rows or original_cols != processed_cols:
|
|
12847
|
+
# Dimensions changed - show the change
|
|
12848
|
+
note_html = _create_preprocessing_note_html(
|
|
12849
|
+
original_rows=original_rows,
|
|
12850
|
+
original_cols=original_cols,
|
|
12851
|
+
processed_rows=processed_rows,
|
|
12852
|
+
processed_cols=processed_cols,
|
|
12853
|
+
locale=self.locale,
|
|
12854
|
+
)
|
|
12855
|
+
note_text = _create_preprocessing_note_text(
|
|
12856
|
+
original_rows=original_rows,
|
|
12857
|
+
original_cols=original_cols,
|
|
12858
|
+
processed_rows=processed_rows,
|
|
12859
|
+
processed_cols=processed_cols,
|
|
12860
|
+
)
|
|
12861
|
+
else:
|
|
12862
|
+
# No dimension change - just indicate preprocessing was applied
|
|
12863
|
+
note_html = _create_preprocessing_no_change_note_html(locale=self.locale)
|
|
12864
|
+
note_text = _create_preprocessing_no_change_note_text()
|
|
12865
|
+
|
|
12866
|
+
validation._add_note(
|
|
12867
|
+
key="pre_applied",
|
|
12868
|
+
markdown=note_html,
|
|
12869
|
+
text=note_text,
|
|
12870
|
+
)
|
|
12871
|
+
|
|
12872
|
+
# Check if target column is synthetic (exists in processed but not original)
|
|
12873
|
+
# Only check for single column names (not lists used in rows_distinct, etc.)
|
|
12874
|
+
if column is not None and isinstance(column, str):
|
|
12875
|
+
processed_column_names = set(
|
|
12876
|
+
data_tbl_step.columns
|
|
12877
|
+
if hasattr(data_tbl_step, "columns")
|
|
12878
|
+
else list(data_tbl_step.columns)
|
|
12879
|
+
)
|
|
12880
|
+
|
|
12881
|
+
# Check if the target column is in the processed table but not in original
|
|
12882
|
+
if column in processed_column_names and column not in original_column_names:
|
|
12883
|
+
note_html = _create_synthetic_target_column_note_html(
|
|
12884
|
+
column_name=column,
|
|
12885
|
+
locale=self.locale,
|
|
12886
|
+
)
|
|
12887
|
+
note_text = _create_synthetic_target_column_note_text(
|
|
12888
|
+
column_name=column,
|
|
12889
|
+
)
|
|
12890
|
+
validation._add_note(
|
|
12891
|
+
key="syn_target_col",
|
|
12892
|
+
markdown=note_html,
|
|
12893
|
+
text=note_text,
|
|
12894
|
+
)
|
|
12895
|
+
|
|
12357
12896
|
except Exception:
|
|
12358
12897
|
# If preprocessing fails, mark the validation as having an eval_error
|
|
12359
12898
|
validation.eval_error = True
|
|
@@ -12543,6 +13082,21 @@ class Validate:
|
|
|
12543
13082
|
tbl=tbl, column=column, values=value, na_pass=na_pass
|
|
12544
13083
|
)
|
|
12545
13084
|
|
|
13085
|
+
elif assertion_type == "col_pct_null":
|
|
13086
|
+
result_bool = col_pct_null(
|
|
13087
|
+
data_tbl=data_tbl_step,
|
|
13088
|
+
column=column,
|
|
13089
|
+
p=value["p"],
|
|
13090
|
+
bound_finder=value["bound_finder"],
|
|
13091
|
+
)
|
|
13092
|
+
|
|
13093
|
+
validation.all_passed = result_bool
|
|
13094
|
+
validation.n = 1
|
|
13095
|
+
validation.n_passed = int(result_bool)
|
|
13096
|
+
validation.n_failed = 1 - int(result_bool)
|
|
13097
|
+
|
|
13098
|
+
results_tbl = None
|
|
13099
|
+
|
|
12546
13100
|
elif assertion_type == "col_vals_expr":
|
|
12547
13101
|
results_tbl = col_vals_expr(
|
|
12548
13102
|
data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
|
|
@@ -12602,10 +13156,21 @@ class Validate:
|
|
|
12602
13156
|
# Add the schema validation info to the validation object
|
|
12603
13157
|
validation.val_info = schema_validation_info
|
|
12604
13158
|
|
|
13159
|
+
# Add a note with the schema expectation and results
|
|
13160
|
+
schema_note_html = _create_col_schema_match_note_html(
|
|
13161
|
+
schema_info=schema_validation_info, locale=self.locale
|
|
13162
|
+
)
|
|
13163
|
+
schema_note_text = _create_col_schema_match_note_text(
|
|
13164
|
+
schema_info=schema_validation_info
|
|
13165
|
+
)
|
|
13166
|
+
validation._add_note(
|
|
13167
|
+
key="schema_check", markdown=schema_note_html, text=schema_note_text
|
|
13168
|
+
)
|
|
13169
|
+
|
|
12605
13170
|
validation.all_passed = result_bool
|
|
12606
13171
|
validation.n = 1
|
|
12607
13172
|
validation.n_passed = int(result_bool)
|
|
12608
|
-
validation.n_failed = 1 - result_bool
|
|
13173
|
+
validation.n_failed = 1 - int(result_bool)
|
|
12609
13174
|
|
|
12610
13175
|
results_tbl = None
|
|
12611
13176
|
|
|
@@ -12620,7 +13185,7 @@ class Validate:
|
|
|
12620
13185
|
validation.all_passed = result_bool
|
|
12621
13186
|
validation.n = 1
|
|
12622
13187
|
validation.n_passed = int(result_bool)
|
|
12623
|
-
validation.n_failed = 1 - result_bool
|
|
13188
|
+
validation.n_failed = 1 - int(result_bool)
|
|
12624
13189
|
|
|
12625
13190
|
results_tbl = None
|
|
12626
13191
|
|
|
@@ -12632,7 +13197,7 @@ class Validate:
|
|
|
12632
13197
|
validation.all_passed = result_bool
|
|
12633
13198
|
validation.n = 1
|
|
12634
13199
|
validation.n_passed = int(result_bool)
|
|
12635
|
-
validation.n_failed = 1 - result_bool
|
|
13200
|
+
validation.n_failed = 1 - int(result_bool)
|
|
12636
13201
|
|
|
12637
13202
|
results_tbl = None
|
|
12638
13203
|
|
|
@@ -12651,7 +13216,7 @@ class Validate:
|
|
|
12651
13216
|
validation.all_passed = result_bool
|
|
12652
13217
|
validation.n = 1
|
|
12653
13218
|
validation.n_passed = int(result_bool)
|
|
12654
|
-
validation.n_failed = 1 - result_bool
|
|
13219
|
+
validation.n_failed = 1 - int(result_bool)
|
|
12655
13220
|
|
|
12656
13221
|
results_tbl = None
|
|
12657
13222
|
|
|
@@ -12663,14 +13228,53 @@ class Validate:
|
|
|
12663
13228
|
tbl_type=tbl_type,
|
|
12664
13229
|
)
|
|
12665
13230
|
|
|
13231
|
+
elif is_valid_agg(assertion_type):
|
|
13232
|
+
agg, comp = resolve_agg_registries(assertion_type)
|
|
13233
|
+
|
|
13234
|
+
# Produce a 1-column Narwhals DataFrame
|
|
13235
|
+
# TODO: Should be able to take lazy too
|
|
13236
|
+
vec: nw.DataFrame = nw.from_native(data_tbl_step).select(column)
|
|
13237
|
+
real = agg(vec)
|
|
13238
|
+
|
|
13239
|
+
raw_value = value["value"]
|
|
13240
|
+
tol = value["tol"]
|
|
13241
|
+
|
|
13242
|
+
# Handle ReferenceColumn: compute target from reference data
|
|
13243
|
+
if isinstance(raw_value, ReferenceColumn):
|
|
13244
|
+
if self.reference is None:
|
|
13245
|
+
raise ValueError(
|
|
13246
|
+
f"Cannot use ref('{raw_value.column_name}') without "
|
|
13247
|
+
"setting reference data on the Validate object. "
|
|
13248
|
+
"Use Validate(data=..., reference=...) to set reference data."
|
|
13249
|
+
)
|
|
13250
|
+
ref_vec: nw.DataFrame = nw.from_native(self.reference).select(
|
|
13251
|
+
raw_value.column_name
|
|
13252
|
+
)
|
|
13253
|
+
target: float | int = agg(ref_vec)
|
|
13254
|
+
else:
|
|
13255
|
+
target = raw_value
|
|
13256
|
+
|
|
13257
|
+
lower_diff, upper_diff = _derive_bounds(target, tol)
|
|
13258
|
+
|
|
13259
|
+
lower_bound = target - lower_diff
|
|
13260
|
+
upper_bound = target + upper_diff
|
|
13261
|
+
result_bool: bool = comp(real, lower_bound, upper_bound)
|
|
13262
|
+
|
|
13263
|
+
validation.all_passed = result_bool
|
|
13264
|
+
validation.n = 1
|
|
13265
|
+
validation.n_passed = int(result_bool)
|
|
13266
|
+
validation.n_failed = 1 - int(result_bool)
|
|
13267
|
+
|
|
13268
|
+
results_tbl = None
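In user-facing terms, this branch backs the aggregate validation methods (a later comment in this diff names `col_sum_gt`, `col_avg_eq`, and similar). A hedged sketch of a reference-backed check; the method name, the `ref()` spelling, and the exact parameters are assumptions drawn from the surrounding code rather than a confirmed API:

```python
import polars as pl
import pointblank as pb

current = pl.DataFrame({"sales": [120, 95, 110]})
baseline = pl.DataFrame({"sales": [118, 97, 108]})

# The expected value is the same aggregation applied to the `reference=` data;
# `tol=` widens it into [target - lower, target + upper] bounds.
validation = (
    pb.Validate(data=current, reference=baseline)
    .col_avg_eq(columns="sales", value=pb.ref("sales"), tol=0.05)
    .interrogate()
)
```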
|
|
12666
13269
|
else:
|
|
12667
13270
|
raise ValueError(
|
|
12668
13271
|
f"Unknown assertion type: {assertion_type}"
|
|
12669
13272
|
) # pragma: no cover
|
|
12670
13273
|
|
|
12671
13274
|
except Exception as e:
|
|
12672
|
-
#
|
|
13275
|
+
# Catch data quality errors and column not found errors
|
|
12673
13276
|
error_msg = str(e).lower()
|
|
13277
|
+
|
|
12674
13278
|
is_comparison_error = (
|
|
12675
13279
|
"boolean value of na is ambiguous" in error_msg
|
|
12676
13280
|
or "cannot compare" in error_msg
|
|
@@ -12681,20 +13285,101 @@ class Validate:
|
|
|
12681
13285
|
or ("dtype" in error_msg and "compare" in error_msg)
|
|
12682
13286
|
)
|
|
12683
13287
|
|
|
12684
|
-
|
|
12685
|
-
|
|
13288
|
+
is_column_not_found = "column" in error_msg and "not found" in error_msg
|
|
13289
|
+
|
|
13290
|
+
is_comparison_column_not_found = (
|
|
13291
|
+
"unable to find column" in error_msg and "valid columns" in error_msg
|
|
13292
|
+
)
|
|
13293
|
+
|
|
13294
|
+
if (
|
|
13295
|
+
is_comparison_error or is_column_not_found or is_comparison_column_not_found
|
|
13296
|
+
): # pragma: no cover
|
|
13297
|
+
# If data quality comparison fails or column not found, mark as eval_error
|
|
12686
13298
|
validation.eval_error = True # pragma: no cover
|
|
13299
|
+
|
|
13300
|
+
# Add a note for column not found errors (target column)
|
|
13301
|
+
if is_column_not_found:
|
|
13302
|
+
note_html = _create_column_not_found_note_html(
|
|
13303
|
+
column_name=column,
|
|
13304
|
+
available_columns=list(data_tbl_step.columns)
|
|
13305
|
+
if hasattr(data_tbl_step, "columns")
|
|
13306
|
+
else [],
|
|
13307
|
+
locale=self.locale,
|
|
13308
|
+
)
|
|
13309
|
+
note_text = _create_column_not_found_note_text(
|
|
13310
|
+
column_name=column,
|
|
13311
|
+
available_columns=list(data_tbl_step.columns)
|
|
13312
|
+
if hasattr(data_tbl_step, "columns")
|
|
13313
|
+
else [],
|
|
13314
|
+
)
|
|
13315
|
+
validation._add_note(
|
|
13316
|
+
key="column_not_found",
|
|
13317
|
+
markdown=note_html,
|
|
13318
|
+
text=note_text,
|
|
13319
|
+
)
|
|
13320
|
+
|
|
13321
|
+
# Add a note for comparison column not found errors
|
|
13322
|
+
elif is_comparison_column_not_found:
|
|
13323
|
+
# Extract column name from error message
|
|
13324
|
+
# Error format: 'unable to find column "col_name"; valid columns: ...'
|
|
13325
|
+
match = re.search(r'unable to find column "([^"]+)"', str(e))
|
|
13326
|
+
|
|
13327
|
+
if match:
|
|
13328
|
+
missing_col_name = match.group(1)
|
|
13329
|
+
|
|
13330
|
+
# Determine position for between/outside validations
|
|
13331
|
+
position = None
|
|
13332
|
+
if assertion_type in ["col_vals_between", "col_vals_outside"]:
|
|
13333
|
+
# Check if missing column is in left or right position
|
|
13334
|
+
from pointblank.column import Column
|
|
13335
|
+
|
|
13336
|
+
if (
|
|
13337
|
+
isinstance(value[0], Column)
|
|
13338
|
+
and value[0].exprs == missing_col_name
|
|
13339
|
+
):
|
|
13340
|
+
position = "left"
|
|
13341
|
+
elif (
|
|
13342
|
+
isinstance(value[1], Column)
|
|
13343
|
+
and value[1].exprs == missing_col_name
|
|
13344
|
+
):
|
|
13345
|
+
position = "right"
|
|
13346
|
+
|
|
13347
|
+
note_html = _create_comparison_column_not_found_note_html(
|
|
13348
|
+
column_name=missing_col_name,
|
|
13349
|
+
position=position,
|
|
13350
|
+
available_columns=list(data_tbl_step.columns)
|
|
13351
|
+
if hasattr(data_tbl_step, "columns")
|
|
13352
|
+
else [],
|
|
13353
|
+
locale=self.locale,
|
|
13354
|
+
)
|
|
13355
|
+
note_text = _create_comparison_column_not_found_note_text(
|
|
13356
|
+
column_name=missing_col_name,
|
|
13357
|
+
position=position,
|
|
13358
|
+
available_columns=list(data_tbl_step.columns)
|
|
13359
|
+
if hasattr(data_tbl_step, "columns")
|
|
13360
|
+
else [],
|
|
13361
|
+
)
|
|
13362
|
+
validation._add_note(
|
|
13363
|
+
key="comparison_column_not_found",
|
|
13364
|
+
markdown=note_html,
|
|
13365
|
+
text=note_text,
|
|
13366
|
+
)
|
|
13367
|
+
|
|
12687
13368
|
end_time = datetime.datetime.now(datetime.timezone.utc) # pragma: no cover
|
|
13369
|
+
|
|
12688
13370
|
validation.proc_duration_s = (
|
|
12689
13371
|
end_time - start_time
|
|
12690
13372
|
).total_seconds() # pragma: no cover
|
|
13373
|
+
|
|
12691
13374
|
validation.time_processed = end_time.isoformat(
|
|
12692
13375
|
timespec="milliseconds"
|
|
12693
13376
|
) # pragma: no cover
|
|
13377
|
+
|
|
12694
13378
|
validation.active = False # pragma: no cover
|
|
13379
|
+
|
|
12695
13380
|
continue # pragma: no cover
|
|
12696
13381
|
else:
|
|
12697
|
-
# For other errors
|
|
13382
|
+
# For other unexpected errors, let them propagate
|
|
12698
13383
|
raise
|
|
12699
13384
|
|
|
12700
13385
|
else:
|
|
@@ -12792,6 +13477,7 @@ class Validate:
|
|
|
12792
13477
|
markdown=threshold_note_html,
|
|
12793
13478
|
text=threshold_note_text,
|
|
12794
13479
|
)
|
|
13480
|
+
|
|
12795
13481
|
elif self.thresholds != Thresholds():
|
|
12796
13482
|
# Thresholds explicitly reset to empty when global thresholds exist
|
|
12797
13483
|
reset_note_html = _create_threshold_reset_note_html(locale=self.locale)
|
|
@@ -12814,6 +13500,8 @@ class Validate:
|
|
|
12814
13500
|
column=column,
|
|
12815
13501
|
values=value,
|
|
12816
13502
|
for_failure=True,
|
|
13503
|
+
locale=self.locale,
|
|
13504
|
+
n_rows=n_rows,
|
|
12817
13505
|
)
|
|
12818
13506
|
|
|
12819
13507
|
# Set the failure text in the validation step
|
|
@@ -13320,12 +14008,14 @@ class Validate:
|
|
|
13320
14008
|
)
|
|
13321
14009
|
|
|
13322
14010
|
# Get the threshold status using the appropriate method
|
|
14011
|
+
# Note: scalar=False (default) always returns a dict
|
|
14012
|
+
status: dict[int, bool]
|
|
13323
14013
|
if level == "warning":
|
|
13324
|
-
status = self.warning(i=i)
|
|
14014
|
+
status = self.warning(i=i) # type: ignore[assignment]
|
|
13325
14015
|
elif level == "error":
|
|
13326
|
-
status = self.error(i=i)
|
|
13327
|
-
|
|
13328
|
-
status = self.critical(i=i)
|
|
14016
|
+
status = self.error(i=i) # type: ignore[assignment]
|
|
14017
|
+
else: # level == "critical"
|
|
14018
|
+
status = self.critical(i=i) # type: ignore[assignment]
|
|
13329
14019
|
|
|
13330
14020
|
# Find any steps that exceeded the threshold
|
|
13331
14021
|
failures = []
|
|
@@ -13479,12 +14169,14 @@ class Validate:
|
|
|
13479
14169
|
)
|
|
13480
14170
|
|
|
13481
14171
|
# Get the threshold status using the appropriate method
|
|
14172
|
+
# Note: scalar=False (default) always returns a dict
|
|
14173
|
+
status: dict[int, bool]
|
|
13482
14174
|
if level == "warning":
|
|
13483
|
-
status = self.warning(i=i)
|
|
14175
|
+
status = self.warning(i=i) # type: ignore[assignment]
|
|
13484
14176
|
elif level == "error":
|
|
13485
|
-
status = self.error(i=i)
|
|
13486
|
-
|
|
13487
|
-
status = self.critical(i=i)
|
|
14177
|
+
status = self.error(i=i) # type: ignore[assignment]
|
|
14178
|
+
else: # level == "critical"
|
|
14179
|
+
status = self.critical(i=i) # type: ignore[assignment]
|
|
13488
14180
|
|
|
13489
14181
|
# Return True if any steps exceeded the threshold
|
|
13490
14182
|
return any(status.values())
|
|
@@ -14257,7 +14949,7 @@ class Validate:
|
|
|
14257
14949
|
|
|
14258
14950
|
def get_data_extracts(
|
|
14259
14951
|
self, i: int | list[int] | None = None, frame: bool = False
|
|
14260
|
-
) -> dict[int,
|
|
14952
|
+
) -> dict[int, Any] | Any:
|
|
14261
14953
|
"""
|
|
14262
14954
|
Get the rows that failed for each validation step.
|
|
14263
14955
|
|
|
@@ -14280,7 +14972,7 @@ class Validate:
|
|
|
14280
14972
|
|
|
14281
14973
|
Returns
|
|
14282
14974
|
-------
|
|
14283
|
-
dict[int,
|
|
14975
|
+
dict[int, Any] | Any
|
|
14284
14976
|
A dictionary of tables containing the rows that failed in every compatible validation
|
|
14285
14977
|
step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
|
|
14286
14978
|
|
|
@@ -14570,7 +15262,7 @@ class Validate:
|
|
|
14570
15262
|
|
|
14571
15263
|
return json.dumps(report, indent=4, default=str)
|
|
14572
15264
|
|
|
14573
|
-
def get_sundered_data(self, type="pass") ->
|
|
15265
|
+
def get_sundered_data(self, type="pass") -> Any:
|
|
14574
15266
|
"""
|
|
14575
15267
|
Get the data that passed or failed the validation steps.
|
|
14576
15268
|
|
|
@@ -14606,7 +15298,7 @@ class Validate:
|
|
|
14606
15298
|
|
|
14607
15299
|
Returns
|
|
14608
15300
|
-------
|
|
14609
|
-
|
|
15301
|
+
Any
|
|
14610
15302
|
A table containing the data that passed or failed the validation steps.
|
|
14611
15303
|
|
|
14612
15304
|
Examples
|
|
@@ -14698,6 +15390,7 @@ class Validate:
|
|
|
14698
15390
|
# Get all validation step result tables and join together the `pb_is_good_` columns
|
|
14699
15391
|
# ensuring that the columns are named uniquely (e.g., `pb_is_good_1`, `pb_is_good_2`, ...)
|
|
14700
15392
|
# and that the index is reset
|
|
15393
|
+
labeled_tbl_nw: nw.DataFrame | nw.LazyFrame | None = None
|
|
14701
15394
|
for i, validation in enumerate(validation_info):
|
|
14702
15395
|
results_tbl = nw.from_native(validation.tbl_checked)
|
|
14703
15396
|
|
|
@@ -14718,7 +15411,7 @@ class Validate:
|
|
|
14718
15411
|
)
|
|
14719
15412
|
|
|
14720
15413
|
# Add the results table to the list of tables
|
|
14721
|
-
if
|
|
15414
|
+
if labeled_tbl_nw is None:
|
|
14722
15415
|
labeled_tbl_nw = results_tbl
|
|
14723
15416
|
else:
|
|
14724
15417
|
labeled_tbl_nw = labeled_tbl_nw.join(results_tbl, on=index_name, how="left")
|
|
@@ -14892,7 +15585,12 @@ class Validate:
|
|
|
14892
15585
|
return None
|
|
14893
15586
|
|
|
14894
15587
|
def get_tabular_report(
|
|
14895
|
-
self,
|
|
15588
|
+
self,
|
|
15589
|
+
title: str | None = ":default:",
|
|
15590
|
+
incl_header: bool | None = None,
|
|
15591
|
+
incl_footer: bool | None = None,
|
|
15592
|
+
incl_footer_timings: bool | None = None,
|
|
15593
|
+
incl_footer_notes: bool | None = None,
|
|
14896
15594
|
) -> GT:
|
|
14897
15595
|
"""
|
|
14898
15596
|
Validation report as a GT table.
|
|
@@ -14915,6 +15613,20 @@ class Validate:
|
|
|
14915
15613
|
name of the table as the title for the report. If no title is wanted, then `":none:"`
|
|
14916
15614
|
can be used. Aside from keyword options, text can be provided for the title. This will
|
|
14917
15615
|
be interpreted as Markdown text and transformed internally to HTML.
|
|
15616
|
+
incl_header
|
|
15617
|
+
Controls whether the header section should be displayed. If `None`, uses the global
|
|
15618
|
+
configuration setting. The header contains the table name, label, and threshold
|
|
15619
|
+
information.
|
|
15620
|
+
incl_footer
|
|
15621
|
+
Controls whether the footer section should be displayed. If `None`, uses the global
|
|
15622
|
+
configuration setting. The footer can contain validation timing information and notes.
|
|
15623
|
+
incl_footer_timings
|
|
15624
|
+
Controls whether validation timing information (start time, duration, end time) should
|
|
15625
|
+
be displayed in the footer. If `None`, uses the global configuration setting. Only
|
|
15626
|
+
applies when `incl_footer=True`.
|
|
15627
|
+
incl_footer_notes
|
|
15628
|
+
Controls whether notes from validation steps should be displayed in the footer. If
|
|
15629
|
+
`None`, uses the global configuration setting. Only applies when `incl_footer=True`.
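For example, the new parameters make it possible to keep step notes while dropping the timing line (a small usage sketch; `validation` is assumed to be an interrogated `Validate` object):

```python
report = validation.get_tabular_report(
    incl_footer=True,
    incl_footer_timings=False,  # hide start/duration/end times
    incl_footer_notes=True,     # keep per-step notes
)
```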
|
|
14918
15630
|
|
|
14919
15631
|
Returns
|
|
14920
15632
|
-------
|
|
@@ -14974,6 +15686,10 @@ class Validate:
|
|
|
14974
15686
|
incl_header = global_config.report_incl_header
|
|
14975
15687
|
if incl_footer is None:
|
|
14976
15688
|
incl_footer = global_config.report_incl_footer
|
|
15689
|
+
if incl_footer_timings is None:
|
|
15690
|
+
incl_footer_timings = global_config.report_incl_footer_timings
|
|
15691
|
+
if incl_footer_notes is None:
|
|
15692
|
+
incl_footer_notes = global_config.report_incl_footer_notes
|
|
14977
15693
|
|
|
14978
15694
|
# Do we have a DataFrame library to work with?
|
|
14979
15695
|
_check_any_df_lib(method_used="get_tabular_report")
|
|
@@ -15212,30 +15928,59 @@ class Validate:
|
|
|
15212
15928
|
columns_upd = []
|
|
15213
15929
|
|
|
15214
15930
|
columns = validation_info_dict["column"]
|
|
15931
|
+
notes = validation_info_dict["notes"]
|
|
15215
15932
|
|
|
15216
15933
|
assertion_type = validation_info_dict["assertion_type"]
|
|
15217
15934
|
|
|
15218
15935
|
# Iterate over the values in the `column` entry
|
|
15219
15936
|
for i, column in enumerate(columns):
|
|
15937
|
+
# Check if this validation has a synthetic target column note
|
|
15938
|
+
has_synthetic_column = (
|
|
15939
|
+
notes[i] is not None and isinstance(notes[i], dict) and "syn_target_col" in notes[i]
|
|
15940
|
+
)
|
|
15941
|
+
|
|
15942
|
+
column_text = None
|
|
15943
|
+
|
|
15220
15944
|
if assertion_type[i] in [
|
|
15221
15945
|
"col_schema_match",
|
|
15222
15946
|
"row_count_match",
|
|
15223
15947
|
"col_count_match",
|
|
15224
15948
|
"col_vals_expr",
|
|
15225
15949
|
]:
|
|
15226
|
-
|
|
15950
|
+
column_text = "—"
|
|
15227
15951
|
elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
|
|
15228
15952
|
if not column:
|
|
15229
15953
|
# If there is no column subset, then all columns are used
|
|
15230
|
-
|
|
15954
|
+
column_text = "ALL COLUMNS"
|
|
15231
15955
|
else:
|
|
15232
15956
|
# With a column subset list, format with commas between the column names
|
|
15233
|
-
|
|
15234
|
-
|
|
15957
|
+
column_text = ", ".join(column)
|
|
15235
15958
|
elif assertion_type[i] in ["conjointly", "specially"]:
|
|
15236
|
-
|
|
15959
|
+
column_text = ""
|
|
15237
15960
|
else:
|
|
15238
|
-
|
|
15961
|
+
# Handle both string columns and list columns
|
|
15962
|
+
# For single-element lists like ['a'], display as 'a'
|
|
15963
|
+
# For multi-element lists, display as comma-separated values
|
|
15964
|
+
if isinstance(column, list):
|
|
15965
|
+
column_text = ", ".join(str(c) for c in column)
|
|
15966
|
+
else:
|
|
15967
|
+
column_text = str(column)
|
|
15968
|
+
|
|
15969
|
+
# Apply underline styling for synthetic columns; only apply styling if column_text is
|
|
15970
|
+
# not empty and not a special marker
|
|
15971
|
+
if (
|
|
15972
|
+
has_synthetic_column
|
|
15973
|
+
and column_text
|
|
15974
|
+
and column_text not in ["—", "ALL COLUMNS", ""]
|
|
15975
|
+
):
|
|
15976
|
+
column_text = (
|
|
15977
|
+
f'<span style="text-decoration: underline; '
|
|
15978
|
+
f"text-decoration-color: #9A7CB4; text-decoration-thickness: 1px; "
|
|
15979
|
+
f'text-underline-offset: 3px;">'
|
|
15980
|
+
f"{column_text}</span>"
|
|
15981
|
+
)
|
|
15982
|
+
|
|
15983
|
+
columns_upd.append(column_text)
|
|
15239
15984
|
|
|
15240
15985
|
# Add the `columns_upd` entry to the dictionary
|
|
15241
15986
|
validation_info_dict["columns_upd"] = columns_upd
|
|
@@ -15291,6 +16036,15 @@ class Validate:
|
|
|
15291
16036
|
]:
|
|
15292
16037
|
values_upd.append("—")
|
|
15293
16038
|
|
|
16039
|
+
elif assertion_type[i] in ["col_pct_null"]:
|
|
16040
|
+
# Extract p and tol from the values dict for nice formatting
|
|
16041
|
+
p_value = value["p"]
|
|
16042
|
+
|
|
16043
|
+
# Extract tol from the bound_finder partial function
|
|
16044
|
+
bound_finder = value.get("bound_finder")
|
|
16045
|
+
tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
|
|
16046
|
+
values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
|
|
16047
|
+
|
|
15294
16048
|
elif assertion_type[i] in ["col_schema_match"]:
|
|
15295
16049
|
values_upd.append("SCHEMA")
|
|
15296
16050
|
|
|
@@ -15332,6 +16086,32 @@ class Validate:
|
|
|
15332
16086
|
else: # pragma: no cover
|
|
15333
16087
|
values_upd.append(str(value)) # pragma: no cover
|
|
15334
16088
|
|
|
16089
|
+
# Handle aggregation methods (col_sum_gt, col_avg_eq, etc.)
|
|
16090
|
+
elif is_valid_agg(assertion_type[i]):
|
|
16091
|
+
# Extract the value and tolerance from the values dict
|
|
16092
|
+
agg_value = value.get("value")
|
|
16093
|
+
tol_value = value.get("tol", 0)
|
|
16094
|
+
|
|
16095
|
+
# Format the value (could be a number, Column, or ReferenceColumn)
|
|
16096
|
+
if hasattr(agg_value, "__repr__"):
|
|
16097
|
+
# For Column or ReferenceColumn objects, use their repr
|
|
16098
|
+
value_str = repr(agg_value)
|
|
16099
|
+
else:
|
|
16100
|
+
value_str = str(agg_value)
|
|
16101
|
+
|
|
16102
|
+
# Format tolerance - only show on second line if non-zero
|
|
16103
|
+
if tol_value != 0:
|
|
16104
|
+
# Format tolerance based on its type
|
|
16105
|
+
if isinstance(tol_value, tuple):
|
|
16106
|
+
# Asymmetric bounds: (lower, upper)
|
|
16107
|
+
tol_str = f"tol=({tol_value[0]}, {tol_value[1]})"
|
|
16108
|
+
else:
|
|
16109
|
+
# Symmetric tolerance
|
|
16110
|
+
tol_str = f"tol={tol_value}"
|
|
16111
|
+
values_upd.append(f"{value_str}<br/>{tol_str}")
|
|
16112
|
+
else:
|
|
16113
|
+
values_upd.append(value_str)
|
|
16114
|
+
|
|
15335
16115
|
# If the assertion type is not recognized, add the value as a string
|
|
15336
16116
|
else: # pragma: no cover
|
|
15337
16117
|
values_upd.append(str(value)) # pragma: no cover
|
|
@@ -15766,13 +16546,15 @@ class Validate:
|
|
|
15766
16546
|
gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
|
|
15767
16547
|
|
|
15768
16548
|
if incl_footer:
|
|
15769
|
-
# Add table time as HTML source note
|
|
15770
|
-
|
|
16549
|
+
# Add table time as HTML source note if enabled
|
|
16550
|
+
if incl_footer_timings:
|
|
16551
|
+
gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
|
|
15771
16552
|
|
|
15772
|
-
# Create notes markdown from validation steps and add as separate source note
|
|
15773
|
-
|
|
15774
|
-
|
|
15775
|
-
|
|
16553
|
+
# Create notes markdown from validation steps and add as separate source note if enabled
|
|
16554
|
+
if incl_footer_notes:
|
|
16555
|
+
notes_markdown = _create_notes_html(self.validation_info)
|
|
16556
|
+
if notes_markdown:
|
|
16557
|
+
gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
|
|
15776
16558
|
|
|
15777
16559
|
# If the interrogation has not been performed, then style the table columns dealing with
|
|
15778
16560
|
# interrogation data as grayed out
|
|
@@ -16179,7 +16961,7 @@ class Validate:
|
|
|
16179
16961
|
table = validation.pre(self.data)
|
|
16180
16962
|
|
|
16181
16963
|
# Get the columns from the table as a list
|
|
16182
|
-
columns = list(table.columns)
|
|
16964
|
+
columns = list(table.columns) # type: ignore[union-attr]
|
|
16183
16965
|
|
|
16184
16966
|
# Evaluate the column expression
|
|
16185
16967
|
if isinstance(column_expr, ColumnSelectorNarwhals):
|
|
@@ -16189,6 +16971,12 @@ class Validate:
|
|
|
16189
16971
|
|
|
16190
16972
|
except Exception: # pragma: no cover
|
|
16191
16973
|
validation.eval_error = True
|
|
16974
|
+
columns_resolved = []
|
|
16975
|
+
# Store columns list for note generation
|
|
16976
|
+
try:
|
|
16977
|
+
columns = list(table.columns) if "table" in locals() else []
|
|
16978
|
+
except Exception:
|
|
16979
|
+
columns = []
|
|
16192
16980
|
|
|
16193
16981
|
# If no columns were resolved, then create a patched validation step with the
|
|
16194
16982
|
# `eval_error` and `column` attributes set
|
|
@@ -16196,6 +16984,22 @@ class Validate:
|
|
|
16196
16984
|
validation.eval_error = True
|
|
16197
16985
|
validation.column = str(column_expr)
|
|
16198
16986
|
|
|
16987
|
+
# Add a helpful note explaining that no columns were resolved
|
|
16988
|
+
note_html = _create_no_columns_resolved_note_html(
|
|
16989
|
+
column_expr=str(column_expr),
|
|
16990
|
+
available_columns=columns,
|
|
16991
|
+
locale=self.locale,
|
|
16992
|
+
)
|
|
16993
|
+
note_text = _create_no_columns_resolved_note_text(
|
|
16994
|
+
column_expr=str(column_expr),
|
|
16995
|
+
available_columns=columns,
|
|
16996
|
+
)
|
|
16997
|
+
validation._add_note(
|
|
16998
|
+
key="no_columns_resolved",
|
|
16999
|
+
markdown=note_html,
|
|
17000
|
+
text=note_text,
|
|
17001
|
+
)
|
|
17002
|
+
|
|
16199
17003
|
expanded_validation_info.append(validation)
|
|
16200
17004
|
continue
|
|
16201
17005
|
|
|
@@ -16535,7 +17339,7 @@ def _convert_string_to_datetime(value: str) -> datetime.datetime:
|
|
|
16535
17339
|
return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
|
|
16536
17340
|
|
|
16537
17341
|
|
|
16538
|
-
def _string_date_dttm_conversion(value:
|
|
17342
|
+
def _string_date_dttm_conversion(value: Any) -> Any:
|
|
16539
17343
|
"""
|
|
16540
17344
|
Convert a string to a date or datetime object if it is in the correct format.
|
|
16541
17345
|
If the value is not a string, it is returned as is.
|
|
@@ -16570,8 +17374,8 @@ def _string_date_dttm_conversion(value: any) -> any:
|
|
|
16570
17374
|
|
|
16571
17375
|
|
|
16572
17376
|
def _conditional_string_date_dttm_conversion(
|
|
16573
|
-
value:
|
|
16574
|
-
) ->
|
|
17377
|
+
value: Any, allow_regular_strings: bool = False
|
|
17378
|
+
) -> Any:
|
|
16575
17379
|
"""
|
|
16576
17380
|
Conditionally convert a string to a date or datetime object if it is in the correct format. If
|
|
16577
17381
|
`allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
|
|
@@ -16615,9 +17419,9 @@ def _process_brief(
|
|
|
16615
17419
|
brief: str | None,
|
|
16616
17420
|
step: int,
|
|
16617
17421
|
col: str | list[str] | None,
|
|
16618
|
-
values:
|
|
16619
|
-
thresholds:
|
|
16620
|
-
segment:
|
|
17422
|
+
values: Any | None,
|
|
17423
|
+
thresholds: Any | None,
|
|
17424
|
+
segment: Any | None,
|
|
16621
17425
|
) -> str:
|
|
16622
17426
|
# If there is no brief, return `None`
|
|
16623
17427
|
if brief is None:
|
|
@@ -16704,7 +17508,7 @@ def _process_action_str(
|
|
|
16704
17508
|
action_str: str,
|
|
16705
17509
|
step: int,
|
|
16706
17510
|
col: str | None,
|
|
16707
|
-
value:
|
|
17511
|
+
value: Any,
|
|
16708
17512
|
type: str,
|
|
16709
17513
|
level: str,
|
|
16710
17514
|
time: str,
|
|
@@ -16754,7 +17558,13 @@ def _process_action_str(
|
|
|
16754
17558
|
|
|
16755
17559
|
|
|
16756
17560
|
def _create_autobrief_or_failure_text(
|
|
16757
|
-
assertion_type: str,
|
|
17561
|
+
assertion_type: str,
|
|
17562
|
+
lang: str,
|
|
17563
|
+
column: str,
|
|
17564
|
+
values: Any,
|
|
17565
|
+
for_failure: bool,
|
|
17566
|
+
locale: str | None = None,
|
|
17567
|
+
n_rows: int | None = None,
|
|
16758
17568
|
) -> str:
|
|
16759
17569
|
if assertion_type in [
|
|
16760
17570
|
"col_vals_gt",
|
|
@@ -16878,6 +17688,16 @@ def _create_autobrief_or_failure_text(
|
|
|
16878
17688
|
for_failure=for_failure,
|
|
16879
17689
|
)
|
|
16880
17690
|
|
|
17691
|
+
if assertion_type == "col_pct_null":
|
|
17692
|
+
return _create_text_col_pct_null(
|
|
17693
|
+
lang=lang,
|
|
17694
|
+
column=column,
|
|
17695
|
+
value=values,
|
|
17696
|
+
for_failure=for_failure,
|
|
17697
|
+
locale=locale if locale else lang,
|
|
17698
|
+
n_rows=n_rows,
|
|
17699
|
+
)
|
|
17700
|
+
|
|
16881
17701
|
if assertion_type == "conjointly":
|
|
16882
17702
|
return _create_text_conjointly(lang=lang, for_failure=for_failure)
|
|
16883
17703
|
|
|
@@ -16893,7 +17713,7 @@ def _create_autobrief_or_failure_text(
|
|
|
16893
17713
|
for_failure=for_failure,
|
|
16894
17714
|
)
|
|
16895
17715
|
|
|
16896
|
-
return None
|
|
17716
|
+
return None
|
|
16897
17717
|
|
|
16898
17718
|
|
|
16899
17719
|
def _expect_failure_type(for_failure: bool) -> str:
|
|
@@ -16903,7 +17723,7 @@ def _expect_failure_type(for_failure: bool) -> str:
|
|
|
16903
17723
|
def _create_text_comparison(
|
|
16904
17724
|
assertion_type: str,
|
|
16905
17725
|
lang: str,
|
|
16906
|
-
column: str | list[str]
|
|
17726
|
+
column: str | list[str],
|
|
16907
17727
|
values: str | None,
|
|
16908
17728
|
for_failure: bool = False,
|
|
16909
17729
|
) -> str:
|
|
@@ -16929,7 +17749,7 @@ def _create_text_comparison(
|
|
|
16929
17749
|
|
|
16930
17750
|
def _create_text_between(
|
|
16931
17751
|
lang: str,
|
|
16932
|
-
column: str
|
|
17752
|
+
column: str,
|
|
16933
17753
|
value_1: str,
|
|
16934
17754
|
value_2: str,
|
|
16935
17755
|
not_: bool = False,
|
|
@@ -16959,7 +17779,7 @@ def _create_text_between(
|
|
|
16959
17779
|
|
|
16960
17780
|
|
|
16961
17781
|
def _create_text_set(
|
|
16962
|
-
lang: str, column: str
|
|
17782
|
+
lang: str, column: str, values: list[Any], not_: bool = False, for_failure: bool = False
|
|
16963
17783
|
) -> str:
|
|
16964
17784
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
16965
17785
|
|
|
@@ -16981,9 +17801,7 @@ def _create_text_set(
|
|
|
16981
17801
|
return text
|
|
16982
17802
|
|
|
16983
17803
|
|
|
16984
|
-
def _create_text_null(
|
|
16985
|
-
lang: str, column: str | None, not_: bool = False, for_failure: bool = False
|
|
16986
|
-
) -> str:
|
|
17804
|
+
def _create_text_null(lang: str, column: str, not_: bool = False, for_failure: bool = False) -> str:
|
|
16987
17805
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
16988
17806
|
|
|
16989
17807
|
column_text = _prep_column_text(column=column)
|
|
@@ -17000,9 +17818,7 @@ def _create_text_null(
|
|
|
17000
17818
|
return text
|
|
17001
17819
|
|
|
17002
17820
|
|
|
17003
|
-
def _create_text_regex(
|
|
17004
|
-
lang: str, column: str | None, pattern: str | dict, for_failure: bool = False
|
|
17005
|
-
) -> str:
|
|
17821
|
+
def _create_text_regex(lang: str, column: str, pattern: str, for_failure: bool = False) -> str:
|
|
17006
17822
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17007
17823
|
|
|
17008
17824
|
column_text = _prep_column_text(column=column)
|
|
@@ -17034,7 +17850,7 @@ def _create_text_expr(lang: str, for_failure: bool) -> str:
|
|
|
17034
17850
|
return EXPECT_FAIL_TEXT[f"col_vals_expr_{type_}_text"][lang]
|
|
17035
17851
|
|
|
17036
17852
|
|
|
17037
|
-
def _create_text_col_exists(lang: str, column: str
|
|
17853
|
+
def _create_text_col_exists(lang: str, column: str, for_failure: bool = False) -> str:
|
|
17038
17854
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17039
17855
|
|
|
17040
17856
|
column_text = _prep_column_text(column=column)
|
|
@@ -17084,7 +17900,7 @@ def _create_text_rows_complete(
|
|
|
17084
17900
|
return text
|
|
17085
17901
|
|
|
17086
17902
|
|
|
17087
|
-
def _create_text_row_count_match(lang: str, value:
|
|
17903
|
+
def _create_text_row_count_match(lang: str, value: dict, for_failure: bool = False) -> str:
|
|
17088
17904
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17089
17905
|
|
|
17090
17906
|
values_text = _prep_values_text(value["count"], lang=lang)
|
|
@@ -17092,7 +17908,7 @@ def _create_text_row_count_match(lang: str, value: int, for_failure: bool = Fals
|
|
|
17092
17908
|
return EXPECT_FAIL_TEXT[f"row_count_match_n_{type_}_text"][lang].format(values_text=values_text)
|
|
17093
17909
|
|
|
17094
17910
|
|
|
17095
|
-
def _create_text_col_count_match(lang: str, value:
|
|
17911
|
+
def _create_text_col_count_match(lang: str, value: dict, for_failure: bool = False) -> str:
|
|
17096
17912
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17097
17913
|
|
|
17098
17914
|
values_text = _prep_values_text(value["count"], lang=lang)
|
|
@@ -17100,6 +17916,115 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = False
     return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
 
 
+def _create_text_col_pct_null(
+    lang: str,
+    column: str | None,
+    value: dict,
+    for_failure: bool = False,
+    locale: str | None = None,
+    n_rows: int | None = None,
+) -> str:
+    """Create text for col_pct_null validation with tolerance handling."""
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    column_text = _prep_column_text(column=column)
+
+    # Use locale for number formatting, defaulting to lang if not provided
+    fmt_locale = locale if locale else lang
+
+    # Extract p and tol from the values dict
+    p_value = value.get("p", 0) * 100  # Convert to percentage
+    p_value_original = value.get("p", 0)  # Keep original value for deviation format
+
+    # Extract tol from the bound_finder partial function
+    bound_finder = value.get("bound_finder")
+    tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
+
+    # Handle different tolerance types
+    has_tolerance = False
+    is_asymmetric = False
+
+    if isinstance(tol_value, tuple):
+        # Tuple tolerance: can be (lower, upper) in absolute or relative terms
+        tol_lower, tol_upper = tol_value
+
+        # Check if we have any non-zero tolerance
+        has_tolerance = tol_lower != 0 or tol_upper != 0
+        is_asymmetric = tol_lower != tol_upper
+
+        # For relative tolerances (floats < 1), we can compute exact percentage bounds
+        # For absolute tolerances (ints >= 1), calculate based on actual row count if available
+        if tol_lower < 1:
+            # Relative tolerance (float)
+            lower_pct_delta = tol_lower * 100
+        else:
+            # Absolute tolerance (int); uses actual row count if available
+            if n_rows is not None and n_rows > 0:
+                lower_pct_delta = (tol_lower / n_rows) * 100
+            else:
+                lower_pct_delta = tol_lower  # Fallback approximation
+
+        if tol_upper < 1:
+            # Relative tolerance (float)
+            upper_pct_delta = tol_upper * 100
+        else:
+            # Absolute tolerance (int); uses actual row count if available
+            if n_rows is not None and n_rows > 0:
+                upper_pct_delta = (tol_upper / n_rows) * 100
+            else:
+                upper_pct_delta = tol_upper  # Fallback approximation
+    else:
+        # Single value tolerance: symmetric
+        has_tolerance = tol_value != 0
+
+        if tol_value < 1:
+            # Relative tolerance (float)
+            tol_pct = tol_value * 100
+        else:
+            # Absolute tolerance (int) - use actual row count if available
+            if n_rows is not None and n_rows > 0:
+                tol_pct = (tol_value / n_rows) * 100
+            else:
+                tol_pct = tol_value  # Fallback approximation
+
+        lower_pct_delta = tol_pct
+        upper_pct_delta = tol_pct
+
+    # Format numbers with locale-aware formatting
+    p_formatted = _format_number_safe(p_value, decimals=1, locale=fmt_locale)
+    p_original_formatted = _format_number_safe(p_value_original, decimals=2, locale=fmt_locale)
+
+    # Choose the appropriate translation key based on tolerance
+    if not has_tolerance:
+        # No tolerance - use simple text
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text"][lang].format(
+            column_text=column_text,
+            p=p_formatted,
+        )
+    elif is_asymmetric or isinstance(tol_value, tuple):
+        # Use deviation format for tuple tolerances (including symmetric ones)
+        # Format the deviation values with signs (using proper minus sign U+2212)
+        lower_dev = f"−{_format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)}%"
+        upper_dev = f"+{_format_number_safe(upper_pct_delta, decimals=1, locale=fmt_locale)}%"
+
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol_deviation"][lang].format(
+            column_text=column_text,
+            lower_dev=lower_dev,
+            upper_dev=upper_dev,
+            p=p_original_formatted,
+        )
+    else:
+        # Single value tolerance - use the symmetric ± format
+        tol_formatted = _format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol"][lang].format(
+            column_text=column_text,
+            p=p_formatted,
+            tol=tol_formatted,
+        )
+
+    return text
+
+
 def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
     type_ = _expect_failure_type(for_failure=for_failure)
 
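Note: the new `_create_text_col_pct_null()` above distinguishes relative tolerances (floats below 1, read as fractions) from absolute tolerances (integer row counts, converted to a percentage using the table's row count). A minimal standalone sketch of that conversion rule, using a hypothetical helper name that is not part of pointblank's API:

    # Sketch only: mirrors the threshold-at-1 rule from the diff above.
    def pct_delta(tol: float, n_rows: int | None) -> float:
        if tol < 1:
            # Relative tolerance: 0.05 -> 5 percentage points
            return tol * 100
        if n_rows:
            # Absolute tolerance: 10 rows out of 400 -> 2.5 percentage points
            return (tol / n_rows) * 100
        return tol  # Fallback approximation when the row count is unknown

    assert pct_delta(0.05, None) == 5.0
    assert pct_delta(10, 400) == 2.5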
@@ -17120,19 +18045,13 @@ def _create_text_prompt(lang: str, prompt: str, for_failure: bool = False) -> str:
 def _prep_column_text(column: str | list[str]) -> str:
     if isinstance(column, list):
         return "`" + str(column[0]) + "`"
-
+    if isinstance(column, str):
         return "`" + column + "`"
-
-    return ""
+    raise AssertionError
 
 
 def _prep_values_text(
-    values: str
-    | int
-    | float
-    | datetime.datetime
-    | datetime.date
-    | list[str | int | float | datetime.datetime | datetime.date],
+    values: _CompliantValue | _CompliantValues,
     lang: str,
     limit: int = 3,
 ) -> str:
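Note: `_prep_column_text()` now raises `AssertionError` on unexpected input instead of silently returning an empty string, so type mistakes surface at report-generation time. A self-contained copy of the rewritten logic, for illustration:

    def prep_column_text(column):
        # Lists use their first element, strings are wrapped in backticks,
        # and anything else is treated as a programming error.
        if isinstance(column, list):
            return "`" + str(column[0]) + "`"
        if isinstance(column, str):
            return "`" + column + "`"
        raise AssertionError

    assert prep_column_text(["a", "b"]) == "`a`"
    assert prep_column_text("a") == "`a`"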
@@ -17180,7 +18099,7 @@ def _prep_values_text(
     return values_str
 
 
-def _seg_expr_from_string(data_tbl:
+def _seg_expr_from_string(data_tbl: Any, segments_expr: str) -> tuple[str, str]:
     """
     Obtain the segmentation categories from a table column.
 
@@ -17283,7 +18202,7 @@ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, Any]]:
     return seg_tuples
 
 
-def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
+def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any:
     """
     Apply the segments expression to the data table.
 
@@ -17347,8 +18266,26 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
             except ValueError:  # pragma: no cover
                 pass  # pragma: no cover
 
-        # Format 2:
-        #
+        # Format 2: Direct datetime strings like "2016-01-04 00:00:01" (Polars 1.36+)
+        # These don't have UTC suffix anymore
+        elif (
+            " " in segment_str
+            and "UTC" not in segment_str
+            and "[" not in segment_str
+            and ".alias" not in segment_str
+        ):
+            try:
+                parsed_dt = datetime.fromisoformat(segment_str)
+                # Convert midnight datetimes to dates for consistency
+                if parsed_dt.time() == datetime.min.time():
+                    parsed_value = parsed_dt.date()  # pragma: no cover
+                else:
+                    parsed_value = parsed_dt
+            except ValueError:  # pragma: no cover
+                pass  # pragma: no cover
+
+        # Format 3: Datetime strings with UTC timezone like
+        # "2016-01-04 00:00:01 UTC.strict_cast(...)" (Polars < 1.36)
         elif " UTC" in segment_str:
             try:
                 # Extract just the datetime part before "UTC"
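Note: the new "Format 2" branch handles the plain datetime strings that newer Polars versions emit for segment labels. A small standard-library sketch of the midnight-to-date normalization it performs:

    from datetime import datetime

    # Midnight timestamps collapse to dates, mirroring the branch above.
    parsed = datetime.fromisoformat("2016-01-04 00:00:00")
    value = parsed.date() if parsed.time() == datetime.min.time() else parsed
    assert str(value) == "2016-01-04"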
@@ -17363,7 +18300,7 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
             except (ValueError, IndexError):  # pragma: no cover
                 pass  # pragma: no cover
 
-        # Format
+        # Format 4: Bracketed expressions like ['2016-01-04']
         elif segment_str.startswith("[") and segment_str.endswith("]"):
             try:  # pragma: no cover
                 # Remove [' and ']
@@ -17498,7 +18435,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
 
 def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
     # For each icon, get the assertion icon SVG test from SVG_ICONS_FOR_ASSERTION_TYPES dictionary
-    icon_svg = [SVG_ICONS_FOR_ASSERTION_TYPES
+    icon_svg: list[str] = [SVG_ICONS_FOR_ASSERTION_TYPES[icon] for icon in icon]
 
     # Replace the width and height in the SVG string
     for i in range(len(icon_svg)):
@@ -17507,11 +18444,9 @@ def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
     return icon_svg
 
 
-def _replace_svg_dimensions(svg:
+def _replace_svg_dimensions(svg: str, height_width: int | float) -> str:
     svg = re.sub(r'width="[0-9]*?px', f'width="{height_width}px', svg)
-    svg = re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)
-
-    return svg
+    return re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)
 
 
 def _get_title_text(
@@ -17575,7 +18510,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> str:
     return title_text
 
 
-def _transform_tbl_preprocessed(pre:
+def _transform_tbl_preprocessed(pre: Any, seg: Any, interrogation_performed: bool) -> list[str]:
     # If no interrogation was performed, return a list of empty strings
     if not interrogation_performed:
         return ["" for _ in range(len(pre))]
@@ -17597,9 +18532,7 @@ def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool
 
 def _get_preprocessed_table_icon(icon: list[str]) -> list[str]:
     # For each icon, get the SVG icon from the SVG_ICONS_FOR_TBL_STATUS dictionary
-    icon_svg = [SVG_ICONS_FOR_TBL_STATUS[icon] for icon in icon]
-
-    return icon_svg
+    return [SVG_ICONS_FOR_TBL_STATUS[icon] for icon in icon]
 
 
 def _transform_eval(
@@ -17677,9 +18610,9 @@ def _transform_test_units(
             return _format_single_number_with_gt(
                 value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
             )
-
-
-
+        formatted = vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)
+        assert isinstance(formatted, list)
+        return formatted[0]
 
     return [
         (
@@ -17883,22 +18816,21 @@ def _transform_assertion_str(
     return type_upd
 
 
-def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str]:
+def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str] | None:
     if isinstance(pre, Callable):
         return _get_callable_source(fn=pre)
+    return None
 
 
 def _get_callable_source(fn: Callable) -> str:
-
-
-
-
-
-
-
-
-        return fn.__name__
-    return fn  # pragma: no cover
+    try:
+        source_lines, _ = inspect.getsourcelines(fn)
+        source = "".join(source_lines).strip()
+        # Extract the `pre` argument from the source code
+        pre_arg = _extract_pre_argument(source)
+        return pre_arg
+    except (OSError, TypeError):  # pragma: no cover
+        return fn.__name__  # ty: ignore
 
 
 def _extract_pre_argument(source: str) -> str:
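Note: the rewritten `_get_callable_source()` recovers the source of a `pre=` callable with `inspect.getsourcelines()` and falls back to `__name__` when no Python source exists. A short illustration of the failure mode that the `except (OSError, TypeError)` branch covers:

    import inspect

    def shift_dates(tbl):
        return tbl  # stand-in preprocessing callable

    lines, _ = inspect.getsourcelines(shift_dates)
    assert lines[0].startswith("def shift_dates")

    try:
        inspect.getsourcelines(len)  # builtins have no Python source
    except TypeError:
        print("fell back to", len.__name__)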
@@ -17924,6 +18856,7 @@ def _create_table_time_html(
     if time_start is None:
         return ""
 
+    assert time_end is not None  # typing
     # Get the time duration (difference between `time_end` and `time_start`) in seconds
     time_duration = (time_end - time_start).total_seconds()
 
@@ -18138,11 +19071,11 @@ def _format_number_safe(
             locale=locale,
             df_lib=df_lib,
         )
-
-
-
-
-
+    ints = fmt_number(
+        value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
+    )
+    assert isinstance(ints, list)
+    return ints[0]
 
 
 def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
@@ -18155,9 +19088,10 @@ def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
|
|
|
18155
19088
|
if df_lib is not None and value is not None:
|
|
18156
19089
|
# Use GT-based formatting to avoid Pandas dependency completely
|
|
18157
19090
|
return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
|
|
18158
|
-
|
|
18159
|
-
|
|
18160
|
-
|
|
19091
|
+
|
|
19092
|
+
ints = fmt_integer(value, locale=locale)
|
|
19093
|
+
assert isinstance(ints, list)
|
|
19094
|
+
return ints[0]
|
|
18161
19095
|
|
|
18162
19096
|
|
|
18163
19097
|
def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
|
|
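Note: both `_format_number_safe()` and `_format_integer_safe()` now fall back to the `great_tables.vals` formatters, which return a list of formatted strings even for a scalar input; the added `assert isinstance(..., list)` lines narrow the type before indexing. Assuming great_tables' default separator behavior:

    from great_tables.vals import fmt_integer, fmt_number

    # The vals formatters accept a scalar but always return a list of strings.
    assert fmt_integer(1234567, locale="en") == ["1,234,567"]
    assert fmt_number(0.5, decimals=2, locale="en") == ["0.50"]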
@@ -18273,7 +19207,7 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
         HTML string containing the formatted threshold information.
     """
     if thresholds == Thresholds():
-        return ""
+        return ""  # pragma: no cover
 
     # Get df_lib for formatting
     df_lib = None
@@ -18281,10 +19215,10 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
         import polars as pl
 
         df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
+        df_lib = pd  # pragma: no cover
 
     # Helper function to format threshold values using the shared formatting functions
     def _format_threshold_value(fraction: float | None, count: int | None) -> str:
@@ -18292,10 +19226,12 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
             # Format as fraction/percentage with locale formatting
             if fraction == 0:
                 return "0"
-            elif fraction < 0.01:
+            elif fraction < 0.01:  # pragma: no cover
                 # For very small fractions, show "<0.01" with locale formatting
-                formatted = _format_number_safe(
-                return f"<{formatted}"
+                formatted = _format_number_safe(
+                    0.01, decimals=2, locale=locale, df_lib=df_lib
+                )  # pragma: no cover
+                return f"<{formatted}"  # pragma: no cover
             else:
                 # Use shared formatting function with drop_trailing_zeros
                 formatted = _format_number_safe(
@@ -18372,14 +19308,14 @@ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
         if fraction is not None:
             if fraction == 0:
                 return "0"
-            elif fraction < 0.01:
-                return "<0.01"
+            elif fraction < 0.01:  # pragma: no cover
+                return "<0.01"  # pragma: no cover
             else:
                 return f"{fraction:.2f}".rstrip("0").rstrip(".")
         elif count is not None:
             return str(count)
         else:
-            return "—"
+            return "—"  # pragma: no cover
 
     parts = []
 
@@ -18398,7 +19334,7 @@ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
     if parts:
         return "Step-specific thresholds set: " + ", ".join(parts)
     else:
-        return ""
+        return ""  # pragma: no cover
 
 
 def _create_threshold_reset_note_html(locale: str = "en") -> str:
@@ -18433,79 +19369,678 @@ def _create_threshold_reset_note_text() -> str:
     return "Global thresholds explicitly not used for this step."
 
 
-def _step_report_row_based(
-    assertion_type: str,
-    i: int,
-    column: str,
-    column_position: int,
-    columns_subset: list[str] | None,
-    values: any,
-    inclusive: tuple[bool, bool] | None,
-    n: int,
-    n_failed: int,
-    all_passed: bool,
-    extract: any,
-    tbl_preview: GT,
-    header: str,
-    limit: int | None,
-    lang: str,
-) -> GT:
-    # Get the length of the extracted data for the step
-    extract_length = get_row_count(extract)
-
-    # Determine whether the `lang` value represents a right-to-left language
-    is_rtl_lang = lang in RTL_LANGUAGES
-    direction_rtl = " direction: rtl;" if is_rtl_lang else ""
+def _create_no_columns_resolved_note_html(
+    column_expr: str, available_columns: list[str], locale: str = "en"
+) -> str:
+    """
+    Create an HTML note explaining that a column expression resolved to no columns.
 
-    # Generate text that indicates the assertion for the validation step
-    if assertion_type == "col_vals_gt":
-        text = f"{column} > {values}"
-    elif assertion_type == "col_vals_lt":
-        text = f"{column} < {values}"
-    elif assertion_type == "col_vals_eq":
-        text = f"{column} = {values}"
-    elif assertion_type == "col_vals_ne":
-        text = f"{column} ≠ {values}"
-    elif assertion_type == "col_vals_ge":
-        text = f"{column} ≥ {values}"
-    elif assertion_type == "col_vals_le":
-        text = f"{column} ≤ {values}"
-    elif assertion_type == "col_vals_between":
-        symbol_left = "≤" if inclusive[0] else "<"
-        symbol_right = "≤" if inclusive[1] else "<"
-        text = f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}"
-    elif assertion_type == "col_vals_outside":
-        symbol_left = "<" if inclusive[0] else "≤"
-        symbol_right = ">" if inclusive[1] else "≥"
-        text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}"
-    elif assertion_type == "col_vals_in_set":
-        elements = ", ".join(map(str, values))
-        text = f"{column} ∈ {{{elements}}}"
-    elif assertion_type == "col_vals_not_in_set":
-        elements = ", ".join(values)
-        text = f"{column} ∉ {{{elements}}}"
-    elif assertion_type == "col_vals_regex":
-        pattern = values["pattern"]
-        text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=pattern)
-    elif assertion_type == "col_vals_null":
-        text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
-    elif assertion_type == "col_vals_not_null":
-        text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
-    elif assertion_type == "col_vals_expr":
-        text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
-    elif assertion_type == "rows_complete":
-        if column is None:
-            text = STEP_REPORT_TEXT["rows_complete_all"][lang]
-        else:
-            text = STEP_REPORT_TEXT["rows_complete_subset"][lang]
+    Parameters
+    ----------
+    column_expr
+        The column expression that failed to resolve columns (as a string).
+    available_columns
+        List of available column names in the table.
+    locale
+        The locale string (e.g., 'en', 'fr').
 
-    # Wrap assertion text in a <code> tag
-    text = (
-        f"<code style='color: #303030; font-family: monospace; font-size: smaller;'>{text}</code>"
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    # Get translated strings
+    intro = NOTES_TEXT.get("column_not_found_intro", {}).get(
+        locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression")
+    )
+    no_resolve = NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
+        locale,
+        NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
+            "en", "does not resolve to any columns"
+        ),
     )
 
-    if all_passed:
-        # Style the target column in green and add borders but only if that column is present
+    # Format the column expression with monospace font
+    col_expr_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_expr}</code>"
+
+    # Build the HTML note
+    html = f"{intro} {col_expr_html} {no_resolve}."
+
+    return html
+
+
+def _create_no_columns_resolved_note_text(column_expr: str, available_columns: list[str]) -> str:
+    """
+    Create a plain text note explaining that a column expression resolved to no columns.
+
+    Parameters
+    ----------
+    column_expr
+        The column expression that failed to resolve columns (as a string).
+    available_columns
+        List of available column names in the table.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    return f"The column expression `{column_expr}` does not resolve to any columns."
+
+
+def _create_column_not_found_note_html(
+    column_name: str, available_columns: list[str], locale: str = "en"
+) -> str:
+    """
+    Create an HTML note explaining that a specific column was not found.
+
+    Parameters
+    ----------
+    column_name
+        The column name that was not found.
+    available_columns
+        List of available column names in the table.
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    # Get translated strings
+    intro = NOTES_TEXT.get("target_column_provided", {}).get(
+        locale, NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided")
+    )
+    not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
+        locale,
+        NOTES_TEXT.get("does_not_match_any_columns", {}).get(
+            "en", "does not match any columns in the table"
+        ),
+    )
+
+    # Format the column name with monospace font
+    col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
+
+    # Build the HTML note
+    html = f"{intro} ({col_name_html}) {not_found}."
+
+    return html
+
+
+def _create_column_not_found_note_text(column_name: str, available_columns: list[str]) -> str:
+    """
+    Create a plain text note explaining that a specific column was not found.
+
+    Parameters
+    ----------
+    column_name
+        The column name that was not found.
+    available_columns
+        List of available column names in the table.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    return f"The target column provided ({column_name}) does not match any columns in the table."
+
+
+def _create_comparison_column_not_found_note_html(
+    column_name: str, position: str | None, available_columns: list[str], locale: str = "en"
+) -> str:
+    """
+    Create an HTML note explaining that a comparison column was not found.
+
+    Parameters
+    ----------
+    column_name
+        The comparison column name that was not found.
+    position
+        Optional position indicator ("left", "right") for between/outside validations.
+    available_columns
+        List of available column names in the table.
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    # Get translated strings
+    intro = NOTES_TEXT.get("comparison_column_provided", {}).get(
+        locale,
+        NOTES_TEXT.get("comparison_column_provided", {}).get(
+            "en", "The comparison column provided"
+        ),
+    )
+    intro_with_for = NOTES_TEXT.get("comparison_column_for", {}).get(
+        locale,
+        NOTES_TEXT.get("comparison_column_for", {}).get("en", "The comparison column provided for"),
+    )
+    not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
+        locale,
+        NOTES_TEXT.get("does_not_match_any_columns", {}).get(
+            "en", "does not match any columns in the table"
+        ),
+    )
+
+    # Format the column name with monospace font
+    col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
+
+    # Add position if provided (for between/outside validations)
+    if position:
+        # Format position parameter with monospace font (e.g., "left=", "right=")
+        position_param = (
+            f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{position}=</code>"
+        )
+        # Use the "for" version of the intro text
+        html = f"{intro_with_for} {position_param} ({col_name_html}) {not_found}."
+    else:
+        # Use the standard intro text without "for"
+        html = f"{intro} ({col_name_html}) {not_found}."
+
+    return html
+
+
+def _create_comparison_column_not_found_note_text(
+    column_name: str, position: str | None, available_columns: list[str]
+) -> str:
+    """
+    Create a plain text note explaining that a comparison column was not found.
+
+    Parameters
+    ----------
+    column_name
+        The comparison column name that was not found.
+    position
+        Optional position indicator ("left", "right") for between/outside validations.
+    available_columns
+        List of available column names in the table.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    if position:
+        position_text = f" for {position}="
+    else:
+        position_text = ""
+
+    return (
+        f"The comparison column provided{position_text} ({column_name}) "
+        f"does not match any columns in the table."
+    )
+
+
+def _create_preprocessing_note_html(
+    original_rows: int,
+    original_cols: int,
+    processed_rows: int,
+    processed_cols: int,
+    locale: str = "en",
+) -> str:
+    """
+    Create an HTML note showing table dimension changes from preprocessing.
+
+    Parameters
+    ----------
+    original_rows
+        Number of rows in the original table.
+    original_cols
+        Number of columns in the original table.
+    processed_rows
+        Number of rows after preprocessing.
+    processed_cols
+        Number of columns after preprocessing.
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    # Get translated strings
+    precondition_text = NOTES_TEXT.get("precondition_applied", {}).get(
+        locale, NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied")
+    )
+    table_dims_text = NOTES_TEXT.get("table_dimensions", {}).get(
+        locale, NOTES_TEXT.get("table_dimensions", {}).get("en", "table dimensions")
+    )
+
+    # Helper function to get singular or plural form
+    def get_row_text(count: int) -> str:
+        if count == 1:
+            return NOTES_TEXT.get("row", {}).get(locale, NOTES_TEXT.get("row", {}).get("en", "row"))
+        return NOTES_TEXT.get("rows", {}).get(locale, NOTES_TEXT.get("rows", {}).get("en", "rows"))
+
+    def get_col_text(count: int) -> str:
+        if count == 1:
+            return NOTES_TEXT.get("column", {}).get(
+                locale, NOTES_TEXT.get("column", {}).get("en", "column")
+            )
+        return NOTES_TEXT.get("columns", {}).get(
+            locale, NOTES_TEXT.get("columns", {}).get("en", "columns")
+        )
+
+    # Determine which dimensions changed
+    rows_changed = original_rows != processed_rows
+    cols_changed = original_cols != processed_cols
+
+    # Format original dimensions
+    original_rows_text = get_row_text(original_rows)
+    original_cols_text = get_col_text(original_cols)
+    original_dim = (
+        f'<span style="font-family: monospace;">'
+        f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}]"
+        f"</span>"
+    )
+
+    # Format processed dimensions with bold for changed values
+    processed_rows_text = get_row_text(processed_rows)
+    processed_cols_text = get_col_text(processed_cols)
+
+    if rows_changed:
+        rows_display = f"<strong>{processed_rows:,}</strong> {processed_rows_text}"
+    else:
+        rows_display = f"{processed_rows:,} {processed_rows_text}"
+
+    if cols_changed:
+        cols_display = f"<strong>{processed_cols}</strong> {processed_cols_text}"
+    else:
+        cols_display = f"{processed_cols} {processed_cols_text}"
+
+    processed_dim = f'<span style="font-family: monospace;">[{rows_display}, {cols_display}]</span>'
+
+    # Build the HTML note
+    html = f"{precondition_text}: {table_dims_text} {original_dim} → {processed_dim}."
+
+    return html
+
+
+def _create_preprocessing_note_text(
+    original_rows: int,
+    original_cols: int,
+    processed_rows: int,
+    processed_cols: int,
+) -> str:
+    """
+    Create a plain text note showing table dimension changes from preprocessing.
+
+    Parameters
+    ----------
+    original_rows
+        Number of rows in the original table.
+    original_cols
+        Number of columns in the original table.
+    processed_rows
+        Number of rows after preprocessing.
+    processed_cols
+        Number of columns after preprocessing.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    # Get singular or plural forms
+    original_rows_text = "row" if original_rows == 1 else "rows"
+    original_cols_text = "column" if original_cols == 1 else "columns"
+    processed_rows_text = "row" if processed_rows == 1 else "rows"
+    processed_cols_text = "column" if processed_cols == 1 else "columns"
+
+    return (
+        f"Precondition applied: table dimensions "
+        f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}] → "
+        f"[{processed_rows:,} {processed_rows_text}, {processed_cols} {processed_cols_text}]."
+    )
+
+
+def _create_preprocessing_no_change_note_html(locale: str = "en") -> str:
+    """
+    Create an HTML note indicating preprocessing was applied with no dimension change.
+
+    Parameters
+    ----------
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    # Get translated string
+    note_text = NOTES_TEXT.get("precondition_applied_no_change", {}).get(
+        locale,
+        NOTES_TEXT.get("precondition_applied_no_change", {}).get(
+            "en", "Precondition applied: no table dimension change"
+        ),
+    )
+
+    return f"{note_text}."
+
+
+def _create_preprocessing_no_change_note_text() -> str:
+    """
+    Create a plain text note indicating preprocessing was applied with no dimension change.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    return "Precondition applied: no table dimension change."
+
+
+def _create_synthetic_target_column_note_html(column_name: str, locale: str = "en") -> str:
+    """
+    Create an HTML note indicating that the target column was created via preprocessing.
+
+    Parameters
+    ----------
+    column_name
+        The name of the synthetic target column.
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    # Get translated strings
+    synthetic_text = NOTES_TEXT.get("synthetic_target_column", {}).get(
+        locale, NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column")
+    )
+    created_via_text = NOTES_TEXT.get("created_via_preprocessing", {}).get(
+        locale,
+        NOTES_TEXT.get("created_via_preprocessing", {}).get("en", "created via preprocessing"),
+    )
+
+    # Format the column name with monospace font
+    col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
+
+    # Build the HTML note
+    html = f"{synthetic_text} {col_name_html} {created_via_text}."
+
+    return html
+
+
+def _create_synthetic_target_column_note_text(column_name: str) -> str:
+    """
+    Create a plain text note indicating that the target column was created via preprocessing.
+
+    Parameters
+    ----------
+    column_name
+        The name of the synthetic target column.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    return f"Synthetic target column ({column_name}) created via preprocessing."
+
+
+def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> str:
+    """
+    Create an HTML note with collapsible schema expectation and results.
+
+    This generates a disclosure-style note showing:
+    1. A summary of what failed (if anything)
+    2. The full step report table (collapsible)
+
+    Parameters
+    ----------
+    schema_info
+        The schema validation information dictionary from interrogation.
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note with collapsible schema details.
+    """
+    passed = schema_info["passed"]
+    expect_schema = schema_info["expect_schema"]
+    target_schema = schema_info["target_schema"]
+    params = schema_info["params"]
+    columns_dict = schema_info["columns"]
+    in_order = params["in_order"]
+
+    # Get translations for the locale
+    passed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_passed"].get(
+        locale, VALIDATION_REPORT_TEXT["note_schema_comparison_passed"]["en"]
+    )
+    failed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_failed"].get(
+        locale, VALIDATION_REPORT_TEXT["note_schema_comparison_failed"]["en"]
+    )
+    disclosure_text = VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"].get(
+        locale, VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"]["en"]
+    )
+    settings_title_text = VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"].get(
+        locale, VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"]
+    )
+
+    # Build summary message
+    if passed:
+        summary = f'<span style="color:#4CA64C;">✓</span> {passed_text}.'
+    else:
+        # Analyze what failed
+        failures = []
+
+        # Check column count mismatch
+        n_expect = len(expect_schema)
+        n_target = len(target_schema)
+        if n_expect != n_target:
+            count_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"].get(
+                locale, VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"]
+            )
+            failures.append(count_mismatch_text.format(n_expect=n_expect, n_target=n_target))
+
+        # Check for unmatched columns
+        unmatched_cols = [col for col, info in columns_dict.items() if not info["colname_matched"]]
+        if unmatched_cols:
+            unmatched_text = VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"].get(
+                locale, VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"]["en"]
+            )
+            failures.append(unmatched_text.format(n=len(unmatched_cols)))
+
+        # Check for wrong order (if in_order=True)
+        if params["in_order"]:
+            wrong_order = [
+                col
+                for col, info in columns_dict.items()
+                if info["colname_matched"] and not info["index_matched"]
+            ]
+            if wrong_order:
+                wrong_order_text = VALIDATION_REPORT_TEXT["note_schema_wrong_order"].get(
+                    locale, VALIDATION_REPORT_TEXT["note_schema_wrong_order"]["en"]
+                )
+                failures.append(wrong_order_text.format(n=len(wrong_order)))
+
+        # Check for dtype mismatches
+        dtype_mismatches = [
+            col
+            for col, info in columns_dict.items()
+            if info["colname_matched"] and info["dtype_present"] and not info["dtype_matched"]
+        ]
+        if dtype_mismatches:
+            dtype_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"].get(
+                locale, VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"]["en"]
+            )
+            failures.append(dtype_mismatch_text.format(n=len(dtype_mismatches)))
+
+        if failures:
+            summary = (
+                f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
+            )
+        else:
+            summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.'  # pragma: no cover
+
+    # Generate the step report table using the existing function
+    # We'll call either _step_report_schema_in_order or _step_report_schema_any_order
+    # depending on the in_order parameter
+    if in_order:  # pragma: no cover
+        step_report_gt = _step_report_schema_in_order(  # pragma: no cover
+            step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
+        )
+    else:
+        step_report_gt = _step_report_schema_any_order(
+            step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
+        )
+
+    # Generate the settings HTML using the existing function
+    settings_html = _create_col_schema_match_params_html(
+        lang=locale,
+        complete=params["complete"],
+        in_order=params["in_order"],
+        case_sensitive_colnames=params["case_sensitive_colnames"],
+        case_sensitive_dtypes=params["case_sensitive_dtypes"],
+        full_match_dtypes=params["full_match_dtypes"],
+    )
+
+    # Remove the inner div containing column_schema_match_str
+    settings_html = re.sub(r'<div style="margin-right: 5px;">.*?</div>', "", settings_html, count=1)
+
+    # Change padding-top from 7px to 2px
+    settings_html = settings_html.replace("padding-top: 7px;", "padding-top: 2px;")
+
+    # Create new source note HTML that includes both settings and schema
+    source_note_html = f"""
+    <div style='padding-bottom: 2px;'>{settings_title_text}</div>
+    <div style='padding-bottom: 4px;'>{settings_html}</div>
+    """
+
+    # Add the settings as an additional source note to the step report
+    step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html))  # type: ignore[union-attr]
+
+    # Extract the HTML from the GT object
+    step_report_html = step_report_gt._repr_html_()
+
+    # Create collapsible section with the step report
+    note_html = f"""
+    {summary}
+
+    <details style="margin-top: 2px; margin-bottom: 8px; font-size: 12px; text-indent: 12px;">
+        <summary style="cursor: pointer; font-weight: bold; color: #555; margin-bottom: -5px;">{disclosure_text}</summary>
+        <div style="margin-top: 6px; padding-left: 15px; padding-right: 15px;">
+
+    {step_report_html}
+
+        </div>
+    </details>
+    """
+
+    return note_html.strip()
+
+
+def _create_col_schema_match_note_text(schema_info: dict) -> str:
+    """
+    Create a plain text note for schema validation.
+
+    Parameters
+    ----------
+    schema_info
+        The schema validation information dictionary from interrogation.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    passed = schema_info["passed"]
+    expect_schema = schema_info["expect_schema"]
+    target_schema = schema_info["target_schema"]
+
+    if passed:
+        return f"Schema validation passed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
+    else:
+        return f"Schema validation failed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
+
+
+def _step_report_row_based(
+    assertion_type: str,
+    i: int,
+    column: str,
+    column_position: int,
+    columns_subset: list[str] | None,
+    values: Any,
+    inclusive: tuple[bool, bool] | None,
+    n: int,
+    n_failed: int,
+    all_passed: bool,
+    extract: Any,
+    tbl_preview: GT,
+    header: str,
+    limit: int | None,
+    lang: str,
+) -> GT:
+    # Get the length of the extracted data for the step
+    extract_length = get_row_count(extract)
+
+    # Determine whether the `lang` value represents a right-to-left language
+    is_rtl_lang = lang in RTL_LANGUAGES
+    direction_rtl = " direction: rtl;" if is_rtl_lang else ""
+
+    # Generate text that indicates the assertion for the validation step
+    if assertion_type == "col_vals_gt":
+        text = f"{column} > {values}"
+    elif assertion_type == "col_vals_lt":
+        text = f"{column} < {values}"
+    elif assertion_type == "col_vals_eq":
+        text = f"{column} = {values}"
+    elif assertion_type == "col_vals_ne":
+        text = f"{column} ≠ {values}"
+    elif assertion_type == "col_vals_ge":
+        text = f"{column} ≥ {values}"
+    elif assertion_type == "col_vals_le":
+        text = f"{column} ≤ {values}"
+    elif assertion_type == "col_vals_between":
+        assert inclusive is not None
+        symbol_left = "≤" if inclusive[0] else "<"
+        symbol_right = "≤" if inclusive[1] else "<"
+        text = f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}"
+    elif assertion_type == "col_vals_outside":
+        assert inclusive is not None
+        symbol_left = "<" if inclusive[0] else "≤"
+        symbol_right = ">" if inclusive[1] else "≥"
+        text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}"
+    elif assertion_type == "col_vals_in_set":
+        elements = ", ".join(map(str, values))
+        text = f"{column} ∈ {{{elements}}}"
+    elif assertion_type == "col_vals_not_in_set":
+        elements = ", ".join(values)
+        text = f"{column} ∉ {{{elements}}}"
+    elif assertion_type == "col_vals_regex":
+        pattern = values["pattern"]
+        text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=pattern)
+    elif assertion_type == "col_vals_null":
+        text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
+    elif assertion_type == "col_vals_not_null":
+        text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
+    elif assertion_type == "col_vals_expr":
+        text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
+    elif assertion_type == "rows_complete":
+        if column is None:
+            text = STEP_REPORT_TEXT["rows_complete_all"][lang]
+        else:
+            text = STEP_REPORT_TEXT["rows_complete_subset"][lang]
+
+    # Wrap assertion text in a <code> tag
+    text = (
+        f"<code style='color: #303030; font-family: monospace; font-size: smaller;'>{text}</code>"
+    )
+
+    if all_passed:
+        # Style the target column in green and add borders but only if that column is present
         # in the `tbl_preview` (i.e., it may not be present if `columns_subset=` didn't include it)
         preview_tbl_columns = tbl_preview._boxhead._get_columns()
         preview_tbl_has_target_column = column in preview_tbl_columns
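Note: the new `_create_*_note_html()` / `_create_*_note_text()` pairs above all follow one pattern: look up a translated fragment in `NOTES_TEXT` with a double `.get()` so an unknown locale falls back to English, then interpolate the specifics. A sketch of that fallback idiom, with a made-up table standing in for `NOTES_TEXT`:

    # Hypothetical two-level translation table; the double .get() falls back to "en".
    NOTES = {"precondition_applied": {"en": "Precondition applied", "fr": "Précondition appliquée"}}

    def lookup(key: str, locale: str) -> str:
        return NOTES.get(key, {}).get(locale, NOTES.get(key, {}).get("en", key))

    assert lookup("precondition_applied", "fr") == "Précondition appliquée"
    assert lookup("precondition_applied", "xx") == "Precondition applied"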
@@ -18695,7 +20230,7 @@ def _step_report_rows_distinct(
     n: int,
     n_failed: int,
     all_passed: bool,
-    extract:
+    extract: Any,
     tbl_preview: GT,
     header: str,
     limit: int | None,
@@ -18822,8 +20357,8 @@ def _step_report_rows_distinct(
 
 
 def _step_report_schema_in_order(
-    step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
-) -> GT |
+    step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
+) -> GT | Any:
     """
     This is the case for schema validation where the schema is supposed to have the same column
     order as the target table.
@@ -18880,16 +20415,33 @@ def _step_report_schema_in_order(
     dtype_exp = []
     dtype_exp_correct = []
 
-    for i in range(len(
+    for i in range(len(expect_schema)):
         #
         # `col_name_exp` values
         #
 
-        #
-
-        col_name_exp.append(
+        # Get the column name from expect_schema (which can have duplicates)
+        column_name_exp_i = expect_schema[i][0]
+        col_name_exp.append(column_name_exp_i)
 
-
+        # Check if this column exists in exp_columns_dict (it might not if it's a duplicate)
+        # For duplicates, we need to handle them specially
+        if column_name_exp_i not in exp_columns_dict:  # pragma: no cover
+            # This is a duplicate or invalid column, mark it as incorrect
+            col_exp_correct.append(CROSS_MARK_SPAN)  # pragma: no cover
+
+            # For dtype, check if there's a dtype specified in the schema
+            if len(expect_schema[i]) > 1:  # pragma: no cover
+                dtype_value = expect_schema[i][1]  # pragma: no cover
+                if isinstance(dtype_value, list):  # pragma: no cover
+                    dtype_exp.append(" | ".join(dtype_value))  # pragma: no cover
+                else:  # pragma: no cover
+                    dtype_exp.append(str(dtype_value))  # pragma: no cover
+            else:  # pragma: no cover
+                dtype_exp.append("—")  # pragma: no cover
+
+            dtype_exp_correct.append("—")  # pragma: no cover
+            continue  # pragma: no cover
 
         #
         # `col_exp_correct` values
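Note: the loop above now iterates over `expect_schema` itself rather than the name-keyed `exp_columns_dict`, so an expected schema that lists the same column twice still produces one report row per declared entry, with the duplicate flagged. A minimal illustration of why the dict alone cannot do this:

    expect_schema = [("a", "Int64"), ("a", "Int64"), ("b", "String")]
    # A name-keyed dict collapses duplicates, which is why the loop walks
    # expect_schema directly to emit one row per declared entry.
    exp_columns_dict = {name: {"colname_matched": True} for name, _ in expect_schema}
    assert len(expect_schema) == 3 and len(exp_columns_dict) == 2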
@@ -19112,7 +20664,9 @@ def _step_report_schema_in_order(
     # Add a border below the row that terminates the target table schema
     step_report = step_report.tab_style(
         style=style.borders(sides="bottom", color="#6699CC80", style="solid", weight="1px"),
-        locations=loc.body(
+        locations=loc.body(
+            rows=len(colnames_tgt) - 1  # ty: ignore (bug in GT, should allow an int)
+        ),
     )
 
     # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing
@@ -19161,8 +20715,8 @@ def _step_report_schema_in_order(
 
 
 def _step_report_schema_any_order(
-    step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
-) -> GT |
+    step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
+) -> GT | pl.DataFrame:
     """
     This is the case for schema validation where the schema is permitted to not have to be in the
     same column order as the target table.
@@ -19581,9 +21135,7 @@ def _step_report_schema_any_order(
         header = header.format(title=title, details=details)
 
     # Create the header with `header` string
-    step_report = step_report.tab_header(title=md(header))
-
-    return step_report
+    return step_report.tab_header(title=md(header))
 
 
 def _create_label_text_html(
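Note: the `_generate_agg_docstring()` helper in the next hunk documents validation methods (e.g., `col_sum_eq()`, `col_avg_gt()`, `col_sd_le()`) that 0.18.0 appears to generate from a grid of aggregations (`sum`, `avg`, `sd`) and comparisons (`eq`, `gt`, `ge`, `lt`, `le`); compare the `load_validation_method_grid` import added at the top of the module. The name parsing it relies on is simply:

    # Generated-method names encode aggregation and comparison: col_{agg}_{comp}
    name = "col_avg_gt"
    _, agg_type, comp_type = name.split("_")[:3]
    assert (agg_type, comp_type) == ("avg", "gt")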
@@ -19672,3 +21224,321 @@ def _create_col_schema_match_params_html(
|
|
|
19672
21224
|
f"{full_match_dtypes_text}"
|
|
19673
21225
|
"</div>"
|
|
19674
21226
|
)
|
|
21227
|
+
|
|
21228
|
+
|
|
21229
|
+
def _generate_agg_docstring(name: str) -> str:
|
|
21230
|
+
"""Generate a comprehensive docstring for an aggregation validation method.
|
|
21231
|
+
|
|
21232
|
+
This function creates detailed documentation for dynamically generated methods like
|
|
21233
|
+
`col_sum_eq()`, `col_avg_gt()`, `col_sd_le()`, etc. The docstrings follow the same
|
|
21234
|
+
structure and quality as manually written validation methods like `col_vals_gt()`.
|
|
21235
|
+
|
|
21236
|
+
Parameters
|
|
21237
|
+
----------
|
|
21238
|
+
name
|
|
21239
|
+
The method name (e.g., "col_sum_eq", "col_avg_gt", "col_sd_le").
|
|
21240
|
+
|
|
21241
|
+
Returns
|
|
21242
|
+
-------
|
|
21243
|
+
str
|
|
21244
|
+
A complete docstring for the method.
|
|
21245
|
+
"""
|
|
21246
|
+
# Parse the method name to extract aggregation type and comparison operator
|
|
21247
|
+
# Format: col_{agg}_{comp} (e.g., col_sum_eq, col_avg_gt, col_sd_le)
|
|
21248
|
+
parts = name.split("_")
|
|
21249
|
+
agg_type = parts[1] # sum, avg, sd
|
|
21250
|
+
comp_type = parts[2] # eq, gt, ge, lt, le
|
|
21251
|
+
|
|
21252
|
+
# Human-readable names for aggregation types
|
|
21253
|
+
agg_names = {
|
|
21254
|
+
"sum": ("sum", "summed"),
|
|
21255
|
+
"avg": ("average", "averaged"),
|
|
21256
|
+
"sd": ("standard deviation", "computed for standard deviation"),
|
|
21257
|
+
}
|
|
21258
|
+
|
|
21259
|
+
# Human-readable descriptions for comparison operators (with article for title)
|
|
21260
|
+
comp_descriptions = {
|
|
21261
|
+
"eq": ("equal to", "equals", "an"),
|
|
21262
|
+
"gt": ("greater than", "is greater than", "a"),
|
|
21263
|
+
"ge": ("greater than or equal to", "is at least", "a"),
|
|
21264
|
+
"lt": ("less than", "is less than", "a"),
|
|
21265
|
+
"le": ("less than or equal to", "is at most", "a"),
|
|
21266
|
+
}
|
|
21267
|
+
|
|
21268
|
+
# Mathematical symbols for comparison operators
|
|
21269
|
+
comp_symbols = {
|
|
21270
|
+
"eq": "==",
|
|
21271
|
+
"gt": ">",
|
|
21272
|
+
"ge": ">=",
|
|
21273
|
+
"lt": "<",
|
|
21274
|
+
"le": "<=",
|
|
21275
|
+
}
|
|
21276
|
+
|
|
21277
|
+
agg_name, agg_verb = agg_names[agg_type]
|
|
21278
|
+
comp_desc, comp_phrase, comp_article = comp_descriptions[comp_type]
|
|
21279
|
+
comp_symbol = comp_symbols[comp_type]
|
|
21280
|
+
|
|
21281
|
+
# Determine the appropriate example values based on the aggregation and comparison
|
|
21282
|
+
if agg_type == "sum":
|
|
21283
|
+
example_value = "15"
|
|
21284
|
+
example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
|
|
21285
|
+
example_sum = "15" # sum of a
|
|
21286
|
+
example_ref_sum = "10" # sum of b
|
|
21287
|
+
elif agg_type == "avg":
|
|
21288
|
+
example_value = "3"
|
|
21289
|
+
example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
|
|
21290
|
+
example_sum = "3.0" # avg of a
|
|
21291
|
+
example_ref_sum = "2.0" # avg of b
|
|
21292
|
+
else: # sd
|
|
21293
|
+
example_value = "2"
|
|
21294
|
+
example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
|
|
21295
|
+
example_sum = "~1.58" # sd of a
|
|
21296
|
+
example_ref_sum = "0.0" # sd of b
|
|
21297
|
+
|
|
21298
|
+
# Build appropriate tolerance explanation based on comparison type
|
|
21299
|
+
if comp_type == "eq":
|
|
21300
|
+
tol_explanation = f"""The `tol=` parameter is particularly useful with `{name}()` since exact equality
|
|
21301
|
+
comparisons on floating-point aggregations can be problematic due to numerical precision.
|
|
21302
|
+
Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
|
|
21303
|
+
floating-point arithmetic."""
|
|
21304
|
+
else:
|
|
21305
|
+
tol_explanation = f"""The `tol=` parameter expands the acceptable range for the comparison. For
|
|
21306
|
+
`{name}()`, a tolerance of `tol=0.5` would mean the {agg_name} can be within `0.5` of the
|
|
21307
|
+
target value and still pass validation."""
|
|
21308
|
+
|
|
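The generated text describes tolerance as a widened range for equality and a shifted boundary for inequalities. The actual comparison logic lives in pointblank's interrogation internals; the following is only a sketch of the behavior documented in the template below:

```python
# Sketch (not pointblank's implementation) of the documented tolerance
# semantics for aggregation comparisons.
def passes(agg: float, value: float, comp: str, tol: float = 0.0) -> bool:
    if comp == "eq":  # valid within [value - tol, value + tol]
        return abs(agg - value) <= tol
    if comp == "gt":  # tolerance relaxes the boundary
        return agg > value - tol
    if comp == "ge":
        return agg >= value - tol
    if comp == "lt":
        return agg < value + tol
    return agg <= value + tol  # "le"


assert passes(15.0004, 15, "eq", tol=0.001)    # near-equality passes
assert not passes(15.01, 15, "eq", tol=0.001)  # outside the range fails
assert passes(14.6, 15, "ge", tol=0.5)         # boundary shifted down by tol
```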
+    docstring = f"""
+Does the column {agg_name} satisfy {comp_article} {comp_desc} comparison?
+
+The `{name}()` validation method checks whether the {agg_name} of values in a column
+{comp_phrase} a specified `value=`. This is an aggregation-based validation where the entire
+column is reduced to a single {agg_name} value that is then compared against the target. The
+comparison used in this function is `{agg_name}(column) {comp_symbol} value`.
+
+Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
+a single test unit. The validation either passes completely (if the aggregated value satisfies
+the comparison) or fails completely.
+
+Parameters
+----------
+columns
+    A single column or a list of columns to validate. If multiple columns are supplied,
+    there will be a separate validation step generated for each column. The columns must
+    contain numeric data for the {agg_name} to be computed.
+value
+    The value to compare the column {agg_name} against. This can be: (1) a numeric literal
+    (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
+    whose {agg_name} will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
+    referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
+    `None` to automatically compare against the same column in reference data (shorthand for
+    `ref(column_name)` when reference data is set).
+tol
+    A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
+    set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
+    a {agg_name} that differs from the target by up to `0.5` will still pass. {tol_explanation}
+thresholds
+    Failure threshold levels so that the validation step can react accordingly when
+    failing test units reach the set levels. Since this is an aggregation-based validation with
+    only one test unit, threshold values typically should be set as absolute counts (e.g., `1`)
+    to indicate pass/fail, or as proportions where any value less than `1.0` means failure is
+    acceptable.
+brief
+    An optional brief description of the validation step that will be displayed in the
+    reporting table. You can use templating elements like `"{{step}}"` to insert
+    the step number, or `"{{auto}}"` to include an automatically generated brief. If `True`,
+    the entire brief will be automatically generated. If `None` (the default) then there
+    won't be a brief.
+actions
+    Optional actions to take when the validation step meets or exceeds any set threshold
+    levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+    define the actions.
+active
+    A boolean value indicating whether the validation step should be active. Using `False`
+    will make the validation step inactive (still reporting its presence and keeping indexes
+    for the steps unchanged).
+
+Returns
+-------
+Validate
+    The `Validate` object with the added validation step.
+
+Using Reference Data
+--------------------
+The `{name}()` method supports comparing column aggregations against reference data. This
+is useful for validating that statistical properties remain consistent across different
+versions of a dataset, or for comparing current data against historical baselines.
+
+To use reference data, set the `reference=` parameter when creating the `Validate` object:
+
+```python
+validation = (
+    pb.Validate(data=current_data, reference=baseline_data)
+    .{name}(columns="revenue")  # Compares sum(current.revenue) vs sum(baseline.revenue)
+    .interrogate()
+)
+```
+
+When `value=None` and reference data is set, the method automatically compares against the
+same column in the reference data. You can also explicitly specify reference columns using
+the `ref()` helper:
+
+```python
+.{name}(columns="revenue", value=pb.ref("baseline_revenue"))
+```
+
+Understanding Tolerance
+-----------------------
+The `tol=` parameter allows for fuzzy comparisons, which is especially important for
+floating-point aggregations where exact equality is often unreliable.
+
+{tol_explanation}
+
+For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
+within which the aggregation is considered valid. For inequality comparisons, the tolerance
+shifts the comparison boundary.
+
+Thresholds
+----------
+The `thresholds=` parameter is used to set the failure-condition levels for the validation
+step. If they are set here at the step level, these thresholds will override any thresholds
+set at the global level in `Validate(thresholds=...)`.
+
+There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
+validations operate on a single test unit (the aggregated value), threshold values are
+typically set as absolute counts:
+
+- `thresholds=1` means any failure triggers a 'warning'
+- `thresholds=(1, 1, 1)` means any failure triggers all three levels
+
+Thresholds can be defined using one of these input schemes:
+
+1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+   thresholds)
+2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+   the 'error' level, and position `2` is the 'critical' level
+3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
+   'critical'
+4. a single integer/float value denoting absolute number or fraction of failing test units
+   for the 'warning' level only
+
+Examples
+--------
+```{{python}}
+#| echo: false
+#| output: false
+import pointblank as pb
+pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+```
+For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
+shown below:
+
+```{{python}}
+import pointblank as pb
+import polars as pl
+
+tbl = pl.DataFrame(
+    {{
+        "a": [1, 2, 3, 4, 5],
+        "b": [2, 2, 2, 2, 2],
+    }}
+)
+
+pb.preview(tbl)
+```
+
+Let's validate that the {agg_name} of column `a` {comp_phrase} `{example_value}`:
+
+```{{python}}
+validation = (
+    pb.Validate(data=tbl)
+    .{name}(columns="a", value={example_value})
+    .interrogate()
+)
+
+validation
+```
+
+The validation result shows whether the {agg_name} comparison passed or failed. Since this
+is an aggregation-based validation, there is exactly one test unit per column.
+
+When validating multiple columns, each column gets its own validation step:
+
+```{{python}}
+validation = (
+    pb.Validate(data=tbl)
+    .{name}(columns=["a", "b"], value={example_value})
+    .interrogate()
+)
+
+validation
+```
+
+Using tolerance for flexible comparisons:
+
+```{{python}}
+validation = (
+    pb.Validate(data=tbl)
+    .{name}(columns="a", value={example_value}, tol=1.0)
+    .interrogate()
+)
+
+validation
+```
+"""
+
+    return docstring.strip()
+
+
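For illustration (not part of the diff), the generator turns a bare method name into a complete numpydoc-style docstring:

```python
# Illustration: the generator produces a full docstring from just the name.
doc = _generate_agg_docstring("col_sum_eq")
print(doc.splitlines()[0])
# -> "Does the column sum satisfy an equal to comparison?"
```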
+def make_agg_validator(name: str):
+    """Factory for dynamically generated aggregate validation methods.
+
+    Why this exists:
+    Aggregate validators all share identical behavior. The only thing that differs
+    between them is the semantic assertion type (their name). The implementation
+    of each aggregate validator is fetched from `from_agg_validator`.
+
+    Instead of copy/pasting dozens of identical methods, we generate
+    them dynamically and attach them to the Validate class. The types are generated
+    at build time with `make pyi` so that the methods are visible to type checkers,
+    documentation builders, and IDEs/LSPs.
+
+    The returned function is a thin adapter that forwards all arguments to
+    `_add_agg_validation`, supplying the assertion type explicitly.
+    """
+
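Because these methods are attached at runtime, static tooling cannot discover them from the source alone; the factory docstring notes that the type stubs are generated at build time with `make pyi`. A hypothetical sketch of one generated stub entry, mirroring the `agg_validator` signature defined below; the real stub file is produced by `make pyi` and may differ in detail:

```python
# Hypothetical sketch of one generated .pyi stub entry (imports of
# Collection, Column, ReferenceColumn, Thresholds, and Actions omitted).
class Validate:
    def col_sum_eq(
        self,
        columns: str | Collection[str],
        value: float | int | Column | ReferenceColumn | None = None,
        tol: float = 0,
        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        brief: str | bool | None = None,
        actions: Actions | None = None,
        active: bool = True,
    ) -> Validate: ...
```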
+    def agg_validator(
+        self: Validate,
+        columns: str | Collection[str],
+        value: float | int | Column | ReferenceColumn | None = None,
+        tol: float = 0,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
+        brief: str | bool | None = None,
+        actions: Actions | None = None,
+        active: bool = True,
+    ) -> Validate:
+        # Dynamically generated aggregate validator.
+        # This method is generated per assertion type and forwards all arguments
+        # to the shared aggregate validation implementation.
+        return self._add_agg_validation(
+            assertion_type=name,
+            columns=columns,
+            value=value,
+            tol=tol,
+            thresholds=thresholds,
+            brief=brief,
+            actions=actions,
+            active=active,
+        )
+
+    # Manually set function identity so this behaves like a real method.
+    # These must be set before attaching the function to the class.
+    agg_validator.__name__ = name
+    agg_validator.__qualname__ = f"Validate.{name}"
+    agg_validator.__doc__ = _generate_agg_docstring(name)
+
+    return agg_validator
+
+
+# Finally, we grab all the valid aggregation method names and attach them to
+# the Validate class, registering each one appropriately.
+for method in load_validation_method_grid():  # -> `col_sum_*`, `col_mean_*`, etc.
+    setattr(Validate, method, make_agg_validator(method))