pointblank 0.17.0__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +334 -55
- pointblank/_constants_translations.py +378 -0
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +406 -149
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +40 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2695 -49
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +2034 -233
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +10 -6
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/METADATA +2 -2
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/RECORD +30 -28
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/WHEEL +1 -1
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -15,8 +15,9 @@ from enum import Enum
 from functools import partial
 from importlib.metadata import version
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Literal
+from typing import TYPE_CHECKING, Any, Callable, Literal, NoReturn, ParamSpec, TypeVar
 from zipfile import ZipFile
+from zoneinfo import ZoneInfo

 import commonmark
 import narwhals as nw
@@ -24,8 +25,8 @@ from great_tables import GT, from_column, google_font, html, loc, md, style, val
 from great_tables.gt import _get_column_of_values
 from great_tables.vals import fmt_integer, fmt_number
 from importlib_resources import files
-from narwhals.typing import FrameT

+from pointblank._agg import is_valid_agg, load_validation_method_grid, resolve_agg_registries
 from pointblank._constants import (
     ASSERTION_TYPE_METHOD_MAP,
     CHECK_MARK_SPAN,
@@ -92,6 +93,8 @@ from pointblank._utils import (
     _is_lib_present,
     _is_narwhals_table,
     _is_value_a_df,
+    _PBUnresolvedColumn,
+    _resolve_columns,
     _select_df_lib,
 )
 from pointblank._utils_check_args import (
@@ -102,7 +105,14 @@ from pointblank._utils_check_args import (
     _check_thresholds,
 )
 from pointblank._utils_html import _create_table_dims_html, _create_table_type_html
-from pointblank.column import
+from pointblank.column import (
+    Column,
+    ColumnLiteral,
+    ColumnSelector,
+    ColumnSelectorNarwhals,
+    ReferenceColumn,
+    col,
+)
 from pointblank.schema import Schema, _get_schema_validation_info
 from pointblank.segments import Segment
 from pointblank.thresholds import (
@@ -113,10 +123,18 @@ from pointblank.thresholds import (
     _normalize_thresholds_creation,
 )

+P = ParamSpec("P")
+R = TypeVar("R")
+
 if TYPE_CHECKING:
     from collections.abc import Collection
+    from typing import Any
+
+    import polars as pl
+    from narwhals.typing import IntoDataFrame, IntoFrame
+
+    from pointblank._typing import AbsoluteBounds, Tolerance, _CompliantValue, _CompliantValues

-    from pointblank._typing import AbsoluteBounds, Tolerance

 __all__ = [
     "Validate",
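The `P = ParamSpec("P")` / `R = TypeVar("R")` pair added here is the standard idiom for typing decorators that preserve a wrapped callable's exact signature. The diff does not show where pointblank applies them, so the following is only an illustrative sketch of the idiom itself:

```python
from functools import wraps
from typing import Callable, ParamSpec, TypeVar

P = ParamSpec("P")
R = TypeVar("R")


def timed(fn: Callable[P, R]) -> Callable[P, R]:
    # ParamSpec captures the wrapped function's full parameter list, and the
    # TypeVar carries its return type, so callers keep precise type checking.
    @wraps(fn)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        return fn(*args, **kwargs)

    return wrapper
```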
@@ -135,6 +153,7 @@ __all__ = [
     "get_validation_summary",
 ]

+
 # Create a thread-local storage for the metadata
 _action_context = threading.local()

@@ -424,12 +443,13 @@ def config(
    global_config.report_incl_footer_timings = report_incl_footer_timings  # pragma: no cover
    global_config.report_incl_footer_notes = report_incl_footer_notes  # pragma: no cover
    global_config.preview_incl_header = preview_incl_header  # pragma: no cover
+    return global_config  # pragma: no cover


 def load_dataset(
     dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
     tbl_type: Literal["polars", "pandas", "duckdb"] = "polars",
-) ->
+) -> Any:
     """
     Load a dataset hosted in the library as specified table type.

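With the added `return global_config`, a `config()` call now hands back the configuration object it mutates, so settings can be inspected after the fact. A sketch, assuming the returned object exposes the same attribute names as the assignments shown above:

```python
import pointblank as pb

# config() still mutates the module-level configuration; the return value
# simply makes the resulting state available to the caller.
cfg = pb.config(report_incl_header=False, report_incl_footer=False)
print(cfg.report_incl_header)  # False (assumed attribute name)
```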
@@ -450,7 +470,7 @@ def load_dataset(

     Returns
     -------
-
+    Any
         The dataset for the `Validate` object. This could be a Polars DataFrame, a Pandas DataFrame,
         or a DuckDB table as an Ibis table.

@@ -1523,7 +1543,7 @@ def get_data_path(
        return tmp_file.name


-def _process_data(data: FrameT | Any) -> FrameT | Any:
+def _process_data(data: Any) -> Any:
    """
    Centralized data processing pipeline that handles all supported input types.

@@ -1540,7 +1560,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:

    Parameters
    ----------
-    data
+    data
        The input data which could be:
        - a DataFrame object (Polars, Pandas, Ibis, etc.)
        - a GitHub URL pointing to a CSV or Parquet file
@@ -1551,7 +1571,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:

    Returns
    -------
-
+    Any
        Processed data as a DataFrame if input was a supported data source type,
        otherwise the original data unchanged.
    """
@@ -1570,7 +1590,7 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:
        return data


-def _process_github_url(data: FrameT | Any) -> FrameT | Any:
+def _process_github_url(data: Any) -> Any:
    """
    Process data parameter to handle GitHub URLs pointing to CSV or Parquet files.

@@ -1585,12 +1605,12 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:

    Parameters
    ----------
-    data
+    data
        The data parameter which may be a GitHub URL string or any other data type.

    Returns
    -------
-
+    Any
        If the input is a supported GitHub URL, returns a DataFrame loaded from the downloaded file.
        Otherwise, returns the original data unchanged.

@@ -1675,7 +1695,7 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:
    return data


-def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
+def _process_connection_string(data: Any) -> Any:
    """
    Process data parameter to handle database connection strings.

@@ -1702,7 +1722,7 @@ def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
        return connect_to_table(data)


-def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
+def _process_csv_input(data: Any) -> Any:
    """
    Process data parameter to handle CSV file inputs.

@@ -1760,7 +1780,7 @@ def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
        )


-def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
+def _process_parquet_input(data: Any) -> Any:
    """
    Process data parameter to handle Parquet file inputs.

@@ -1903,7 +1923,7 @@ def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:


 def preview(
-    data:
+    data: Any,
     columns_subset: str | list[str] | Column | None = None,
     n_head: int = 5,
     n_tail: int = 5,
@@ -1911,7 +1931,7 @@ def preview(
     show_row_numbers: bool = True,
     max_col_width: int = 250,
     min_tbl_width: int = 500,
-    incl_header: bool = None,
+    incl_header: bool | None = None,
 ) -> GT:
     """
     Display a table preview that shows some rows from the top, some from the bottom.
@@ -2169,7 +2189,7 @@ def preview(


 def _generate_display_table(
-    data:
+    data: Any,
     columns_subset: str | list[str] | Column | None = None,
     n_head: int = 5,
     n_tail: int = 5,
@@ -2177,7 +2197,7 @@ def _generate_display_table(
     show_row_numbers: bool = True,
     max_col_width: int = 250,
     min_tbl_width: int = 500,
-    incl_header: bool = None,
+    incl_header: bool | None = None,
     mark_missing_values: bool = True,
     row_number_list: list[int] | None = None,
 ) -> GT:
@@ -2274,7 +2294,8 @@ def _generate_display_table(
        tbl_schema = Schema(tbl=data)

        # Get the row count for the table
-        ibis_rows = data.count()
+        # Note: ibis tables have count(), to_polars(), to_pandas() methods
+        ibis_rows = data.count()  # type: ignore[union-attr]
        n_rows = ibis_rows.to_polars() if df_lib_name_gt == "polars" else int(ibis_rows.to_pandas())

        # If n_head + n_tail is greater than the row count, display the entire table
@@ -2283,11 +2304,11 @@ def _generate_display_table(
            data_subset = data

            if row_number_list is None:
-                row_number_list = range(1, n_rows + 1)
+                row_number_list = list(range(1, n_rows + 1))
        else:
            # Get the first n and last n rows of the table
-            data_head = data.head(n_head)
-            data_tail = data.filter(
+            data_head = data.head(n_head)  # type: ignore[union-attr]
+            data_tail = data.filter(  # type: ignore[union-attr]
                [ibis.row_number() >= (n_rows - n_tail), ibis.row_number() <= n_rows]
            )
            data_subset = data_head.union(data_tail)
@@ -2299,9 +2320,9 @@ def _generate_display_table(

        # Convert either to Polars or Pandas depending on the available library
        if df_lib_name_gt == "polars":
-            data = data_subset.to_polars()
+            data = data_subset.to_polars()  # type: ignore[union-attr]
        else:
-            data = data_subset.to_pandas()
+            data = data_subset.to_pandas()  # type: ignore[union-attr]

    # From a DataFrame:
    # - get the row count
@@ -2312,17 +2333,18 @@ def _generate_display_table(
    tbl_schema = Schema(tbl=data)

    if tbl_type == "polars":
-        n_rows = int(data.height)
+        # Note: polars DataFrames have height, head(), tail() attributes
+        n_rows = int(data.height)  # type: ignore[union-attr]

        # If n_head + n_tail is greater than the row count, display the entire table
        if n_head + n_tail >= n_rows:
            full_dataset = True

            if row_number_list is None:
-                row_number_list = range(1, n_rows + 1)
+                row_number_list = list(range(1, n_rows + 1))

        else:
-            data = pl.concat([data.head(n=n_head), data.tail(n=n_tail)])
+            data = pl.concat([data.head(n=n_head), data.tail(n=n_tail)])  # type: ignore[union-attr]

            if row_number_list is None:
                row_number_list = list(range(1, n_head + 1)) + list(
@@ -2330,40 +2352,42 @@ def _generate_display_table(
                )

    if tbl_type == "pandas":
-        n_rows = data.shape[0]
+        # Note: pandas DataFrames have shape, head(), tail() attributes
+        n_rows = data.shape[0]  # type: ignore[union-attr]

        # If n_head + n_tail is greater than the row count, display the entire table
        if n_head + n_tail >= n_rows:
            full_dataset = True
            data_subset = data

-            row_number_list = range(1, n_rows + 1)
+            row_number_list = list(range(1, n_rows + 1))
        else:
-            data = pd.concat([data.head(n=n_head), data.tail(n=n_tail)])
+            data = pd.concat([data.head(n=n_head), data.tail(n=n_tail)])  # type: ignore[union-attr]

            row_number_list = list(range(1, n_head + 1)) + list(
                range(n_rows - n_tail + 1, n_rows + 1)
            )

    if tbl_type == "pyspark":
-        n_rows = data.count()
+        # Note: pyspark DataFrames have count(), toPandas(), limit(), tail(), sparkSession
+        n_rows = data.count()  # type: ignore[union-attr]

        # If n_head + n_tail is greater than the row count, display the entire table
        if n_head + n_tail >= n_rows:
            full_dataset = True
            # Convert to pandas for Great Tables compatibility
-            data = data.toPandas()
+            data = data.toPandas()  # type: ignore[union-attr]

-            row_number_list = range(1, n_rows + 1)
+            row_number_list = list(range(1, n_rows + 1))
        else:
            # Get head and tail samples, then convert to pandas
-            head_data = data.limit(n_head).toPandas()
+            head_data = data.limit(n_head).toPandas()  # type: ignore[union-attr]

            # PySpark tail() returns a list of Row objects, need to convert to DataFrame
-            tail_rows = data.tail(n_tail)
+            tail_rows = data.tail(n_tail)  # type: ignore[union-attr]
            if tail_rows:
                # Convert list of Row objects back to DataFrame, then to pandas
-                tail_df = data.sparkSession.createDataFrame(tail_rows, data.schema)
+                tail_df = data.sparkSession.createDataFrame(tail_rows, data.schema)  # type: ignore[union-attr]
                tail_data = tail_df.toPandas()
            else:
                # If no tail data, create empty DataFrame with same schema
@@ -2391,14 +2415,14 @@ def _generate_display_table(
    tbl_schema = Schema(tbl=data)

    # From the table schema, get a list of tuples containing column names and data types
-
+    col_dtype_list = tbl_schema.columns or []

    # Extract the column names from the list of tuples (first element of each tuple)
-    col_names = [col[0] for col in
+    col_names = [col[0] for col in col_dtype_list]

    # Iterate over the list of tuples and create a new dictionary with the
    # column names and data types
-    col_dtype_dict = {k: v for k, v in
+    col_dtype_dict = {k: v for k, v in col_dtype_list}

    # Create short versions of the data types by omitting any text in parentheses
    col_dtype_dict_short = {
@@ -2497,21 +2521,21 @@ def _generate_display_table(
    # Prepend a column that contains the row numbers if `show_row_numbers=True`
    if show_row_numbers or has_leading_row_num_col:
        if has_leading_row_num_col:
-            row_number_list = data["_row_num_"].to_list()
+            row_number_list = data["_row_num_"].to_list()  # type: ignore[union-attr]

        else:
            if df_lib_name_gt == "polars":
                import polars as pl

                row_number_series = pl.Series("_row_num_", row_number_list)
-                data = data.insert_column(0, row_number_series)
+                data = data.insert_column(0, row_number_series)  # type: ignore[union-attr]

            if df_lib_name_gt == "pandas":
-                data.insert(0, "_row_num_", row_number_list)
+                data.insert(0, "_row_num_", row_number_list)  # type: ignore[union-attr]

            if df_lib_name_gt == "pyspark":
                # For PySpark converted to pandas, use pandas method
-                data.insert(0, "_row_num_", row_number_list)
+                data.insert(0, "_row_num_", row_number_list)  # type: ignore[union-attr]

    # Get the highest number in the `row_number_list` and calculate a width that will
    # safely fit a number of that magnitude
@@ -2620,7 +2644,7 @@ def _generate_display_table(
    return gt_tbl


-def missing_vals_tbl(data:
+def missing_vals_tbl(data: Any) -> GT:
    """
    Display a table that shows the missing values in the input table.

@@ -3221,7 +3245,7 @@ def _get_column_names_safe(data: Any) -> list[str]:
        return list(data.columns)  # pragma: no cover


-def _get_column_names(data:
+def _get_column_names(data: Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
    if ibis_tbl:
        return data.columns if df_lib_name_gt == "polars" else list(data.columns)

@@ -3245,12 +3269,10 @@ def _validate_columns_subset(
        )
        return columns_subset

-    return columns_subset.resolve(columns=col_names)
+    return columns_subset.resolve(columns=col_names)  # type: ignore[union-attr]


-def _select_columns(
-    data: FrameT | Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str
-) -> FrameT | Any:
+def _select_columns(data: Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str) -> Any:
    if ibis_tbl:
        return data[resolved_columns]
    if tbl_type == "polars":
@@ -3258,7 +3280,7 @@ def _select_columns(
        return data[resolved_columns]


-def get_column_count(data:
+def get_column_count(data: Any) -> int:
    """
    Get the number of columns in a table.

@@ -3470,7 +3492,7 @@ def _extract_enum_values(set_values: Any) -> list[Any]:
        return [set_values]


-def get_row_count(data:
+def get_row_count(data: Any) -> int:
    """
    Get the number of rows in a table.

@@ -3723,18 +3745,46 @@ class _ValidationInfo:
    insertion order, ensuring notes appear in a consistent sequence in reports and logs.
    """

+    @classmethod
+    def from_agg_validator(
+        cls,
+        assertion_type: str,
+        columns: _PBUnresolvedColumn,
+        value: float | Column | ReferenceColumn,
+        tol: Tolerance = 0,
+        thresholds: float | bool | tuple | dict | Thresholds | None = None,
+        brief: str | bool = False,
+        actions: Actions | None = None,
+        active: bool = True,
+    ) -> _ValidationInfo:
+        # This factory method creates a `_ValidationInfo` instance for aggregate
+        # methods. The reason this is created, is because all agg methods share the same
+        # signature so instead of instantiating the class directly each time, this method
+        # can be used to reduce redundancy, boilerplate and mistakes :)
+        _check_thresholds(thresholds=thresholds)
+
+        return cls(
+            assertion_type=assertion_type,
+            column=_resolve_columns(columns),
+            values={"value": value, "tol": tol},
+            thresholds=_normalize_thresholds_creation(thresholds),
+            brief=_transform_auto_brief(brief=brief),
+            actions=actions,
+            active=active,
+        )
+
    # Validation plan
    i: int | None = None
    i_o: int | None = None
    step_id: str | None = None
    sha1: str | None = None
    assertion_type: str | None = None
-    column:
-    values:
+    column: Any | None = None
+    values: Any | list[Any] | tuple | None = None
    inclusive: tuple[bool, bool] | None = None
    na_pass: bool | None = None
    pre: Callable | None = None
-    segments:
+    segments: Any | None = None
    thresholds: Thresholds | None = None
    actions: Actions | None = None
    label: str | None = None
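Because every aggregate validation method shares this signature, the factory collapses what would otherwise be repeated `_ValidationInfo(...)` construction into one call. A hypothetical direct invocation, using only names visible in this diff (in practice it is called via `Validate._add_agg_validation()`):

```python
from pointblank.validate import _ValidationInfo

# Build one aggregate validation step: "the sum of column 'revenue'
# should equal 1_000_000, within a tolerance of 0.01".
val_info = _ValidationInfo.from_agg_validator(
    assertion_type="col_sum_eq",  # aggregate assertion name
    columns="revenue",            # resolved internally via _resolve_columns()
    value=1_000_000,              # numeric target; Column/ReferenceColumn also allowed
    tol=0.01,
)
```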
@@ -3753,14 +3803,14 @@ class _ValidationInfo:
    error: bool | None = None
    critical: bool | None = None
    failure_text: str | None = None
-    tbl_checked:
-    extract:
-    val_info: dict[str,
+    tbl_checked: Any = None
+    extract: Any = None
+    val_info: dict[str, Any] | None = None
    time_processed: str | None = None
    proc_duration_s: float | None = None
    notes: dict[str, dict[str, str]] | None = None

-    def get_val_info(self) -> dict[str,
+    def get_val_info(self) -> dict[str, Any] | None:
        return self.val_info

    def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
@@ -3936,7 +3986,7 @@ class _ValidationInfo:
        return self.notes is not None and len(self.notes) > 0


-def _handle_connection_errors(e: Exception, connection_string: str) ->
+def _handle_connection_errors(e: Exception, connection_string: str) -> NoReturn:
    """
    Shared error handling for database connection failures.

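The `NoReturn` annotation (one of the new `typing` imports above) declares that this helper never returns normally: every code path must raise. A minimal illustration of the contract:

```python
from typing import NoReturn


def _always_raises(msg: str) -> NoReturn:
    # Type checkers treat any code after a call to this function as
    # unreachable, which is exactly what a shared error handler wants.
    raise ConnectionError(msg)
```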
@@ -4301,6 +4351,18 @@ class Validate:
        locale's rules. Examples include `"en-US"` for English (United States) and `"fr-FR"` for
        French (France). More simply, this can be a language identifier without a designation of
        territory, like `"es"` for Spanish.
+    owner
+        An optional string identifying the owner of the data being validated. This is useful for
+        governance purposes, indicating who is responsible for the quality and maintenance of the
+        data. For example, `"data-platform-team"` or `"analytics-engineering"`.
+    consumers
+        An optional string or list of strings identifying who depends on or consumes this data.
+        This helps document data dependencies and can be useful for impact analysis when data
+        quality issues are detected. For example, `"ml-team"` or `["ml-team", "analytics"]`.
+    version
+        An optional string representing the version of the validation plan or data contract. This
+        supports semantic versioning (e.g., `"1.0.0"`, `"2.1.0"`) and is useful for tracking changes
+        to validation rules over time and for organizational governance.

    Returns
    -------
@@ -4777,7 +4839,8 @@ class Validate:
        when table specifications are missing or backend dependencies are not installed.
    """

-    data:
+    data: IntoDataFrame
+    reference: IntoFrame | None = None
    tbl_name: str | None = None
    label: str | None = None
    thresholds: int | float | bool | tuple | dict | Thresholds | None = None
@@ -4786,11 +4849,18 @@ class Validate:
    brief: str | bool | None = None
    lang: str | None = None
    locale: str | None = None
+    owner: str | None = None
+    consumers: str | list[str] | None = None
+    version: str | None = None

    def __post_init__(self):
        # Process data through the centralized data processing pipeline
        self.data = _process_data(self.data)

+        # Process reference data if provided
+        if self.reference is not None:
+            self.reference = _process_data(self.reference)
+
        # Check input of the `thresholds=` argument
        _check_thresholds(thresholds=self.thresholds)

@@ -4826,6 +4896,36 @@ class Validate:
        # Transform any shorthands of `brief` to string representations
        self.brief = _transform_auto_brief(brief=self.brief)

+        # Validate and normalize the `owner` parameter
+        if self.owner is not None and not isinstance(self.owner, str):
+            raise TypeError(
+                "The `owner=` parameter must be a string representing the owner of the data. "
+                f"Received type: {type(self.owner).__name__}"
+            )
+
+        # Validate and normalize the `consumers` parameter
+        if self.consumers is not None:
+            if isinstance(self.consumers, str):
+                self.consumers = [self.consumers]
+            elif isinstance(self.consumers, list):
+                if not all(isinstance(c, str) for c in self.consumers):
+                    raise TypeError(
+                        "The `consumers=` parameter must be a string or a list of strings. "
+                        "All elements in the list must be strings."
+                    )
+            else:
+                raise TypeError(
+                    "The `consumers=` parameter must be a string or a list of strings. "
+                    f"Received type: {type(self.consumers).__name__}"
+                )
+
+        # Validate the `version` parameter
+        if self.version is not None and not isinstance(self.version, str):
+            raise TypeError(
+                "The `version=` parameter must be a string representing the version. "
+                f"Received type: {type(self.version).__name__}"
+            )
+
        # TODO: Add functionality to obtain the column names and types from the table
        self.col_names = None
        self.col_types = None
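Taken together, the three governance fields validate eagerly in `__post_init__`, so a bad value fails at construction time rather than at interrogation time. A usage sketch based on the types and error messages shown above:

```python
import pointblank as pb
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})

validation = pb.Validate(
    data=df,
    owner="data-platform-team",          # must be a str
    consumers=["ml-team", "analytics"],  # a bare str is normalized to [str]
    version="1.0.0",                     # must be a str
)

# Non-string input fails fast with a TypeError, e.g.:
# pb.Validate(data=df, owner=42)
```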
@@ -4835,9 +4935,107 @@ class Validate:

        self.validation_info = []

+    def _add_agg_validation(
+        self,
+        *,
+        assertion_type: str,
+        columns: str | Collection[str],
+        value,
+        tol=0,
+        thresholds=None,
+        brief=False,
+        actions=None,
+        active=True,
+    ):
+        """
+        Add an aggregation-based validation step to the validation plan.
+
+        This internal method is used by all aggregation-based column validation methods
+        (e.g., `col_sum_eq`, `col_avg_gt`, `col_sd_le`) to create and register validation
+        steps. It relies heavily on the `_ValidationInfo.from_agg_validator()` class method.
+
+        Automatic Reference Inference
+        -----------------------------
+        When `value` is None and reference data has been set on the Validate object,
+        this method automatically creates a `ReferenceColumn` pointing to the same
+        column name in the reference data. This enables a convenient shorthand:
+
+        .. code-block:: python
+
+            # Instead of writing:
+            Validate(data=df, reference=ref_df).col_sum_eq("a", ref("a"))
+
+            # You can simply write:
+            Validate(data=df, reference=ref_df).col_sum_eq("a")
+
+        If `value` is None and no reference data is set, a `ValueError` is raised
+        immediately to provide clear feedback to the user.
+
+        Parameters
+        ----------
+        assertion_type
+            The type of assertion (e.g., "col_sum_eq", "col_avg_gt").
+        columns
+            Column name or collection of column names to validate.
+        value
+            The target value to compare against. Can be:
+            - A numeric literal (int or float)
+            - A `Column` object for cross-column comparison
+            - A `ReferenceColumn` object for reference data comparison
+            - None to automatically use `ref(column)` when reference data is set
+        tol
+            Tolerance for the comparison. Defaults to 0.
+        thresholds
+            Custom thresholds for the validation step.
+        brief
+            Brief description or auto-generate flag.
+        actions
+            Actions to take based on validation results.
+        active
+            Whether this validation step is active.
+
+        Returns
+        -------
+        Validate
+            The Validate instance for method chaining.
+
+        Raises
+        ------
+        ValueError
+            If `value` is None and no reference data is set on the Validate object.
+        """
+        if isinstance(columns, str):
+            columns = [columns]
+        for column in columns:
+            # If value is None, default to referencing the same column from reference data
+            resolved_value = value
+            if value is None:
+                if self.reference is None:
+                    raise ValueError(
+                        f"The 'value' parameter is required for {assertion_type}() "
+                        "when no reference data is set. Either provide a value, or "
+                        "set reference data on the Validate object using "
+                        "Validate(data=..., reference=...)."
+                    )
+                resolved_value = ReferenceColumn(column_name=column)
+
+            val_info = _ValidationInfo.from_agg_validator(
+                assertion_type=assertion_type,
+                columns=column,
+                value=resolved_value,
+                tol=tol,
+                thresholds=self.thresholds if thresholds is None else thresholds,
+                actions=self.actions if actions is None else actions,
+                brief=self.brief if brief is None else brief,
+                active=active,
+            )
+            self._add_validation(validation_info=val_info)
+
+        return self
+
    def set_tbl(
        self,
-        tbl:
+        tbl: Any,
        tbl_name: str | None = None,
        label: str | None = None,
    ) -> Validate:
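Because `_add_agg_validation()` iterates over the resolved column list, passing several columns fans out into one validation step per column, and omitting `value` falls back to the same-named column in the `reference=` table. A sketch assuming `col_sum_eq()` delegates here, as the docstring indicates:

```python
import pointblank as pb
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
ref_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

validation = (
    pb.Validate(data=df, reference=ref_df)
    # Two steps are registered, one per column; with value omitted, each
    # compares against the same-named column in ref_df.
    .col_sum_eq(["a", "b"])
    .interrogate()
)
```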
@@ -4980,7 +5178,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -5214,7 +5412,6 @@ class Validate:
        - Row 1: `c` is `1` and `b` is `2`.
        - Row 3: `c` is `2` and `b` is `2`.
        """
-
        assertion_type = _get_fn_name()

        _check_column(column=columns)
@@ -5234,14 +5431,7 @@ class Validate:
            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
        )

-
-        # resolve the columns
-        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
-            columns = col(columns)
-
-        # If `columns` is Column value or a string, place it in a list for iteration
-        if isinstance(columns, (Column, str)):
-            columns = [columns]
+        columns = _resolve_columns(columns)

        # Determine brief to use (global or local) and transform any shorthands of `brief=`
        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
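The deleted inline logic shows what `_resolve_columns()` now centralizes for every validation method. A sketch of the equivalent behavior, reconstructed from the removed lines rather than from the helper's actual source:

```python
import narwhals as nw

from pointblank.column import Column, ColumnSelector, col


def _resolve_columns_sketch(columns):
    # Selector inputs are first wrapped into a column expression...
    if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
        columns = col(columns)
    # ...then a bare Column or string is boxed into a list for iteration.
    if isinstance(columns, (Column, str)):
        columns = [columns]
    return columns
```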
@@ -5272,7 +5462,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -5563,7 +5753,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -5854,7 +6044,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -6143,7 +6333,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -6435,7 +6625,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -6729,7 +6919,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -7049,7 +7239,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -7366,7 +7556,7 @@ class Validate:
        set: Collection[Any],
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -7683,7 +7873,7 @@ class Validate:
        set: Collection[Any],
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -7974,7 +8164,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -8162,7 +8352,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -8347,7 +8537,7 @@ class Validate:
        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -8590,7 +8780,7 @@ class Validate:
        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -8836,7 +9026,7 @@ class Validate:
        inverse: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -9099,7 +9289,7 @@ class Validate:
        na_pass: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -9379,10 +9569,10 @@ class Validate:

    def col_vals_expr(
        self,
-        expr:
+        expr: Any,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -9600,7 +9790,7 @@ class Validate:
    def col_exists(
        self,
        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -10072,7 +10262,7 @@ class Validate:
        columns_subset: str | list[str] | None = None,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -10313,7 +10503,7 @@ class Validate:
        columns_subset: str | list[str] | None = None,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -10558,7 +10748,7 @@ class Validate:
        max_concurrent: int = 3,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -10953,7 +11143,7 @@ class Validate:
        case_sensitive_dtypes: bool = True,
        full_match_dtypes: bool = True,
        pre: Callable | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -11169,11 +11359,11 @@ class Validate:

    def row_count_match(
        self,
-        count: int |
+        count: int | Any,
        tol: Tolerance = 0,
        inverse: bool = False,
        pre: Callable | None = None,
-        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
        actions: Actions | None = None,
        brief: str | bool | None = None,
        active: bool = True,
@@ -11386,12 +11576,375 @@ class Validate:
|
|
|
11386
11576
|
|
|
11387
11577
|
return self
|
|
11388
11578
|
|
|
11579
|
+
def data_freshness(
|
|
11580
|
+
self,
|
|
11581
|
+
column: str,
|
|
11582
|
+
max_age: str | datetime.timedelta,
|
|
11583
|
+
reference_time: datetime.datetime | str | None = None,
|
|
11584
|
+
timezone: str | None = None,
|
|
11585
|
+
allow_tz_mismatch: bool = False,
|
|
11586
|
+
pre: Callable | None = None,
|
|
11587
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11588
|
+
actions: Actions | None = None,
|
|
11589
|
+
brief: str | bool | None = None,
|
|
11590
|
+
active: bool = True,
|
|
11591
|
+
) -> Validate:
|
|
11592
|
+
"""
|
|
11593
|
+
Validate that data in a datetime column is not older than a specified maximum age.
|
|
11594
|
+
|
|
11595
|
+
The `data_freshness()` validation method checks whether the most recent timestamp in the
|
|
11596
|
+
specified datetime column is within the allowed `max_age=` from the `reference_time=` (which
|
|
11597
|
+
defaults to the current time). This is useful for ensuring data pipelines are delivering
|
|
11598
|
+
fresh data and for enforcing data SLAs.
|
|
11599
|
+
|
|
11600
|
+
This method helps detect stale data by comparing the maximum (most recent) value in a
|
|
11601
|
+
datetime column against an expected freshness threshold.
|
|
11602
|
+
|
|
11603
|
+
Parameters
|
|
11604
|
+
----------
|
|
11605
|
+
column
|
|
11606
|
+
The name of the datetime column to check for freshness. This column should contain
|
|
11607
|
+
date or datetime values.
|
|
11608
|
+
max_age
|
|
11609
|
+
The maximum allowed age of the data. Can be specified as: (1) a string with a
|
|
11610
|
+
human-readable duration like `"24 hours"`, `"1 day"`, `"30 minutes"`, `"2 weeks"`, etc.
|
|
11611
|
+
(supported units: `seconds`, `minutes`, `hours`, `days`, `weeks`), or (2) a
|
|
11612
|
+
`datetime.timedelta` object for precise control.
|
|
11613
|
+
reference_time
|
|
11614
|
+
The reference point in time to compare against. Defaults to `None`, which uses the
|
|
11615
|
+
current time (UTC if `timezone=` is not specified). Can be: (1) a `datetime.datetime`
|
|
11616
|
+
object (timezone-aware recommended), (2) a string in ISO 8601 format (e.g.,
|
|
11617
|
+
`"2024-01-15T10:30:00"` or `"2024-01-15T10:30:00+05:30"`), or (3) `None` to use the
|
|
11618
|
+
current time.
|
|
11619
|
+
timezone
|
|
11620
|
+
The timezone to use for interpreting the data and reference time. Accepts IANA
|
|
11621
|
+
timezone names (e.g., `"America/New_York"`), hour offsets (e.g., `"-7"`), or ISO 8601
|
|
11622
|
+
offsets (e.g., `"-07:00"`). When `None` (default), naive datetimes are treated as UTC.
|
|
11623
|
+
See the *The `timezone=` Parameter* section for details.
|
|
11624
|
+
allow_tz_mismatch
|
|
11625
|
+
Whether to allow timezone mismatches between the column data and reference time.
|
|
11626
|
+
By default (`False`), a warning note is added when comparing timezone-naive with
|
|
11627
|
+
timezone-aware datetimes. Set to `True` to suppress these warnings.
|
|
11628
|
+
pre
|
|
11629
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
11630
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
11631
|
+
thresholds
|
|
11632
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
11633
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
11634
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
11635
|
+
be set locally and global thresholds (if any) will take effect.
|
|
11636
|
+
actions
|
|
11637
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
11638
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
11639
|
+
define the actions.
|
|
11640
|
+
brief
|
|
11641
|
+
An optional brief description of the validation step that will be displayed in the
|
|
11642
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
11643
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
11644
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
11645
|
+
won't be a brief.
|
|
11646
|
+
active
|
|
11647
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
11648
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
11649
|
+
for the steps unchanged).
|
|
11650
|
+
|
|
11651
|
+
Returns
|
|
11652
|
+
-------
|
|
11653
|
+
Validate
|
|
11654
|
+
The `Validate` object with the added validation step.
|
|
11655
|
+
|
|
11656
|
+
How Timezones Affect Freshness Checks
|
|
11657
|
+
-------------------------------------
|
|
11658
|
+
Freshness validation involves comparing two times: the **data time** (the most recent
|
|
11659
|
+
timestamp in your column) and the **execution time** (when and where the validation runs).
|
|
11660
|
+
Timezone confusion typically arises because these two times may originate from different
|
|
11661
|
+
contexts.
|
|
11662
|
+
|
|
11663
|
+
Consider these common scenarios:
|
|
11664
|
+
|
|
11665
|
+
- your data timestamps are stored in UTC (common for databases), but you're running
|
|
11666
|
+
validation on your laptop in New York (Eastern Time)
|
|
11667
|
+
- you develop and test validation locally, then deploy it to a cloud workflow that runs
|
|
11668
|
+
in UTC—suddenly your 'same' validation behaves differently
|
|
11669
|
+
- your data comes from servers in multiple regions, each recording timestamps in their
|
|
11670
|
+
local timezone
|
|
11671
|
+
|
|
11672
|
+
The `timezone=` parameter exists to solve this problem by establishing a single, explicit
|
|
11673
|
+
timezone context for the freshness comparison. When you specify a timezone, Pointblank
|
|
11674
|
+
interprets both the data timestamps (if naive) and the execution time in that timezone,
|
|
11675
|
+
ensuring consistent behavior whether you run validation on your laptop or in a cloud
|
|
11676
|
+
workflow.
|
|
11677
|
+
|
|
11678
|
+
**Scenario 1: Data has timezone-aware datetimes**
|
|
11679
|
+
|
|
11680
|
+
```python
|
|
11681
|
+
# Your data column has values like: 2024-01-15 10:30:00+00:00 (UTC)
|
|
11682
|
+
# Comparison is straightforward as both sides have explicit timezones
|
|
11683
|
+
.data_freshness(column="updated_at", max_age="24 hours")
|
|
11684
|
+
```
|
|
11685
|
+
|
|
11686
|
+
**Scenario 2: Data has naive datetimes (no timezone)**
|
|
11687
|
+
|
|
11688
|
+
```python
|
|
11689
|
+
# Your data column has values like: 2024-01-15 10:30:00 (no timezone)
|
|
11690
|
+
# Specify the timezone the data was recorded in:
|
|
11691
|
+
.data_freshness(column="updated_at", max_age="24 hours", timezone="America/New_York")
|
|
11692
|
+
```
|
|
11693
|
+
|
|
11694
|
+
**Scenario 3: Ensuring consistent behavior across environments**
|
|
11695
|
+
|
|
11696
|
+
```python
|
|
11697
|
+
# Pin the timezone to ensure identical results whether running locally or in the cloud
|
|
11698
|
+
.data_freshness(
|
|
11699
|
+
column="updated_at",
|
|
11700
|
+
max_age="24 hours",
|
|
11701
|
+
timezone="UTC", # Explicit timezone removes environment dependence
|
|
11702
|
+
)
|
|
11703
|
+
```
|
|
11704
|
+
|
|
11705
|
+
The `timezone=` Parameter
|
|
11706
|
+
---------------------------
|
|
11707
|
+
The `timezone=` parameter accepts several convenient formats, making it easy to specify
|
|
11708
|
+
timezones in whatever way is most natural for your use case. The following examples
|
|
11709
|
+
illustrate the three supported input styles.
|
|
11710
|
+
|
|
11711
|
+
**IANA Timezone Names** (recommended for regions with daylight saving time):
|
|
11712
|
+
|
|
11713
|
+
```python
|
|
11714
|
+
timezone="America/New_York" # Eastern Time (handles DST automatically)
|
|
11715
|
+
timezone="Europe/London" # UK time
|
|
11716
|
+
timezone="Asia/Tokyo" # Japan Standard Time
|
|
11717
|
+
timezone="Australia/Sydney" # Australian Eastern Time
|
|
11718
|
+
timezone="UTC" # Coordinated Universal Time
|
|
11719
|
+
```
|
|
11720
|
+
|
|
11721
|
+
**Simple Hour Offsets** (quick and easy):
|
|
11722
|
+
|
|
11723
|
+
```python
|
|
11724
|
+
timezone="-7" # UTC-7 (e.g., Mountain Standard Time)
|
|
11725
|
+
timezone="+5" # UTC+5 (e.g., Pakistan Standard Time)
|
|
11726
|
+
timezone="0" # UTC
|
|
11727
|
+
timezone="-12" # UTC-12
|
|
11728
|
+
```
|
|
11729
|
+
|
|
11730
|
+
**ISO 8601 Offset Format** (precise, including fractional hours):
|
|
11731
|
+
|
|
11732
|
+
```python
|
|
11733
|
+
timezone="-07:00" # UTC-7
|
|
11734
|
+
timezone="+05:30" # UTC+5:30 (e.g., India Standard Time)
|
|
11735
|
+
timezone="+00:00" # UTC
|
|
11736
|
+
timezone="-09:30" # UTC-9:30
|
|
11737
|
+
```
|
|
11738
|
+
|
|
11739
|
+
When a timezone is specified:
|
|
11740
|
+
|
|
11741
|
+
- naive datetime values in the column are assumed to be in this timezone.
|
|
11742
|
+
- the reference time (if naive) is assumed to be in this timezone.
|
|
11743
|
+
- the validation report will show times in this timezone.
|
|
11744
|
+
|
|
11745
|
+
When `None` (default):
|
|
11746
|
+
|
|
11747
|
+
- if your column has timezone-aware datetimes, those timezones are used
|
|
11748
|
+
- if your column has naive datetimes, they're treated as UTC
|
|
11749
|
+
- the current time reference uses UTC
|
|
11750
|
+
|
|
11751
|
+
Note that IANA timezone names are preferred when daylight saving time transitions matter, as
|
|
11752
|
+
they automatically handle the offset changes. Fixed offsets like `"-7"` or `"-07:00"` do not
|
|
11753
|
+
account for DST.
|
|
11754
|
+
|
|
11755
|
+
Recommendations for Working with Timestamps
|
|
11756
|
+
-------------------------------------------
|
|
11757
|
+
When working with datetime data, storing timestamps in UTC in your databases is strongly
|
|
11758
|
+
recommended since it provides a consistent reference point regardless of where your data
|
|
11759
|
+
originates or where it's consumed. Using timezone-aware datetimes whenever possible helps
|
|
11760
|
+
avoid ambiguity—when a datetime has an explicit timezone, there's no guessing about what
|
|
11761
|
+
time it actually represents.
|
|
11762
|
+
|
|
11763
|
+
If you're working with naive datetimes (which lack timezone information), always specify the
|
|
11764
|
+
`timezone=` parameter so Pointblank knows how to interpret those values. When providing
|
|
11765
|
+
`reference_time=` as a string, use ISO 8601 format with the timezone offset included (e.g.,
|
|
11766
|
+
`"2024-01-15T10:30:00+00:00"`) to ensure unambiguous parsing. Finally, prefer IANA timezone
|
|
11767
|
+
names (like `"America/New_York"`) over fixed offsets (like `"-05:00"`) when daylight saving
|
|
11768
|
+
time transitions matter, since IANA names automatically handle the twice-yearly offset
|
|
11769
|
+
changes. To see all available IANA timezone names in Python, use
|
|
11770
|
+
`zoneinfo.available_timezones()` from the standard library's `zoneinfo` module.
|
|
11771
|
+
|
|
11772
|
+
Examples
|
|
11773
|
+
--------
|
|
11774
|
+
```{python}
|
|
11775
|
+
#| echo: false
|
|
11776
|
+
#| output: false
|
|
11777
|
+
import pointblank as pb
|
|
11778
|
+
pb.config(report_incl_header=False, report_incl_footer=False)
|
|
11779
|
+
```
|
|
11780
|
+
|
|
11781
|
+
The simplest use of `data_freshness()` requires just two arguments: the `column=` containing
|
|
11782
|
+
your timestamps and `max_age=` specifying how old the data can be. In this first example,
|
|
11783
|
+
we create sample data with an `"updated_at"` column containing timestamps from 1, 12, and
|
|
11784
|
+
20 hours ago. By setting `max_age="24 hours"`, we're asserting that the most recent
|
|
11785
|
+
timestamp should be within 24 hours of the current time. Since the newest record is only
|
|
11786
|
+
1 hour old, this validation passes.
|
|
11787
|
+
|
|
11788
|
+
```{python}
|
|
11789
|
+
import pointblank as pb
|
|
11790
|
+
import polars as pl
|
|
11791
|
+
from datetime import datetime, timedelta
|
|
11792
|
+
|
|
11793
|
+
# Create sample data with recent timestamps
|
|
11794
|
+
recent_data = pl.DataFrame({
|
|
11795
|
+
"id": [1, 2, 3],
|
|
11796
|
+
"updated_at": [
|
|
11797
|
+
datetime.now() - timedelta(hours=1),
|
|
11798
|
+
datetime.now() - timedelta(hours=12),
|
|
11799
|
+
datetime.now() - timedelta(hours=20),
|
|
11800
|
+
]
|
|
11801
|
+
})
|
|
11802
|
+
|
|
11803
|
+
validation = (
|
|
11804
|
+
pb.Validate(data=recent_data)
|
|
11805
|
+
.data_freshness(column="updated_at", max_age="24 hours")
|
|
11806
|
+
.interrogate()
|
|
11807
|
+
)
|
|
11808
|
+
|
|
11809
|
+
validation
|
|
11810
|
+
```
|
|
11811
|
+
|
|
11812
|
+
The `max_age=` parameter accepts human-readable strings with various time units. You can
|
|
11813
|
+
chain multiple `data_freshness()` calls to check different freshness thresholds
|
|
11814
|
+
simultaneously—useful for tiered SLAs where you might want warnings at 30 minutes but
|
|
11815
|
+
errors at 2 days. With the sample data above, the 30-minute check fails (the newest record is an hour old) while the 2-day and 1-week checks pass.
|
|
11816
|
+
|
|
11817
|
+
```{python}
|
|
11818
|
+
# Check data is fresh within different time windows
|
|
11819
|
+
validation = (
|
|
11820
|
+
pb.Validate(data=recent_data)
|
|
11821
|
+
.data_freshness(column="updated_at", max_age="30 minutes") # Very fresh
|
|
11822
|
+
.data_freshness(column="updated_at", max_age="2 days") # Reasonably fresh
|
|
11823
|
+
.data_freshness(column="updated_at", max_age="1 week") # Within a week
|
|
11824
|
+
.interrogate()
|
|
11825
|
+
)
|
|
11826
|
+
|
|
11827
|
+
validation
|
|
11828
|
+
```
|
|
11829
|
+
|
|
11830
|
+
When your data contains naive datetimes (timestamps without timezone information), use the
|
|
11831
|
+
`timezone=` parameter to specify what timezone those values represent. Here we have event
|
|
11832
|
+
data recorded in Eastern Time, so we set `timezone="America/New_York"` to ensure the
|
|
11833
|
+
freshness comparison is done correctly.
|
|
11834
|
+
|
|
11835
|
+
```{python}
|
|
11836
|
+
# Data with naive datetimes (assume they're in Eastern Time)
|
|
11837
|
+
eastern_data = pl.DataFrame({
|
|
11838
|
+
"event_time": [
|
|
11839
|
+
datetime.now() - timedelta(hours=2),
|
|
11840
|
+
datetime.now() - timedelta(hours=5),
|
|
11841
|
+
]
|
|
11842
|
+
})
|
|
11843
|
+
|
|
11844
|
+
validation = (
|
|
11845
|
+
pb.Validate(data=eastern_data)
|
|
11846
|
+
.data_freshness(
|
|
11847
|
+
column="event_time",
|
|
11848
|
+
max_age="12 hours",
|
|
11849
|
+
timezone="America/New_York" # Interpret times as Eastern
|
|
11850
|
+
)
|
|
11851
|
+
.interrogate()
|
|
11852
|
+
)
|
|
11853
|
+
|
|
11854
|
+
validation
|
|
11855
|
+
```
|
|
11856
|
+
|
|
11857
|
+
For reproducible validations or historical checks, you can use `reference_time=` to compare
|
|
11858
|
+
against a specific point in time instead of the current time. This is particularly useful
|
|
11859
|
+
for testing or when validating data snapshots. The reference time should include a timezone
|
|
11860
|
+
offset (like `+00:00` for UTC) to avoid ambiguity.
|
|
11861
|
+
|
|
11862
|
+
```{python}
|
|
11863
|
+
validation = (
|
|
11864
|
+
pb.Validate(data=recent_data)
|
|
11865
|
+
.data_freshness(
|
|
11866
|
+
column="updated_at",
|
|
11867
|
+
max_age="24 hours",
|
|
11868
|
+
reference_time="2024-01-15T12:00:00+00:00"
|
|
11869
|
+
)
|
|
11870
|
+
.interrogate()
|
|
11871
|
+
)
|
|
11872
|
+
|
|
11873
|
+
validation
|
|
11874
|
+
```
|
|
11875
|
+
"""
|
|
11876
|
+
|
|
11877
|
+
assertion_type = _get_fn_name()
|
|
11878
|
+
|
|
11879
|
+
_check_pre(pre=pre)
|
|
11880
|
+
_check_thresholds(thresholds=thresholds)
|
|
11881
|
+
_check_boolean_input(param=active, param_name="active")
|
|
11882
|
+
_check_boolean_input(param=allow_tz_mismatch, param_name="allow_tz_mismatch")
|
|
11883
|
+
|
|
11884
|
+
# Validate and parse the max_age parameter
|
|
11885
|
+
max_age_td = _parse_max_age(max_age)
|
|
11886
|
+
|
|
11887
|
+
# Validate the column parameter
|
|
11888
|
+
if not isinstance(column, str):
|
|
11889
|
+
raise TypeError(
|
|
11890
|
+
f"The `column` parameter must be a string, got {type(column).__name__}."
|
|
11891
|
+
)
|
|
11892
|
+
|
|
11893
|
+
# Validate the timezone parameter if provided
|
|
11894
|
+
if timezone is not None:
|
|
11895
|
+
_validate_timezone(timezone)
|
|
11896
|
+
|
|
11897
|
+
# Parse reference_time if it's a string
|
|
11898
|
+
parsed_reference_time = None
|
|
11899
|
+
if reference_time is not None:
|
|
11900
|
+
if isinstance(reference_time, str):
|
|
11901
|
+
parsed_reference_time = _parse_reference_time(reference_time)
|
|
11902
|
+
elif isinstance(reference_time, datetime.datetime):
|
|
11903
|
+
parsed_reference_time = reference_time
|
|
11904
|
+
else:
|
|
11905
|
+
raise TypeError(
|
|
11906
|
+
f"The `reference_time` parameter must be a string or datetime object, "
|
|
11907
|
+
f"got {type(reference_time).__name__}."
|
|
11908
|
+
)
|
|
11909
|
+
|
|
11910
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
11911
|
+
thresholds = (
|
|
11912
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
11913
|
+
)
|
|
11914
|
+
|
|
11915
|
+
# Package up the parameters for later interrogation
|
|
11916
|
+
values = {
|
|
11917
|
+
"max_age": max_age_td,
|
|
11918
|
+
"max_age_str": max_age if isinstance(max_age, str) else str(max_age),
|
|
11919
|
+
"reference_time": parsed_reference_time,
|
|
11920
|
+
"timezone": timezone,
|
|
11921
|
+
"allow_tz_mismatch": allow_tz_mismatch,
|
|
11922
|
+
}
|
|
11923
|
+
|
|
11924
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
11925
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
11926
|
+
|
|
11927
|
+
val_info = _ValidationInfo(
|
|
11928
|
+
assertion_type=assertion_type,
|
|
11929
|
+
column=column,
|
|
11930
|
+
values=values,
|
|
11931
|
+
pre=pre,
|
|
11932
|
+
thresholds=thresholds,
|
|
11933
|
+
actions=actions,
|
|
11934
|
+
brief=brief,
|
|
11935
|
+
active=active,
|
|
11936
|
+
)
|
|
11937
|
+
|
|
11938
|
+
self._add_validation(validation_info=val_info)
|
|
11939
|
+
|
|
11940
|
+
return self
|
|
11941
|
+
|
|
11389
11942
|
def col_count_match(
|
|
11390
11943
|
self,
|
|
11391
|
-
count: int | any,
|
|
11944
|
+
count: int | Any,
|
|
11392
11945
|
inverse: bool = False,
|
|
11393
11946
|
pre: Callable | None = None,
|
|
11394
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
11947
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11395
11948
|
actions: Actions | None = None,
|
|
11396
11949
|
brief: str | bool | None = None,
|
|
11397
11950
|
active: bool = True,
|
|
@@ -11564,9 +12117,9 @@ class Validate:
|
|
|
11564
12117
|
|
|
11565
12118
|
def tbl_match(
|
|
11566
12119
|
self,
|
|
11567
|
-
tbl_compare: any,
|
|
12120
|
+
tbl_compare: Any,
|
|
11568
12121
|
pre: Callable | None = None,
|
|
11569
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
12122
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11570
12123
|
actions: Actions | None = None,
|
|
11571
12124
|
brief: str | bool | None = None,
|
|
11572
12125
|
active: bool = True,
|
|
@@ -11835,7 +12388,7 @@ class Validate:
|
|
|
11835
12388
|
self,
|
|
11836
12389
|
*exprs: Callable,
|
|
11837
12390
|
pre: Callable | None = None,
|
|
11838
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
12391
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11839
12392
|
actions: Actions | None = None,
|
|
11840
12393
|
brief: str | bool | None = None,
|
|
11841
12394
|
active: bool = True,
|
|
@@ -12083,7 +12636,7 @@ class Validate:
|
|
|
12083
12636
|
self,
|
|
12084
12637
|
expr: Callable,
|
|
12085
12638
|
pre: Callable | None = None,
|
|
12086
|
-
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
12639
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
12087
12640
|
actions: Actions | None = None,
|
|
12088
12641
|
brief: str | bool | None = None,
|
|
12089
12642
|
active: bool = True,
|
|
@@ -12577,7 +13130,7 @@ class Validate:
|
|
|
12577
13130
|
segment = validation.segments
|
|
12578
13131
|
|
|
12579
13132
|
# Get compatible data types for this assertion type
|
|
12580
|
-
assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
|
|
13133
|
+
assertion_method = ASSERTION_TYPE_METHOD_MAP.get(assertion_type, assertion_type)
|
|
12581
13134
|
compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
|
|
12582
13135
|
|
|
12583
13136
|
# Process the `brief` text for the validation step by including template variables to
|
|
@@ -12632,7 +13185,11 @@ class Validate:
|
|
|
12632
13185
|
|
|
12633
13186
|
# Make a deep copy of the table for this step to ensure proper isolation
|
|
12634
13187
|
# This prevents modifications from one validation step affecting others
|
|
12635
|
-
|
|
13188
|
+
try:
|
|
13189
|
+
# TODO: This copying should be scrutinized further
|
|
13190
|
+
data_tbl_step: IntoDataFrame = _copy_dataframe(data_tbl)
|
|
13191
|
+
except Exception as e: # pragma: no cover
|
|
13192
|
+
data_tbl_step: IntoDataFrame = data_tbl # pragma: no cover
|
|
12636
13193
|
|
|
12637
13194
|
# Capture original table dimensions and columns before preprocessing
|
|
12638
13195
|
# (only if preprocessing is present - we'll set these inside the preprocessing block)
|
|
@@ -12793,6 +13350,8 @@ class Validate:
|
|
|
12793
13350
|
"col_schema_match",
|
|
12794
13351
|
"row_count_match",
|
|
12795
13352
|
"col_count_match",
|
|
13353
|
+
"data_freshness",
|
|
13354
|
+
"tbl_match",
|
|
12796
13355
|
]
|
|
12797
13356
|
|
|
12798
13357
|
if validation.n == 0 and assertion_type not in table_level_assertions:
|
|
@@ -13053,8 +13612,107 @@ class Validate:
|
|
|
13053
13612
|
|
|
13054
13613
|
results_tbl = None
|
|
13055
13614
|
|
|
13056
|
-
elif assertion_type == "
|
|
13057
|
-
from pointblank._interrogation import tbl_match
|
|
13615
|
+
elif assertion_type == "data_freshness":
|
|
13616
|
+
from pointblank._interrogation import data_freshness as data_freshness_check
|
|
13617
|
+
|
|
13618
|
+
freshness_result = data_freshness_check(
|
|
13619
|
+
data_tbl=data_tbl_step,
|
|
13620
|
+
column=column,
|
|
13621
|
+
max_age=value["max_age"],
|
|
13622
|
+
reference_time=value["reference_time"],
|
|
13623
|
+
timezone=value["timezone"],
|
|
13624
|
+
allow_tz_mismatch=value["allow_tz_mismatch"],
|
|
13625
|
+
)
|
|
13626
|
+
|
|
13627
|
+
result_bool = freshness_result["passed"]
|
|
13628
|
+
validation.all_passed = result_bool
|
|
13629
|
+
validation.n = 1
|
|
13630
|
+
validation.n_passed = int(result_bool)
|
|
13631
|
+
validation.n_failed = 1 - int(result_bool)
|
|
13632
|
+
|
|
13633
|
+
# Store the freshness check details for reporting
|
|
13634
|
+
validation.val_info = freshness_result
|
|
13635
|
+
|
|
13636
|
+
# Update the values dict with actual computed values for failure text
|
|
13637
|
+
if freshness_result.get("age") is not None:
|
|
13638
|
+
value["age"] = freshness_result["age"]
|
|
13639
|
+
|
|
13640
|
+
# Add timezone warning note if applicable
|
|
13641
|
+
if freshness_result.get("tz_warning_key"):
|
|
13642
|
+
tz_key = freshness_result["tz_warning_key"]
|
|
13643
|
+
tz_warning_text = NOTES_TEXT.get(tz_key, {}).get(
|
|
13644
|
+
self.locale, NOTES_TEXT.get(tz_key, {}).get("en", "")
|
|
13645
|
+
)
|
|
13646
|
+
validation._add_note(
|
|
13647
|
+
key="tz_warning",
|
|
13648
|
+
markdown=f"⚠️ {tz_warning_text}",
|
|
13649
|
+
text=tz_warning_text,
|
|
13650
|
+
)
|
|
13651
|
+
|
|
13652
|
+
# Add note about column being empty if applicable
|
|
13653
|
+
if freshness_result.get("column_empty"):
|
|
13654
|
+
column_empty_text = NOTES_TEXT.get(
|
|
13655
|
+
"data_freshness_column_empty", {}
|
|
13656
|
+
).get(
|
|
13657
|
+
self.locale,
|
|
13658
|
+
NOTES_TEXT.get("data_freshness_column_empty", {}).get(
|
|
13659
|
+
"en", "The datetime column is empty (no values to check)."
|
|
13660
|
+
),
|
|
13661
|
+
)
|
|
13662
|
+
validation._add_note(
|
|
13663
|
+
key="column_empty",
|
|
13664
|
+
markdown=f"⚠️ {column_empty_text}",
|
|
13665
|
+
text=column_empty_text,
|
|
13666
|
+
)
|
|
13667
|
+
|
|
13668
|
+
# Add informational note about the freshness check
|
|
13669
|
+
if freshness_result.get("max_datetime") and freshness_result.get("age"):
|
|
13670
|
+
max_dt = freshness_result["max_datetime"]
|
|
13671
|
+
# Format datetime without microseconds for cleaner display
|
|
13672
|
+
if hasattr(max_dt, "replace"):
|
|
13673
|
+
max_dt_display = max_dt.replace(microsecond=0)
|
|
13674
|
+
else:
|
|
13675
|
+
max_dt_display = max_dt
|
|
13676
|
+
age = freshness_result["age"]
|
|
13677
|
+
age_str = _format_timedelta(age)
|
|
13678
|
+
max_age_str = _format_timedelta(value["max_age"])
|
|
13679
|
+
|
|
13680
|
+
# Get translated template for pass/fail
|
|
13681
|
+
if result_bool:
|
|
13682
|
+
details_key = "data_freshness_details_pass"
|
|
13683
|
+
prefix = "✓"
|
|
13684
|
+
else:
|
|
13685
|
+
details_key = "data_freshness_details_fail"
|
|
13686
|
+
prefix = "✗"
|
|
13687
|
+
|
|
13688
|
+
details_template = NOTES_TEXT.get(details_key, {}).get(
|
|
13689
|
+
self.locale,
|
|
13690
|
+
NOTES_TEXT.get(details_key, {}).get(
|
|
13691
|
+
"en",
|
|
13692
|
+
"Most recent data: `{max_dt}` (age: {age}, max allowed: {max_age})",
|
|
13693
|
+
),
|
|
13694
|
+
)
|
|
13695
|
+
|
|
13696
|
+
# Format the template with values
|
|
13697
|
+
note_text = details_template.format(
|
|
13698
|
+
max_dt=max_dt_display, age=age_str, max_age=max_age_str
|
|
13699
|
+
)
|
|
13700
|
+
# For markdown, make the age bold
|
|
13701
|
+
note_md_template = details_template.replace(
|
|
13702
|
+
"(age: {age}", "(age: **{age}**"
|
|
13703
|
+
)
|
|
13704
|
+
note_md = f"{prefix} {note_md_template.format(max_dt=max_dt_display, age=age_str, max_age=max_age_str)}"
|
|
13705
|
+
|
|
13706
|
+
validation._add_note(
|
|
13707
|
+
key="freshness_details",
|
|
13708
|
+
markdown=note_md,
|
|
13709
|
+
text=note_text,
|
|
13710
|
+
)
|
|
13711
|
+
|
|
13712
|
+
results_tbl = None
|
|
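In miniature, the decision this branch records is a single age comparison. A hedged sketch of the assumed semantics (the real logic lives in `pointblank._interrogation.data_freshness` and also handles naive values, timezone mismatches, and empty columns):

```python
import datetime

# Sketch only: newest timestamp measured against the reference time
def is_fresh(newest: datetime.datetime, max_age: datetime.timedelta,
             reference_time: datetime.datetime | None = None) -> bool:
    # `newest` and the reference must agree on timezone-awareness
    ref = reference_time or datetime.datetime.now(datetime.timezone.utc)
    return (ref - newest) <= max_age
```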
13713
|
+
|
|
13714
|
+
elif assertion_type == "tbl_match":
|
|
13715
|
+
from pointblank._interrogation import tbl_match
|
|
13058
13716
|
|
|
13059
13717
|
# Get the comparison table (could be callable or actual table)
|
|
13060
13718
|
tbl_compare = value["tbl_compare"]
|
|
@@ -13080,6 +13738,53 @@ class Validate:
|
|
|
13080
13738
|
tbl_type=tbl_type,
|
|
13081
13739
|
)
|
|
13082
13740
|
|
|
13741
|
+
elif is_valid_agg(assertion_type):
|
|
13742
|
+
agg, comp = resolve_agg_registries(assertion_type)
|
|
13743
|
+
|
|
13744
|
+
# Produce a 1-column Narwhals DataFrame
|
|
13745
|
+
# TODO: Should be able to take lazy too
|
|
13746
|
+
vec: nw.DataFrame = nw.from_native(data_tbl_step).select(column)
|
|
13747
|
+
real = agg(vec)
|
|
13748
|
+
|
|
13749
|
+
raw_value = value["value"]
|
|
13750
|
+
tol = value["tol"]
|
|
13751
|
+
|
|
13752
|
+
# Handle ReferenceColumn: compute target from reference data
|
|
13753
|
+
if isinstance(raw_value, ReferenceColumn):
|
|
13754
|
+
if self.reference is None:
|
|
13755
|
+
raise ValueError(
|
|
13756
|
+
f"Cannot use ref('{raw_value.column_name}') without "
|
|
13757
|
+
"setting reference data on the Validate object. "
|
|
13758
|
+
"Use Validate(data=..., reference=...) to set reference data."
|
|
13759
|
+
)
|
|
13760
|
+
ref_vec: nw.DataFrame = nw.from_native(self.reference).select(
|
|
13761
|
+
raw_value.column_name
|
|
13762
|
+
)
|
|
13763
|
+
target: float | int = agg(ref_vec)
|
|
13764
|
+
else:
|
|
13765
|
+
target = raw_value
|
|
13766
|
+
|
|
13767
|
+
lower_diff, upper_diff = _derive_bounds(target, tol)
|
|
13768
|
+
|
|
13769
|
+
lower_bound = target - lower_diff
|
|
13770
|
+
upper_bound = target + upper_diff
|
|
13771
|
+
result_bool: bool = comp(real, lower_bound, upper_bound)
|
|
13772
|
+
|
|
13773
|
+
validation.all_passed = result_bool
|
|
13774
|
+
validation.n = 1
|
|
13775
|
+
validation.n_passed = int(result_bool)
|
|
13776
|
+
validation.n_failed = 1 - result_bool
|
|
13777
|
+
|
|
13778
|
+
# Store computed values for step reports
|
|
13779
|
+
validation.val_info = {
|
|
13780
|
+
"actual": real,
|
|
13781
|
+
"target": target,
|
|
13782
|
+
"tol": tol,
|
|
13783
|
+
"lower_bound": lower_bound,
|
|
13784
|
+
"upper_bound": upper_bound,
|
|
13785
|
+
}
|
|
13786
|
+
|
|
13787
|
+
results_tbl = None
|
|
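The pass/fail decision above reduces to a bounds check around the target. A minimal sketch assuming a symmetric absolute tolerance (the actual `_derive_bounds` and `comp` come from the aggregation registries and may treat `tol` differently, e.g., as asymmetric bounds):

```python
def agg_passes(actual: float, target: float, tol: float) -> bool:
    # Symmetric absolute tolerance assumed for this sketch
    lower_bound, upper_bound = target - tol, target + tol
    return lower_bound <= actual <= upper_bound

print(agg_passes(101.2, 100.0, tol=2.0))  # True
print(agg_passes(104.0, 100.0, tol=2.0))  # False
```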
13083
13788
|
else:
|
|
13084
13789
|
raise ValueError(
|
|
13085
13790
|
f"Unknown assertion type: {assertion_type}"
|
|
@@ -13822,12 +14527,14 @@ class Validate:
|
|
|
13822
14527
|
)
|
|
13823
14528
|
|
|
13824
14529
|
# Get the threshold status using the appropriate method
|
|
14530
|
+
# Note: scalar=False (default) always returns a dict
|
|
14531
|
+
status: dict[int, bool]
|
|
13825
14532
|
if level == "warning":
|
|
13826
|
-
status = self.warning(i=i)
|
|
14533
|
+
status = self.warning(i=i) # type: ignore[assignment]
|
|
13827
14534
|
elif level == "error":
|
|
13828
|
-
status = self.error(i=i)
|
|
13829
|
-
|
|
13830
|
-
status = self.critical(i=i)
|
|
14535
|
+
status = self.error(i=i) # type: ignore[assignment]
|
|
14536
|
+
else: # level == "critical"
|
|
14537
|
+
status = self.critical(i=i) # type: ignore[assignment]
|
|
13831
14538
|
|
|
13832
14539
|
# Find any steps that exceeded the threshold
|
|
13833
14540
|
failures = []
|
|
@@ -13981,12 +14688,14 @@ class Validate:
|
|
|
13981
14688
|
)
|
|
13982
14689
|
|
|
13983
14690
|
# Get the threshold status using the appropriate method
|
|
14691
|
+
# Note: scalar=False (default) always returns a dict
|
|
14692
|
+
status: dict[int, bool]
|
|
13984
14693
|
if level == "warning":
|
|
13985
|
-
status = self.warning(i=i)
|
|
14694
|
+
status = self.warning(i=i) # type: ignore[assignment]
|
|
13986
14695
|
elif level == "error":
|
|
13987
|
-
status = self.error(i=i)
|
|
13988
|
-
|
|
13989
|
-
status = self.critical(i=i)
|
|
14696
|
+
status = self.error(i=i) # type: ignore[assignment]
|
|
14697
|
+
else: # level == "critical"
|
|
14698
|
+
status = self.critical(i=i) # type: ignore[assignment]
|
|
13990
14699
|
|
|
13991
14700
|
# Return True if any steps exceeded the threshold
|
|
13992
14701
|
return any(status.values())
|
|
@@ -14759,7 +15468,7 @@ class Validate:
|
|
|
14759
15468
|
|
|
14760
15469
|
def get_data_extracts(
|
|
14761
15470
|
self, i: int | list[int] | None = None, frame: bool = False
|
|
14762
|
-
) -> dict[int, FrameT] | FrameT:
|
|
15471
|
+
) -> dict[int, Any] | Any:
|
|
14763
15472
|
"""
|
|
14764
15473
|
Get the rows that failed for each validation step.
|
|
14765
15474
|
|
|
@@ -14782,7 +15491,7 @@ class Validate:
|
|
|
14782
15491
|
|
|
14783
15492
|
Returns
|
|
14784
15493
|
-------
|
|
14785
|
-
dict[int, FrameT] | FrameT
|
|
15494
|
+
dict[int, Any] | Any
|
|
14786
15495
|
A dictionary of tables containing the rows that failed in every compatible validation
|
|
14787
15496
|
step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
|
|
14788
15497
|
|
|
@@ -15072,7 +15781,7 @@ class Validate:
|
|
|
15072
15781
|
|
|
15073
15782
|
return json.dumps(report, indent=4, default=str)
|
|
15074
15783
|
|
|
15075
|
-
def get_sundered_data(self, type="pass") ->
|
|
15784
|
+
def get_sundered_data(self, type="pass") -> Any:
|
|
15076
15785
|
"""
|
|
15077
15786
|
Get the data that passed or failed the validation steps.
|
|
15078
15787
|
|
|
@@ -15108,7 +15817,7 @@ class Validate:
|
|
|
15108
15817
|
|
|
15109
15818
|
Returns
|
|
15110
15819
|
-------
|
|
15111
|
-
|
|
15820
|
+
Any
|
|
15112
15821
|
A table containing the data that passed or failed the validation steps.
|
|
15113
15822
|
|
|
15114
15823
|
Examples
|
|
@@ -15200,6 +15909,7 @@ class Validate:
|
|
|
15200
15909
|
# Get all validation step result tables and join together the `pb_is_good_` columns
|
|
15201
15910
|
# ensuring that the columns are named uniquely (e.g., `pb_is_good_1`, `pb_is_good_2`, ...)
|
|
15202
15911
|
# and that the index is reset
|
|
15912
|
+
labeled_tbl_nw: nw.DataFrame | nw.LazyFrame | None = None
|
|
15203
15913
|
for i, validation in enumerate(validation_info):
|
|
15204
15914
|
results_tbl = nw.from_native(validation.tbl_checked)
|
|
15205
15915
|
|
|
@@ -15220,7 +15930,7 @@ class Validate:
|
|
|
15220
15930
|
)
|
|
15221
15931
|
|
|
15222
15932
|
# Add the results table to the list of tables
|
|
15223
|
-
if i == 0:
|
|
15933
|
+
if labeled_tbl_nw is None:
|
|
15224
15934
|
labeled_tbl_nw = results_tbl
|
|
15225
15935
|
else:
|
|
15226
15936
|
labeled_tbl_nw = labeled_tbl_nw.join(results_tbl, on=index_name, how="left")
|
|
@@ -15396,10 +16106,10 @@ class Validate:
|
|
|
15396
16106
|
def get_tabular_report(
|
|
15397
16107
|
self,
|
|
15398
16108
|
title: str | None = ":default:",
|
|
15399
|
-
incl_header: bool = None,
|
|
15400
|
-
incl_footer: bool = None,
|
|
15401
|
-
incl_footer_timings: bool = None,
|
|
15402
|
-
incl_footer_notes: bool = None,
|
|
16109
|
+
incl_header: bool | None = None,
|
|
16110
|
+
incl_footer: bool | None = None,
|
|
16111
|
+
incl_footer_timings: bool | None = None,
|
|
16112
|
+
incl_footer_notes: bool | None = None,
|
|
15403
16113
|
) -> GT:
|
|
15404
16114
|
"""
|
|
15405
16115
|
Validation report as a GT table.
|
|
@@ -15767,10 +16477,16 @@ class Validate:
|
|
|
15767
16477
|
elif assertion_type[i] in ["conjointly", "specially"]:
|
|
15768
16478
|
column_text = ""
|
|
15769
16479
|
else:
|
|
15770
|
-
|
|
16480
|
+
# Handle both string columns and list columns
|
|
16481
|
+
# For single-element lists like ['a'], display as 'a'
|
|
16482
|
+
# For multi-element lists, display as comma-separated values
|
|
16483
|
+
if isinstance(column, list):
|
|
16484
|
+
column_text = ", ".join(str(c) for c in column)
|
|
16485
|
+
else:
|
|
16486
|
+
column_text = str(column)
|
|
15771
16487
|
|
|
15772
|
-
# Apply underline styling for synthetic columns
|
|
15773
|
-
#
|
|
16488
|
+
# Apply underline styling for synthetic columns; only apply styling if column_text is
|
|
16489
|
+
# not empty and not a special marker
|
|
15774
16490
|
if (
|
|
15775
16491
|
has_synthetic_column
|
|
15776
16492
|
and column_text
|
|
@@ -15848,6 +16564,69 @@ class Validate:
|
|
|
15848
16564
|
tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
|
|
15849
16565
|
values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
|
|
15850
16566
|
|
|
16567
|
+
elif assertion_type[i] in ["data_freshness"]:
|
|
16568
|
+
# Format max_age nicely for display
|
|
16569
|
+
max_age = value.get("max_age")
|
|
16570
|
+
max_age_str = _format_timedelta(max_age) if max_age else "—"
|
|
16571
|
+
|
|
16572
|
+
# Build additional lines with non-default parameters
|
|
16573
|
+
extra_lines = []
|
|
16574
|
+
|
|
16575
|
+
if value.get("reference_time") is not None:
|
|
16576
|
+
ref_time = value["reference_time"]
|
|
16577
|
+
|
|
16578
|
+
# Format datetime across two lines: date and time+tz
|
|
16579
|
+
if hasattr(ref_time, "strftime"):
|
|
16580
|
+
date_str = ref_time.strftime("@%Y-%m-%d")
|
|
16581
|
+
time_str = " " + ref_time.strftime("%H:%M:%S")
|
|
16582
|
+
|
|
16583
|
+
# Add timezone offset if present
|
|
16584
|
+
if hasattr(ref_time, "tzinfo") and ref_time.tzinfo is not None:
|
|
16585
|
+
tz_offset = ref_time.strftime("%z")
|
|
16586
|
+
if tz_offset:
|
|
16587
|
+
time_str += tz_offset
|
|
16588
|
+
extra_lines.append(date_str)
|
|
16589
|
+
extra_lines.append(time_str)
|
|
16590
|
+
else:
|
|
16591
|
+
extra_lines.append(f"@{ref_time}")
|
|
16592
|
+
|
|
16593
|
+
# Timezone and allow_tz_mismatch on same line
|
|
16594
|
+
tz_line_parts = []
|
|
16595
|
+
if value.get("timezone") is not None:
|
|
16596
|
+
# Convert timezone name to ISO 8601 offset format
|
|
16597
|
+
tz_name = value["timezone"]
|
|
16598
|
+
|
|
16599
|
+
try:
|
|
16600
|
+
tz_obj = ZoneInfo(tz_name)
|
|
16601
|
+
|
|
16602
|
+
# Get the current offset for this timezone
|
|
16603
|
+
now = datetime.datetime.now(tz_obj)
|
|
16604
|
+
offset = now.strftime("%z")
|
|
16605
|
+
|
|
16606
|
+
# Format as ISO 8601 extended: -07:00 (insert colon)
|
|
16607
|
+
if len(offset) == 5:
|
|
16608
|
+
tz_display = f"{offset[:3]}:{offset[3:]}"
|
|
16609
|
+
else:
|
|
16610
|
+
tz_display = offset
|
|
16611
|
+
|
|
16612
|
+
except Exception:
|
|
16613
|
+
tz_display = tz_name
|
|
16614
|
+
tz_line_parts.append(tz_display)
|
|
16615
|
+
|
|
16616
|
+
if value.get("allow_tz_mismatch"):
|
|
16617
|
+
tz_line_parts.append("~tz")
|
|
16618
|
+
|
|
16619
|
+
if tz_line_parts:
|
|
16620
|
+
extra_lines.append(" ".join(tz_line_parts))
|
|
16621
|
+
|
|
16622
|
+
if extra_lines:
|
|
16623
|
+
extra_html = "<br/>".join(extra_lines)
|
|
16624
|
+
values_upd.append(
|
|
16625
|
+
f'{max_age_str}<br/><span style="font-size: 9px;">{extra_html}</span>'
|
|
16626
|
+
)
|
|
16627
|
+
else:
|
|
16628
|
+
values_upd.append(max_age_str)
|
|
16629
|
+
|
|
15851
16630
|
elif assertion_type[i] in ["col_schema_match"]:
|
|
15852
16631
|
values_upd.append("SCHEMA")
|
|
15853
16632
|
|
|
@@ -15889,6 +16668,32 @@ class Validate:
|
|
|
15889
16668
|
else: # pragma: no cover
|
|
15890
16669
|
values_upd.append(str(value)) # pragma: no cover
|
|
15891
16670
|
|
|
16671
|
+
# Handle aggregation methods (col_sum_gt, col_avg_eq, etc.)
|
|
16672
|
+
elif is_valid_agg(assertion_type[i]):
|
|
16673
|
+
# Extract the value and tolerance from the values dict
|
|
16674
|
+
agg_value = value.get("value")
|
|
16675
|
+
tol_value = value.get("tol", 0)
|
|
16676
|
+
|
|
16677
|
+
# Format the value (could be a number, Column, or ReferenceColumn)
|
|
16678
|
+
if hasattr(agg_value, "__repr__"):
|
|
16679
|
+
# For Column or ReferenceColumn objects, use their repr
|
|
16680
|
+
value_str = repr(agg_value)
|
|
16681
|
+
else:
|
|
16682
|
+
value_str = str(agg_value)
|
|
16683
|
+
|
|
16684
|
+
# Format tolerance - only show on second line if non-zero
|
|
16685
|
+
if tol_value != 0:
|
|
16686
|
+
# Format tolerance based on its type
|
|
16687
|
+
if isinstance(tol_value, tuple):
|
|
16688
|
+
# Asymmetric bounds: (lower, upper)
|
|
16689
|
+
tol_str = f"tol=({tol_value[0]}, {tol_value[1]})"
|
|
16690
|
+
else:
|
|
16691
|
+
# Symmetric tolerance
|
|
16692
|
+
tol_str = f"tol={tol_value}"
|
|
16693
|
+
values_upd.append(f"{value_str}<br/>{tol_str}")
|
|
16694
|
+
else:
|
|
16695
|
+
values_upd.append(value_str)
|
|
16696
|
+
|
|
15892
16697
|
# If the assertion type is not recognized, add the value as a string
|
|
15893
16698
|
else: # pragma: no cover
|
|
15894
16699
|
values_upd.append(str(value)) # pragma: no cover
|
|
@@ -16327,6 +17132,15 @@ class Validate:
|
|
|
16327
17132
|
if incl_footer_timings:
|
|
16328
17133
|
gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
|
|
16329
17134
|
|
|
17135
|
+
# Add governance metadata as source note if any metadata is present
|
|
17136
|
+
governance_html = _create_governance_metadata_html(
|
|
17137
|
+
owner=self.owner,
|
|
17138
|
+
consumers=self.consumers,
|
|
17139
|
+
version=self.version,
|
|
17140
|
+
)
|
|
17141
|
+
if governance_html:
|
|
17142
|
+
gt_tbl = gt_tbl.tab_source_note(source_note=html(governance_html))
|
|
17143
|
+
|
|
16330
17144
|
# Create notes markdown from validation steps and add as separate source note if enabled
|
|
16331
17145
|
if incl_footer_notes:
|
|
16332
17146
|
notes_markdown = _create_notes_html(self.validation_info)
|
|
@@ -16675,6 +17489,18 @@ class Validate:
|
|
|
16675
17489
|
debug_return_df=debug_return_df,
|
|
16676
17490
|
)
|
|
16677
17491
|
|
|
17492
|
+
elif is_valid_agg(assertion_type):
|
|
17493
|
+
step_report = _step_report_aggregate(
|
|
17494
|
+
assertion_type=assertion_type,
|
|
17495
|
+
i=i,
|
|
17496
|
+
column=column,
|
|
17497
|
+
values=values,
|
|
17498
|
+
all_passed=all_passed,
|
|
17499
|
+
val_info=val_info,
|
|
17500
|
+
header=header,
|
|
17501
|
+
lang=lang,
|
|
17502
|
+
)
|
|
17503
|
+
|
|
16678
17504
|
else:
|
|
16679
17505
|
step_report = None # pragma: no cover
|
|
16680
17506
|
|
|
@@ -16738,7 +17564,7 @@ class Validate:
|
|
|
16738
17564
|
table = validation.pre(self.data)
|
|
16739
17565
|
|
|
16740
17566
|
# Get the columns from the table as a list
|
|
16741
|
-
columns = list(table.columns)
|
|
17567
|
+
columns = list(table.columns) # type: ignore[union-attr]
|
|
16742
17568
|
|
|
16743
17569
|
# Evaluate the column expression
|
|
16744
17570
|
if isinstance(column_expr, ColumnSelectorNarwhals):
|
|
@@ -17116,7 +17942,7 @@ def _convert_string_to_datetime(value: str) -> datetime.datetime:
|
|
|
17116
17942
|
return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
|
|
17117
17943
|
|
|
17118
17944
|
|
|
17119
|
-
def _string_date_dttm_conversion(value: any) -> any:
|
|
17945
|
+
def _string_date_dttm_conversion(value: Any) -> Any:
|
|
17120
17946
|
"""
|
|
17121
17947
|
Convert a string to a date or datetime object if it is in the correct format.
|
|
17122
17948
|
If the value is not a string, it is returned as is.
|
|
@@ -17151,8 +17977,8 @@ def _string_date_dttm_conversion(value: any) -> any:
|
|
|
17151
17977
|
|
|
17152
17978
|
|
|
17153
17979
|
def _conditional_string_date_dttm_conversion(
|
|
17154
|
-
value: any, allow_regular_strings: bool = False
|
|
17155
|
-
) -> any:
|
|
17980
|
+
value: Any, allow_regular_strings: bool = False
|
|
17981
|
+
) -> Any:
|
|
17156
17982
|
"""
|
|
17157
17983
|
Conditionally convert a string to a date or datetime object if it is in the correct format. If
|
|
17158
17984
|
`allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
|
|
@@ -17196,9 +18022,9 @@ def _process_brief(
|
|
|
17196
18022
|
brief: str | None,
|
|
17197
18023
|
step: int,
|
|
17198
18024
|
col: str | list[str] | None,
|
|
17199
|
-
values:
|
|
17200
|
-
thresholds:
|
|
17201
|
-
segment:
|
|
18025
|
+
values: Any | None,
|
|
18026
|
+
thresholds: Any | None,
|
|
18027
|
+
segment: Any | None,
|
|
17202
18028
|
) -> str:
|
|
17203
18029
|
# If there is no brief, return `None`
|
|
17204
18030
|
if brief is None:
|
|
@@ -17271,6 +18097,265 @@ def _process_brief(
|
|
|
17271
18097
|
return brief
|
|
17272
18098
|
|
|
17273
18099
|
|
|
18100
|
+
def _parse_max_age(max_age: str | datetime.timedelta) -> datetime.timedelta:
|
|
18101
|
+
"""
|
|
18102
|
+
Parse a max_age specification into a timedelta.
|
|
18103
|
+
|
|
18104
|
+
Parameters
|
|
18105
|
+
----------
|
|
18106
|
+
max_age
|
|
18107
|
+
Either a timedelta object or a string like "24 hours", "1 day", "30 minutes",
|
|
18108
|
+
or compound expressions like "2 hours 15 minutes", "1 day 6 hours", etc.
|
|
18109
|
+
|
|
18110
|
+
Returns
|
|
18111
|
+
-------
|
|
18112
|
+
datetime.timedelta
|
|
18113
|
+
The parsed timedelta.
|
|
18114
|
+
|
|
18115
|
+
Raises
|
|
18116
|
+
------
|
|
18117
|
+
ValueError
|
|
18118
|
+
If the string format is invalid or the unit is not recognized.
|
|
18119
|
+
"""
|
|
18120
|
+
if isinstance(max_age, datetime.timedelta):
|
|
18121
|
+
return max_age
|
|
18122
|
+
|
|
18123
|
+
if not isinstance(max_age, str):
|
|
18124
|
+
raise TypeError(
|
|
18125
|
+
f"The `max_age` parameter must be a string or timedelta, got {type(max_age).__name__}."
|
|
18126
|
+
)
|
|
18127
|
+
|
|
18128
|
+
# Parse string format like "24 hours", "1 day", "30 minutes", etc.
|
|
18129
|
+
max_age_str = max_age.strip().lower()
|
|
18130
|
+
|
|
18131
|
+
# Define unit mappings (singular and plural forms)
|
|
18132
|
+
unit_mappings = {
|
|
18133
|
+
"second": "seconds",
|
|
18134
|
+
"seconds": "seconds",
|
|
18135
|
+
"sec": "seconds",
|
|
18136
|
+
"secs": "seconds",
|
|
18137
|
+
"s": "seconds",
|
|
18138
|
+
"minute": "minutes",
|
|
18139
|
+
"minutes": "minutes",
|
|
18140
|
+
"min": "minutes",
|
|
18141
|
+
"mins": "minutes",
|
|
18142
|
+
"m": "minutes",
|
|
18143
|
+
"hour": "hours",
|
|
18144
|
+
"hours": "hours",
|
|
18145
|
+
"hr": "hours",
|
|
18146
|
+
"hrs": "hours",
|
|
18147
|
+
"h": "hours",
|
|
18148
|
+
"day": "days",
|
|
18149
|
+
"days": "days",
|
|
18150
|
+
"d": "days",
|
|
18151
|
+
"week": "weeks",
|
|
18152
|
+
"weeks": "weeks",
|
|
18153
|
+
"wk": "weeks",
|
|
18154
|
+
"wks": "weeks",
|
|
18155
|
+
"w": "weeks",
|
|
18156
|
+
}
|
|
18157
|
+
|
|
18158
|
+
import re
|
|
18159
|
+
|
|
18160
|
+
# Pattern to find all number+unit pairs (supports compound expressions)
|
|
18161
|
+
# Matches: "2 hours 15 minutes", "1day6h", "30 min", etc.
|
|
18162
|
+
compound_pattern = r"(\d+(?:\.\d+)?)\s*([a-zA-Z]+)"
|
|
18163
|
+
matches = re.findall(compound_pattern, max_age_str)
|
|
18164
|
+
|
|
18165
|
+
if not matches:
|
|
18166
|
+
raise ValueError(
|
|
18167
|
+
f"Invalid max_age format: '{max_age}'. Expected format like '24 hours', "
|
|
18168
|
+
f"'1 day', '30 minutes', '2 hours 15 minutes', etc."
|
|
18169
|
+
)
|
|
18170
|
+
|
|
18171
|
+
# Accumulate timedelta from all matched components
|
|
18172
|
+
total_td = datetime.timedelta()
|
|
18173
|
+
valid_units = ["seconds", "minutes", "hours", "days", "weeks"]
|
|
18174
|
+
|
|
18175
|
+
for value_str, unit in matches:
|
|
18176
|
+
value = float(value_str)
|
|
18177
|
+
|
|
18178
|
+
# Normalize the unit
|
|
18179
|
+
unit_lower = unit.lower()
|
|
18180
|
+
if unit_lower not in unit_mappings:
|
|
18181
|
+
raise ValueError(
|
|
18182
|
+
f"Unknown time unit '{unit}' in max_age '{max_age}'. "
|
|
18183
|
+
f"Valid units are: {', '.join(valid_units)} (or their abbreviations)."
|
|
18184
|
+
)
|
|
18185
|
+
|
|
18186
|
+
normalized_unit = unit_mappings[unit_lower]
|
|
18187
|
+
|
|
18188
|
+
# Add to total timedelta
|
|
18189
|
+
if normalized_unit == "seconds":
|
|
18190
|
+
total_td += datetime.timedelta(seconds=value)
|
|
18191
|
+
elif normalized_unit == "minutes":
|
|
18192
|
+
total_td += datetime.timedelta(minutes=value)
|
|
18193
|
+
elif normalized_unit == "hours":
|
|
18194
|
+
total_td += datetime.timedelta(hours=value)
|
|
18195
|
+
elif normalized_unit == "days":
|
|
18196
|
+
total_td += datetime.timedelta(days=value)
|
|
18197
|
+
elif normalized_unit == "weeks":
|
|
18198
|
+
total_td += datetime.timedelta(weeks=value)
|
|
18199
|
+
|
|
18200
|
+
return total_td
|
|
18201
|
+
|
|
18202
|
+
|
|
18203
|
+
def _parse_timezone(timezone: str) -> datetime.tzinfo:
|
|
18204
|
+
"""
|
|
18205
|
+
Parse a timezone string into a tzinfo object.
|
|
18206
|
+
|
|
18207
|
+
Supports:
|
|
18208
|
+
- IANA timezone names: "America/New_York", "Europe/London", "UTC"
|
|
18209
|
+
- Offset strings: "-7", "+5", "-07:00", "+05:30"
|
|
18210
|
+
|
|
18211
|
+
Parameters
|
|
18212
|
+
----------
|
|
18213
|
+
timezone
|
|
18214
|
+
The timezone string to parse.
|
|
18215
|
+
|
|
18216
|
+
Returns
|
|
18217
|
+
-------
|
|
18218
|
+
datetime.tzinfo
|
|
18219
|
+
The parsed timezone object.
|
|
18220
|
+
|
|
18221
|
+
Raises
|
|
18222
|
+
------
|
|
18223
|
+
ValueError
|
|
18224
|
+
If the timezone is not valid.
|
|
18225
|
+
"""
|
|
18226
|
+
import re
|
|
18227
|
+
|
|
18228
|
+
# Check for offset formats: "-7", "+5", "-07:00", "+05:30", etc.
|
|
18229
|
+
# Match: optional sign, 1-2 digits, optional colon and 2 more digits
|
|
18230
|
+
offset_pattern = r"^([+-]?)(\d{1,2})(?::(\d{2}))?$"
|
|
18231
|
+
match = re.match(offset_pattern, timezone.strip())
|
|
18232
|
+
|
|
18233
|
+
if match:
|
|
18234
|
+
sign_str, hours_str, minutes_str = match.groups()
|
|
18235
|
+
hours = int(hours_str)
|
|
18236
|
+
minutes = int(minutes_str) if minutes_str else 0
|
|
18237
|
+
|
|
18238
|
+
# Apply sign (default positive if not specified)
|
|
18239
|
+
total_minutes = hours * 60 + minutes
|
|
18240
|
+
if sign_str == "-":
|
|
18241
|
+
total_minutes = -total_minutes
|
|
18242
|
+
|
|
18243
|
+
return datetime.timezone(datetime.timedelta(minutes=total_minutes))
|
|
18244
|
+
|
|
18245
|
+
# Try IANA timezone names (zoneinfo is standard in Python 3.9+)
|
|
18246
|
+
try:
|
|
18247
|
+
return ZoneInfo(timezone)
|
|
18248
|
+
except KeyError:
|
|
18249
|
+
pass
|
|
18250
|
+
|
|
18251
|
+
raise ValueError(
|
|
18252
|
+
f"Invalid timezone: '{timezone}'. Use an IANA timezone name "
|
|
18253
|
+
f"(e.g., 'America/New_York', 'UTC') or an offset (e.g., '-7', '+05:30')."
|
|
18254
|
+
)
|
|
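Illustrative calls covering both branches of the grammar above (the fixed-offset regex and the `ZoneInfo` fallback):

```python
print(_parse_timezone("-7"))                # UTC-07:00 (fixed offset)
print(_parse_timezone("+05:30"))            # UTC+05:30
print(_parse_timezone("America/New_York"))  # America/New_York (IANA zone)
```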
18255
|
+
|
|
18256
|
+
|
|
18257
|
+
def _validate_timezone(timezone: str) -> None:
|
|
18258
|
+
"""
|
|
18259
|
+
Validate that a timezone string is valid.
|
|
18260
|
+
|
|
18261
|
+
Parameters
|
|
18262
|
+
----------
|
|
18263
|
+
timezone
|
|
18264
|
+
The timezone string to validate.
|
|
18265
|
+
|
|
18266
|
+
Raises
|
|
18267
|
+
------
|
|
18268
|
+
ValueError
|
|
18269
|
+
If the timezone is not valid.
|
|
18270
|
+
"""
|
|
18271
|
+
# Use _parse_timezone to validate - it will raise ValueError if invalid
|
|
18272
|
+
_parse_timezone(timezone)
|
|
18273
|
+
|
|
18274
|
+
|
|
18275
|
+
def _parse_reference_time(reference_time: str) -> datetime.datetime:
|
|
18276
|
+
"""
|
|
18277
|
+
Parse a reference time string into a datetime object.
|
|
18278
|
+
|
|
18279
|
+
Parameters
|
|
18280
|
+
----------
|
|
18281
|
+
reference_time
|
|
18282
|
+
An ISO 8601 formatted datetime string.
|
|
18283
|
+
|
|
18284
|
+
Returns
|
|
18285
|
+
-------
|
|
18286
|
+
datetime.datetime
|
|
18287
|
+
The parsed datetime object.
|
|
18288
|
+
|
|
18289
|
+
Raises
|
|
18290
|
+
------
|
|
18291
|
+
ValueError
|
|
18292
|
+
If the string cannot be parsed.
|
|
18293
|
+
"""
|
|
18294
|
+
# Try parsing with fromisoformat (handles most ISO 8601 formats)
|
|
18295
|
+
try:
|
|
18296
|
+
return datetime.datetime.fromisoformat(reference_time)
|
|
18297
|
+
except ValueError:
|
|
18298
|
+
pass
|
|
18299
|
+
|
|
18300
|
+
# Try parsing common formats
|
|
18301
|
+
formats = [
|
|
18302
|
+
"%Y-%m-%d %H:%M:%S",
|
|
18303
|
+
"%Y-%m-%d %H:%M:%S%z",
|
|
18304
|
+
"%Y-%m-%dT%H:%M:%S",
|
|
18305
|
+
"%Y-%m-%dT%H:%M:%S%z",
|
|
18306
|
+
"%Y-%m-%d",
|
|
18307
|
+
]
|
|
18308
|
+
|
|
18309
|
+
for fmt in formats:
|
|
18310
|
+
try:
|
|
18311
|
+
return datetime.datetime.strptime(reference_time, fmt)
|
|
18312
|
+
except ValueError:
|
|
18313
|
+
continue
|
|
18314
|
+
|
|
18315
|
+
raise ValueError(
|
|
18316
|
+
f"Could not parse reference_time '{reference_time}'. "
|
|
18317
|
+
f"Please use ISO 8601 format like '2024-01-15T10:30:00' or '2024-01-15T10:30:00+00:00'."
|
|
18318
|
+
)
|
|
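Two quick examples of accepted inputs; both are handled by `fromisoformat` directly, while the `strptime` fallbacks catch older space-separated variants:

```python
print(_parse_reference_time("2024-01-15T10:30:00+00:00"))  # offset-aware
print(_parse_reference_time("2024-01-15"))                 # naive, midnight
```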
18319
|
+
|
|
18320
|
+
|
|
18321
|
+
def _format_timedelta(td: datetime.timedelta) -> str:
|
|
18322
|
+
"""
|
|
18323
|
+
Format a timedelta into a human-readable string.
|
|
18324
|
+
|
|
18325
|
+
Parameters
|
|
18326
|
+
----------
|
|
18327
|
+
td
|
|
18328
|
+
The timedelta to format.
|
|
18329
|
+
|
|
18330
|
+
Returns
|
|
18331
|
+
-------
|
|
18332
|
+
str
|
|
18333
|
+
A compact human-readable string like "45.0s", "1.5h", "2d 5.0h", or "2w".
|
|
18334
|
+
"""
|
|
18335
|
+
total_seconds = td.total_seconds()
|
|
18336
|
+
|
|
18337
|
+
if total_seconds < 60:
|
|
18338
|
+
val = round(total_seconds, 1)
|
|
18339
|
+
return f"{val}s"
|
|
18340
|
+
elif total_seconds < 3600:
|
|
18341
|
+
val = round(total_seconds / 60, 1)
|
|
18342
|
+
return f"{val}m"
|
|
18343
|
+
elif total_seconds < 86400:
|
|
18344
|
+
val = round(total_seconds / 3600, 1)
|
|
18345
|
+
return f"{val}h"
|
|
18346
|
+
elif total_seconds < 604800:
|
|
18347
|
+
# For days, show "xd yh" format for better readability
|
|
18348
|
+
days = int(total_seconds // 86400)
|
|
18349
|
+
remaining_hours = round((total_seconds % 86400) / 3600, 1)
|
|
18350
|
+
if remaining_hours == 0:
|
|
18351
|
+
return f"{days}d"
|
|
18352
|
+
else:
|
|
18353
|
+
return f"{days}d {remaining_hours}h"
|
|
18354
|
+
else:
|
|
18355
|
+
val = round(total_seconds / 604800)
|
|
18356
|
+
return f"{val}w"
|
|
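Sample renderings from the thresholds above, showing the compact unit suffixes used in reports:

```python
import datetime

print(_format_timedelta(datetime.timedelta(seconds=45)))  # 45.0s
print(_format_timedelta(datetime.timedelta(minutes=90)))  # 1.5h
print(_format_timedelta(datetime.timedelta(hours=53)))    # 2d 5.0h
print(_format_timedelta(datetime.timedelta(weeks=2)))     # 2w
```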
18357
|
+
|
|
18358
|
+
|
|
17274
18359
|
def _transform_auto_brief(brief: str | bool | None) -> str | None:
|
|
17275
18360
|
if isinstance(brief, bool):
|
|
17276
18361
|
if brief:
|
|
@@ -17285,7 +18370,7 @@ def _process_action_str(
|
|
|
17285
18370
|
action_str: str,
|
|
17286
18371
|
step: int,
|
|
17287
18372
|
col: str | None,
|
|
17288
|
-
value: any,
|
|
18373
|
+
value: Any,
|
|
17289
18374
|
type: str,
|
|
17290
18375
|
level: str,
|
|
17291
18376
|
time: str,
|
|
@@ -17337,8 +18422,8 @@ def _process_action_str(
|
|
|
17337
18422
|
def _create_autobrief_or_failure_text(
|
|
17338
18423
|
assertion_type: str,
|
|
17339
18424
|
lang: str,
|
|
17340
|
-
column: str
|
|
17341
|
-
values:
|
|
18425
|
+
column: str,
|
|
18426
|
+
values: Any,
|
|
17342
18427
|
for_failure: bool,
|
|
17343
18428
|
locale: str | None = None,
|
|
17344
18429
|
n_rows: int | None = None,
|
|
@@ -17465,6 +18550,14 @@ def _create_autobrief_or_failure_text(
|
|
|
17465
18550
|
for_failure=for_failure,
|
|
17466
18551
|
)
|
|
17467
18552
|
|
|
18553
|
+
if assertion_type == "data_freshness":
|
|
18554
|
+
return _create_text_data_freshness(
|
|
18555
|
+
lang=lang,
|
|
18556
|
+
column=column,
|
|
18557
|
+
value=values,
|
|
18558
|
+
for_failure=for_failure,
|
|
18559
|
+
)
|
|
18560
|
+
|
|
17468
18561
|
if assertion_type == "col_pct_null":
|
|
17469
18562
|
return _create_text_col_pct_null(
|
|
17470
18563
|
lang=lang,
|
|
@@ -17490,7 +18583,7 @@ def _create_autobrief_or_failure_text(
|
|
|
17490
18583
|
for_failure=for_failure,
|
|
17491
18584
|
)
|
|
17492
18585
|
|
|
17493
|
-
return None
|
|
18586
|
+
return None
|
|
17494
18587
|
|
|
17495
18588
|
|
|
17496
18589
|
def _expect_failure_type(for_failure: bool) -> str:
|
|
@@ -17500,7 +18593,7 @@ def _expect_failure_type(for_failure: bool) -> str:
|
|
|
17500
18593
|
def _create_text_comparison(
|
|
17501
18594
|
assertion_type: str,
|
|
17502
18595
|
lang: str,
|
|
17503
|
-
column: str | list[str]
|
|
18596
|
+
column: str | list[str],
|
|
17504
18597
|
values: str | None,
|
|
17505
18598
|
for_failure: bool = False,
|
|
17506
18599
|
) -> str:
|
|
@@ -17526,7 +18619,7 @@ def _create_text_comparison(
|
|
|
17526
18619
|
|
|
17527
18620
|
def _create_text_between(
|
|
17528
18621
|
lang: str,
|
|
17529
|
-
column: str
|
|
18622
|
+
column: str,
|
|
17530
18623
|
value_1: str,
|
|
17531
18624
|
value_2: str,
|
|
17532
18625
|
not_: bool = False,
|
|
@@ -17556,7 +18649,7 @@ def _create_text_between(
|
|
|
17556
18649
|
|
|
17557
18650
|
|
|
17558
18651
|
def _create_text_set(
|
|
17559
|
-
lang: str, column: str
|
|
18652
|
+
lang: str, column: str, values: list[Any], not_: bool = False, for_failure: bool = False
|
|
17560
18653
|
) -> str:
|
|
17561
18654
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17562
18655
|
|
|
@@ -17578,9 +18671,7 @@ def _create_text_set(
|
|
|
17578
18671
|
return text
|
|
17579
18672
|
|
|
17580
18673
|
|
|
17581
|
-
def _create_text_null(
|
|
17582
|
-
lang: str, column: str | None, not_: bool = False, for_failure: bool = False
|
|
17583
|
-
) -> str:
|
|
18674
|
+
def _create_text_null(lang: str, column: str, not_: bool = False, for_failure: bool = False) -> str:
|
|
17584
18675
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17585
18676
|
|
|
17586
18677
|
column_text = _prep_column_text(column=column)
|
|
@@ -17597,9 +18688,7 @@ def _create_text_null(
|
|
|
17597
18688
|
return text
|
|
17598
18689
|
|
|
17599
18690
|
|
|
17600
|
-
def _create_text_regex(
|
|
17601
|
-
lang: str, column: str | None, pattern: str | dict, for_failure: bool = False
|
|
17602
|
-
) -> str:
|
|
18691
|
+
def _create_text_regex(lang: str, column: str, pattern: str, for_failure: bool = False) -> str:
|
|
17603
18692
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17604
18693
|
|
|
17605
18694
|
column_text = _prep_column_text(column=column)
|
|
@@ -17631,7 +18720,7 @@ def _create_text_expr(lang: str, for_failure: bool) -> str:
|
|
|
17631
18720
|
return EXPECT_FAIL_TEXT[f"col_vals_expr_{type_}_text"][lang]
|
|
17632
18721
|
|
|
17633
18722
|
|
|
17634
|
-
def _create_text_col_exists(lang: str, column: str
|
|
18723
|
+
def _create_text_col_exists(lang: str, column: str, for_failure: bool = False) -> str:
|
|
17635
18724
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17636
18725
|
|
|
17637
18726
|
column_text = _prep_column_text(column=column)
|
|
@@ -17681,7 +18770,7 @@ def _create_text_rows_complete(
|
|
|
17681
18770
|
return text
|
|
17682
18771
|
|
|
17683
18772
|
|
|
17684
|
-
def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str:
|
|
18773
|
+
def _create_text_row_count_match(lang: str, value: dict, for_failure: bool = False) -> str:
|
|
17685
18774
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17686
18775
|
|
|
17687
18776
|
values_text = _prep_values_text(value["count"], lang=lang)
|
|
@@ -17689,7 +18778,7 @@ def _create_text_row_count_match(lang: str, value: int, for_failure: bool = Fals
|
|
|
17689
18778
|
return EXPECT_FAIL_TEXT[f"row_count_match_n_{type_}_text"][lang].format(values_text=values_text)
|
|
17690
18779
|
|
|
17691
18780
|
|
|
17692
|
-
def _create_text_col_count_match(lang: str, value: int, for_failure: bool = False) -> str:
|
|
18781
|
+
def _create_text_col_count_match(lang: str, value: dict, for_failure: bool = False) -> str:
|
|
17693
18782
|
type_ = _expect_failure_type(for_failure=for_failure)
|
|
17694
18783
|
|
|
17695
18784
|
values_text = _prep_values_text(value["count"], lang=lang)
|
|
@@ -17697,6 +18786,33 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
|
|
|
17697
18786
|
return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
|
|
17698
18787
|
|
|
17699
18788
|
|
|
18789
|
+
def _create_text_data_freshness(
|
|
18790
|
+
lang: str,
|
|
18791
|
+
column: str | None,
|
|
18792
|
+
value: dict,
|
|
18793
|
+
for_failure: bool = False,
|
|
18794
|
+
) -> str:
|
|
18795
|
+
"""Create text for data_freshness validation."""
|
|
18796
|
+
type_ = _expect_failure_type(for_failure=for_failure)
|
|
18797
|
+
|
|
18798
|
+
column_text = _prep_column_text(column=column)
|
|
18799
|
+
max_age_text = _format_timedelta(value.get("max_age"))
|
|
18800
|
+
|
|
18801
|
+
if for_failure:
|
|
18802
|
+
age = value.get("age")
|
|
18803
|
+
age_text = _format_timedelta(age) if age else "unknown"
|
|
18804
|
+
return EXPECT_FAIL_TEXT[f"data_freshness_{type_}_text"][lang].format(
|
|
18805
|
+
column_text=column_text,
|
|
18806
|
+
max_age_text=max_age_text,
|
|
18807
|
+
age_text=age_text,
|
|
18808
|
+
)
|
|
18809
|
+
else:
|
|
18810
|
+
return EXPECT_FAIL_TEXT[f"data_freshness_{type_}_text"][lang].format(
|
|
18811
|
+
column_text=column_text,
|
|
18812
|
+
max_age_text=max_age_text,
|
|
18813
|
+
)
|
|
18814
|
+
|
|
18815
|
+
|
|
17700
18816
|
def _create_text_col_pct_null(
|
|
17701
18817
|
lang: str,
|
|
17702
18818
|
column: str | None,
|
|
@@ -17826,19 +18942,13 @@ def _create_text_prompt(lang: str, prompt: str, for_failure: bool = False) -> st
|
|
|
17826
18942
|
def _prep_column_text(column: str | list[str]) -> str:
|
|
17827
18943
|
if isinstance(column, list):
|
|
17828
18944
|
return "`" + str(column[0]) + "`"
|
|
17829
|
-
|
|
18945
|
+
if isinstance(column, str):
|
|
17830
18946
|
return "`" + column + "`"
|
|
17831
|
-
|
|
17832
|
-
return ""
|
|
18947
|
+
raise AssertionError
|
|
17833
18948
|
|
|
17834
18949
|
|
|
17835
18950
|
def _prep_values_text(
|
|
17836
|
-
values: str
|
|
17837
|
-
| int
|
|
17838
|
-
| float
|
|
17839
|
-
| datetime.datetime
|
|
17840
|
-
| datetime.date
|
|
17841
|
-
| list[str | int | float | datetime.datetime | datetime.date],
|
|
18951
|
+
values: _CompliantValue | _CompliantValues,
|
|
17842
18952
|
lang: str,
|
|
17843
18953
|
limit: int = 3,
|
|
17844
18954
|
) -> str:
|
|
@@ -17886,7 +18996,7 @@ def _prep_values_text(
|
|
|
17886
18996
|
return values_str
|
|
17887
18997
|
|
|
17888
18998
|
|
|
17889
|
-
def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> tuple[str, str]:
|
|
18999
|
+
def _seg_expr_from_string(data_tbl: Any, segments_expr: str) -> tuple[str, str]:
|
|
17890
19000
|
"""
|
|
17891
19001
|
Obtain the segmentation categories from a table column.
|
|
17892
19002
|
|
|
@@ -17989,7 +19099,7 @@ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, Any]]:
|
|
|
17989
19099
|
return seg_tuples
|
|
17990
19100
|
|
|
17991
19101
|
|
|
17992
|
-
def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
|
|
19102
|
+
def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any:
|
|
17993
19103
|
"""
|
|
17994
19104
|
Apply the segments expression to the data table.
|
|
17995
19105
|
|
|
@@ -18053,8 +19163,26 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
|
|
|
18053
19163
|
except ValueError: # pragma: no cover
|
|
18054
19164
|
pass # pragma: no cover
|
|
18055
19165
|
|
|
18056
|
-
# Format 2: Datetime strings with UTC timezone like
|
|
18057
|
-
# "2016-01-04 00:00:01 UTC.strict_cast(...)"
|
|
19166
|
+
# Format 2: Direct datetime strings like "2016-01-04 00:00:01" (Polars 1.36+)
|
|
19167
|
+
# These don't have UTC suffix anymore
|
|
19168
|
+
elif (
|
|
19169
|
+
" " in segment_str
|
|
19170
|
+
and "UTC" not in segment_str
|
|
19171
|
+
and "[" not in segment_str
|
|
19172
|
+
and ".alias" not in segment_str
|
|
19173
|
+
):
|
|
19174
|
+
try:
|
|
19175
|
+
parsed_dt = datetime.fromisoformat(segment_str)
|
|
19176
|
+
# Convert midnight datetimes to dates for consistency
|
|
19177
|
+
if parsed_dt.time() == datetime.min.time():
|
|
19178
|
+
parsed_value = parsed_dt.date() # pragma: no cover
|
|
19179
|
+
else:
|
|
19180
|
+
parsed_value = parsed_dt
|
|
19181
|
+
except ValueError: # pragma: no cover
|
|
19182
|
+
pass # pragma: no cover
|
|
19183
|
+
|
|
19184
|
+
# Format 3: Datetime strings with UTC timezone like
|
|
19185
|
+
# "2016-01-04 00:00:01 UTC.strict_cast(...)" (Polars < 1.36)
|
|
18058
19186
|
elif " UTC" in segment_str:
|
|
18059
19187
|
try:
|
|
18060
19188
|
# Extract just the datetime part before "UTC"
|
|
@@ -18069,7 +19197,7 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
|
|
|
18069
19197
|
except (ValueError, IndexError): # pragma: no cover
|
|
18070
19198
|
pass # pragma: no cover
|
|
18071
19199
|
|
|
18072
|
-
# Format 3: Bracketed expressions like ['2016-01-04']
|
|
19200
|
+
# Format 4: Bracketed expressions like ['2016-01-04']
|
|
18073
19201
|
elif segment_str.startswith("[") and segment_str.endswith("]"):
|
|
18074
19202
|
try: # pragma: no cover
|
|
18075
19203
|
# Remove [' and ']
|
|
@@ -18204,8 +19332,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
|
|
|
18204
19332
|
|
|
18205
19333
|
def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
|
|
18206
19334
|
# For each icon, get the assertion icon SVG test from SVG_ICONS_FOR_ASSERTION_TYPES dictionary
|
|
18207
|
-
|
|
18208
|
-
icon_svg = [SVG_ICONS_FOR_ASSERTION_TYPES.get(icon) for icon in icon]
|
|
19335
|
+
icon_svg: list[str] = [SVG_ICONS_FOR_ASSERTION_TYPES[icon] for icon in icon]
|
|
18209
19336
|
|
|
18210
19337
|
# Replace the width and height in the SVG string
|
|
18211
19338
|
for i in range(len(icon_svg)):
|
|
@@ -18214,11 +19341,9 @@ def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
|
|
|
18214
19341
|
return icon_svg
|
|
18215
19342
|
|
|
18216
19343
|
|
|
18217
|
-
def _replace_svg_dimensions(svg:
|
|
19344
|
+
def _replace_svg_dimensions(svg: str, height_width: int | float) -> str:
|
|
18218
19345
|
svg = re.sub(r'width="[0-9]*?px', f'width="{height_width}px', svg)
|
|
18219
|
-
|
|
18220
|
-
|
|
18221
|
-
return svg
|
|
19346
|
+
return re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)
|
|
18222
19347
|
|
|
18223
19348
|
|
|
18224
19349
|
def _get_title_text(
|
|
@@ -18282,7 +19407,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s
|
|
|
18282
19407
|
return title_text
|
|
18283
19408
|
|
|
18284
19409
|
|
|
18285
|
-
def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]:
|
|
19410
|
+
def _transform_tbl_preprocessed(pre: Any, seg: Any, interrogation_performed: bool) -> list[str]:
|
|
18286
19411
|
# If no interrogation was performed, return a list of empty strings
|
|
18287
19412
|
if not interrogation_performed:
|
|
18288
19413
|
return ["" for _ in range(len(pre))]
|
|
@@ -18304,9 +19429,7 @@ def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: boo
|
|
|
18304
19429
|
|
|
18305
19430
|
def _get_preprocessed_table_icon(icon: list[str]) -> list[str]:
|
|
18306
19431
|
# For each icon, get the SVG icon from the SVG_ICONS_FOR_TBL_STATUS dictionary
|
|
18307
|
-
|
|
18308
|
-
|
|
18309
|
-
return icon_svg
|
|
19432
|
+
return [SVG_ICONS_FOR_TBL_STATUS[icon] for icon in icon]
|
|
18310
19433
|
|
|
18311
19434
|
|
|
18312
19435
|
def _transform_eval(
|
|
@@ -18384,9 +19507,9 @@ def _transform_test_units(
|
|
|
18384
19507
|
return _format_single_number_with_gt(
|
|
18385
19508
|
value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
|
|
18386
19509
|
)
|
|
18387
|
-
|
|
18388
|
-
|
|
18389
|
-
|
|
19510
|
+
formatted = vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)
|
|
19511
|
+
assert isinstance(formatted, list)
|
|
19512
|
+
return formatted[0]
|
|
18390
19513
|
|
|
18391
19514
|
return [
|
|
18392
19515
|
(
|
|
@@ -18590,22 +19713,21 @@ def _transform_assertion_str(
|
|
|
18590
19713
|
return type_upd
|
|
18591
19714
|
|
|
18592
19715
|
|
|
18593
|
-
def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str]:
|
|
19716
|
+
def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str] | None:
|
|
18594
19717
|
if isinstance(pre, Callable):
|
|
18595
19718
|
return _get_callable_source(fn=pre)
|
|
19719
|
+
return None
|
|
18596
19720
|
|
|
18597
19721
|
|
|
18598
19722
|
def _get_callable_source(fn: Callable) -> str:
|
|
18599
|
-
|
|
18600
|
-
|
|
18601
|
-
|
|
18602
|
-
|
|
18603
|
-
|
|
18604
|
-
|
|
18605
|
-
|
|
18606
|
-
|
|
18607
|
-
return fn.__name__
|
|
18608
|
-
return fn # pragma: no cover
|
|
19723
|
+
try:
|
|
19724
|
+
source_lines, _ = inspect.getsourcelines(fn)
|
|
19725
|
+
source = "".join(source_lines).strip()
|
|
19726
|
+
# Extract the `pre` argument from the source code
|
|
19727
|
+
pre_arg = _extract_pre_argument(source)
|
|
19728
|
+
return pre_arg
|
|
19729
|
+
except (OSError, TypeError): # pragma: no cover
|
|
19730
|
+
return fn.__name__ # ty: ignore
|
|
18609
19731
|
|
|
18610
19732
|
|
|
18611
19733
|
def _extract_pre_argument(source: str) -> str:
|
|
@@ -18625,12 +19747,78 @@ def _extract_pre_argument(source: str) -> str:
|
|
|
18625
19747
|
return pre_arg
|
|
18626
19748
|
|
|
18627
19749
|
|
|
19750
|
+
def _create_governance_metadata_html(
|
|
19751
|
+
owner: str | None,
|
|
19752
|
+
consumers: list[str] | None,
|
|
19753
|
+
version: str | None,
|
|
19754
|
+
) -> str:
|
|
19755
|
+
"""
|
|
19756
|
+
Create HTML for governance metadata display in the report footer.
|
|
19757
|
+
|
|
19758
|
+
Parameters
|
|
19759
|
+
----------
|
|
19760
|
+
owner
|
|
19761
|
+
The owner of the data being validated.
|
|
19762
|
+
consumers
|
|
19763
|
+
List of consumers who depend on the data.
|
|
19764
|
+
version
|
|
19765
|
+
The version of the validation plan.
|
|
19766
|
+
|
|
19767
|
+
Returns
|
|
19768
|
+
-------
|
|
19769
|
+
str
|
|
19770
|
+
HTML string containing formatted governance metadata, or empty string if no metadata.
|
|
19771
|
+
"""
|
|
19772
|
+
if owner is None and consumers is None and version is None:
|
|
19773
|
+
return ""
|
|
19774
|
+
|
|
19775
|
+
metadata_parts = []
|
|
19776
|
+
|
|
19777
|
+
# Common style for the metadata badges (similar to timing style but slightly smaller font)
|
|
19778
|
+
badge_style = (
|
|
19779
|
+
"background-color: #FFF; color: #444; padding: 0.5em 0.5em; position: inherit; "
|
|
19780
|
+
"margin-right: 5px; border: solid 1px #999999; font-variant-numeric: tabular-nums; "
|
|
19781
|
+
"border-radius: 0; padding: 2px 10px 2px 10px; font-size: 11px;"
|
|
19782
|
+
)
|
|
19783
|
+
label_style = (
|
|
19784
|
+
"color: #777; font-weight: bold; font-size: 9px; text-transform: uppercase; "
|
|
19785
|
+
"margin-right: 3px;"
|
|
19786
|
+
)
|
|
19787
|
+
|
|
19788
|
+
if owner is not None:
|
|
19789
|
+
metadata_parts.append(
|
|
19790
|
+
f"<span style='{badge_style}'><span style='{label_style}'>Owner:</span> {owner}</span>"
|
|
19791
|
+
)
|
|
19792
|
+
|
|
19793
|
+
if consumers is not None and len(consumers) > 0:
|
|
19794
|
+
consumers_str = ", ".join(consumers)
|
|
19795
|
+
metadata_parts.append(
|
|
19796
|
+
f"<span style='{badge_style}'>"
|
|
19797
|
+
f"<span style='{label_style}'>Consumers:</span> {consumers_str}"
|
|
19798
|
+
f"</span>"
|
|
19799
|
+
)
|
|
19800
|
+
|
|
19801
|
+
if version is not None:
|
|
19802
|
+
metadata_parts.append(
|
|
19803
|
+
f"<span style='{badge_style}'>"
|
|
19804
|
+
f"<span style='{label_style}'>Version:</span> {version}"
|
|
19805
|
+
f"</span>"
|
|
19806
|
+
)
|
|
19807
|
+
|
|
19808
|
+
return (
|
|
19809
|
+
f"<div style='margin-top: 5px; margin-bottom: 5px; margin-left: 10px;'>"
|
|
19810
|
+
f"{''.join(metadata_parts)}"
|
|
19811
|
+
f"</div>"
|
|
19812
|
+
)
|
|
19813
|
+
|
|
19814
|
+
|
|
18628
19815
|
def _create_table_time_html(
|
|
18629
19816
|
time_start: datetime.datetime | None, time_end: datetime.datetime | None
|
|
18630
19817
|
) -> str:
|
|
18631
19818
|
if time_start is None:
|
|
18632
19819
|
return ""
|
|
18633
19820
|
|
|
19821
|
+
assert time_end is not None # typing
|
|
18634
19822
|
# Get the time duration (difference between `time_end` and `time_start`) in seconds
|
|
18635
19823
|
time_duration = (time_end - time_start).total_seconds()
|
|
18636
19824
|
|
|
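The new `_create_governance_metadata_html()` helper is pure string templating, so it can be exercised directly. A minimal sketch, assuming the private name stays importable and that the owner/consumers/version values are the ones threaded through from the validation plan:

```python
# Hypothetical direct call to the private helper added above; in normal use
# these values would come from the Validate object's governance metadata.
from pointblank.validate import _create_governance_metadata_html

html_snippet = _create_governance_metadata_html(
    owner="data-eng",
    consumers=["reporting", "ml-platform"],
    version="1.4.0",
)

# One <div> wrapper containing three styled <span> badges:
# Owner, Consumers, and Version (empty string if all inputs are None).
print(html_snippet)
```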
@@ -18845,11 +20033,11 @@ def _format_number_safe(
         locale=locale,
         df_lib=df_lib,
     )
-
-
-
-
-
+    ints = fmt_number(
+        value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
+    )
+    assert isinstance(ints, list)
+    return ints[0]


 def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
@@ -18862,9 +20050,10 @@ def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
     if df_lib is not None and value is not None:
         # Use GT-based formatting to avoid Pandas dependency completely
        return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
-
-
-
+
+    ints = fmt_integer(value, locale=locale)
+    assert isinstance(ints, list)
+    return ints[0]


 def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
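Both formatting helpers now fall back to the `great_tables.vals` formatters, which return a list of formatted strings even for a single input value; that is what the new `assert isinstance(ints, list)` plus `ints[0]` pattern accounts for. A small sketch (the expected outputs in the comments are assumptions for the default `en` locale):

```python
from great_tables.vals import fmt_integer, fmt_number

# vals-level formatters accept a scalar but hand back a list of strings,
# so the fallback paths above index element [0] after the assertion.
print(fmt_integer(1234567, locale="en"))         # likely ['1,234,567']
print(fmt_number(0.5, decimals=2, locale="en"))  # likely ['0.50']
```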
@@ -18980,7 +20169,7 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
         HTML string containing the formatted threshold information.
     """
     if thresholds == Thresholds():
-        return ""
+        return ""  # pragma: no cover

     # Get df_lib for formatting
     df_lib = None
@@ -18988,10 +20177,10 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
         import polars as pl

         df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover

-        df_lib = pd
+        df_lib = pd  # pragma: no cover

     # Helper function to format threshold values using the shared formatting functions
     def _format_threshold_value(fraction: float | None, count: int | None) -> str:
@@ -18999,10 +20188,12 @@ def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en"
         # Format as fraction/percentage with locale formatting
         if fraction == 0:
             return "0"
-        elif fraction < 0.01:
+        elif fraction < 0.01:  # pragma: no cover
             # For very small fractions, show "<0.01" with locale formatting
-            formatted = _format_number_safe(
-
+            formatted = _format_number_safe(
+                0.01, decimals=2, locale=locale, df_lib=df_lib
+            )  # pragma: no cover
+            return f"<{formatted}"  # pragma: no cover
         else:
             # Use shared formatting function with drop_trailing_zeros
             formatted = _format_number_safe(
@@ -19079,14 +20270,14 @@ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
         if fraction is not None:
             if fraction == 0:
                 return "0"
-            elif fraction < 0.01:
-                return "<0.01"
+            elif fraction < 0.01:  # pragma: no cover
+                return "<0.01"  # pragma: no cover
             else:
                 return f"{fraction:.2f}".rstrip("0").rstrip(".")
         elif count is not None:
             return str(count)
         else:
-            return "—"
+            return "—"  # pragma: no cover

     parts = []

@@ -19105,7 +20296,7 @@ def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
     if parts:
         return "Step-specific thresholds set: " + ", ".join(parts)
     else:
-        return ""
+        return ""  # pragma: no cover


 def _create_threshold_reset_note_html(locale: str = "en") -> str:
@@ -19654,13 +20845,13 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") ->
             f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
         )
     else:
-        summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.'
+        summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.'  # pragma: no cover

     # Generate the step report table using the existing function
     # We'll call either _step_report_schema_in_order or _step_report_schema_any_order
     # depending on the in_order parameter
-    if in_order:
-        step_report_gt = _step_report_schema_in_order(
+    if in_order:  # pragma: no cover
+        step_report_gt = _step_report_schema_in_order(  # pragma: no cover
             step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
         )
     else:
@@ -19691,7 +20882,7 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") ->
     """

     # Add the settings as an additional source note to the step report
-    step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html))
+    step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html))  # type: ignore[union-attr]

     # Extract the HTML from the GT object
     step_report_html = step_report_gt._repr_html_()
@@ -19743,12 +20934,12 @@ def _step_report_row_based(
     column: str,
     column_position: int,
     columns_subset: list[str] | None,
-    values:
+    values: Any,
     inclusive: tuple[bool, bool] | None,
     n: int,
     n_failed: int,
     all_passed: bool,
-    extract:
+    extract: Any,
     tbl_preview: GT,
     header: str,
     limit: int | None,
@@ -19775,10 +20966,12 @@ def _step_report_row_based(
     elif assertion_type == "col_vals_le":
         text = f"{column} ≤ {values}"
     elif assertion_type == "col_vals_between":
+        assert inclusive is not None
         symbol_left = "≤" if inclusive[0] else "<"
         symbol_right = "≤" if inclusive[1] else "<"
         text = f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}"
     elif assertion_type == "col_vals_outside":
+        assert inclusive is not None
         symbol_left = "<" if inclusive[0] else "≤"
         symbol_right = ">" if inclusive[1] else "≥"
         text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}"
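The between/outside branches above derive the comparison symbols from the `inclusive` tuple, which is why the new `assert inclusive is not None` guards were added before indexing it. A quick sketch of the same truth table for the between case (local function name is for illustration only):

```python
# Sketch of the symbol selection used in the between-style assertion text.
def between_text(column: str, values: tuple, inclusive: tuple[bool, bool]) -> str:
    symbol_left = "≤" if inclusive[0] else "<"
    symbol_right = "≤" if inclusive[1] else "<"
    return f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}"


print(between_text("a", (1, 5), (True, False)))   # 1 ≤ a < 5
print(between_text("a", (1, 5), (False, False)))  # 1 < a < 5
```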
@@ -19999,7 +21192,7 @@ def _step_report_rows_distinct(
     n: int,
     n_failed: int,
     all_passed: bool,
-    extract:
+    extract: Any,
     tbl_preview: GT,
     header: str,
     limit: int | None,
@@ -20125,9 +21318,299 @@ def _step_report_rows_distinct(
     return step_report


+def _step_report_aggregate(
+    assertion_type: str,
+    i: int,
+    column: str,
+    values: dict,
+    all_passed: bool,
+    val_info: dict | None,
+    header: str,
+    lang: str,
+) -> GT:
+    """
+    Generate a step report for aggregate validation methods (col_sum_*, col_avg_*, col_sd_*).
+
+    This creates a 1-row table showing the computed aggregate value vs. the target value,
+    along with tolerance and pass/fail status.
+    """
+
+    # Determine whether the `lang` value represents a right-to-left language
+    is_rtl_lang = lang in RTL_LANGUAGES
+    direction_rtl = " direction: rtl;" if is_rtl_lang else ""
+
+    # Parse assertion type to get aggregate function and comparison operator
+    # Format: col_{agg}_{comp} (e.g., col_sum_eq, col_avg_gt, col_sd_le)
+    parts = assertion_type.split("_")
+    agg_type = parts[1]  # sum, avg, sd
+    comp_type = parts[2]  # eq, gt, ge, lt, le
+
+    # Map aggregate type to display name
+    agg_display = {"sum": "SUM", "avg": "AVG", "sd": "SD"}.get(agg_type, agg_type.upper())
+
+    # Map comparison type to symbol
+    comp_symbols = {
+        "eq": "=",
+        "gt": ">",
+        "ge": "≥",
+        "lt": "<",
+        "le": "≤",
+    }
+    comp_symbol = comp_symbols.get(comp_type, comp_type)
+
+    # Get computed values from val_info (stored during interrogation)
+    if val_info is not None:
+        actual = val_info.get("actual", None)
+        target = val_info.get("target", None)
+        tol = val_info.get("tol", 0)
+        lower_bound = val_info.get("lower_bound", target)
+        upper_bound = val_info.get("upper_bound", target)
+    else:
+        # Fallback if val_info is not available
+        actual = None
+        target = values.get("value", None)
+        tol = values.get("tol", 0)
+        lower_bound = target
+        upper_bound = target
+
+    # Format column name for display (handle list vs string)
+    if isinstance(column, list):
+        column_display = column[0] if len(column) == 1 else ", ".join(column)
+    else:
+        column_display = str(column)
+
+    # Generate assertion text for header
+    if target is not None:
+        target_display = f"{target:,.6g}" if isinstance(target, float) else f"{target:,}"
+        assertion_text = f"{agg_display}({column_display}) {comp_symbol} {target_display}"
+    else:
+        assertion_text = f"{agg_display}({column_display}) {comp_symbol} ?"
+
+    # Calculate difference from boundary
+    if actual is not None and target is not None:
+        if comp_type == "eq":
+            # For equality, show distance from target (considering tolerance)
+            if lower_bound == upper_bound:
+                difference = actual - target
+            else:
+                # With tolerance, show distance from nearest bound
+                if actual < lower_bound:
+                    difference = actual - lower_bound
+                elif actual > upper_bound:
+                    difference = actual - upper_bound
+                else:
+                    difference = 0  # Within bounds
+        elif comp_type in ["gt", "ge"]:
+            # Distance from lower bound (positive if passing)
+            difference = actual - lower_bound
+        elif comp_type in ["lt", "le"]:
+            # Distance from upper bound (negative if passing)
+            difference = actual - upper_bound
+        else:
+            difference = actual - target
+    else:
+        difference = None
+
+    # Format values for display
+    def format_value(v):
+        if v is None:
+            return "—"
+        if isinstance(v, float):
+            return f"{v:,.6g}"
+        return f"{v:,}"
+
+    # Format tolerance for display
+    if tol == 0:
+        tol_display = "—"
+    elif isinstance(tol, tuple):
+        tol_display = f"(-{tol[0]}, +{tol[1]})"
+    else:
+        tol_display = f"±{tol}"
+
+    # Format difference with sign
+    if difference is not None:
+        if difference == 0:
+            diff_display = "0"
+        elif difference > 0:
+            diff_display = (
+                f"+{difference:,.6g}" if isinstance(difference, float) else f"+{difference:,}"
+            )
+        else:
+            diff_display = (
+                f"{difference:,.6g}" if isinstance(difference, float) else f"{difference:,}"
+            )
+    else:
+        diff_display = "—"
+
+    # Create pass/fail indicator
+    if all_passed:
+        status_html = CHECK_MARK_SPAN
+        status_color = "#4CA64C"
+    else:
+        status_html = CROSS_MARK_SPAN
+        status_color = "#CF142B"
+
+    # Select DataFrame library (prefer Polars, fall back to Pandas)
+    if _is_lib_present("polars"):
+        import polars as pl
+
+        df_lib = pl
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
+
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for step report generation"
+        )  # pragma: no cover
+
+    # Create the data for the 1-row table
+    report_data = df_lib.DataFrame(
+        {
+            "actual": [format_value(actual)],
+            "target": [format_value(target)],
+            "tolerance": [tol_display],
+            "difference": [diff_display],
+            "status": [status_html],
+        }
+    )
+
+    # Create GT table with styling matching preview() and other step reports
+    step_report = (
+        GT(report_data, id="pb_step_tbl")
+        .opt_table_font(font=google_font(name="IBM Plex Sans"))
+        .opt_align_table_header(align="left")
+        .cols_label(
+            actual="ACTUAL",
+            target="EXPECTED",
+            tolerance="TOL",
+            difference="DIFFERENCE",
+            status="",
+        )
+        .cols_align(align="center")
+        .fmt_markdown(columns=["actual", "target", "tolerance", "difference", "status"])
+        .tab_style(
+            style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="13px"),
+            locations=loc.body(columns=["actual", "target", "tolerance", "difference"]),
+        )
+        .tab_style(
+            style=style.text(size="13px"),
+            locations=loc.body(columns="status"),
+        )
+        .tab_style(
+            style=style.text(color="gray20", font=google_font(name="IBM Plex Mono"), size="12px"),
+            locations=loc.column_labels(),
+        )
+        .tab_style(
+            style=style.borders(
+                sides=["top", "bottom"], color="#E9E9E9", style="solid", weight="1px"
+            ),
+            locations=loc.body(),
+        )
+        .tab_options(
+            table_body_vlines_style="solid",
+            table_body_vlines_width="1px",
+            table_body_vlines_color="#E9E9E9",
+            column_labels_vlines_style="solid",
+            column_labels_vlines_width="1px",
+            column_labels_vlines_color="#F2F2F2",
+        )
+        .cols_width(
+            cases={
+                "actual": "200px",
+                "target": "200px",
+                "tolerance": "150px",
+                "difference": "200px",
+                "status": "50px",
+            }
+        )
+    )
+
+    # Apply styling based on pass/fail
+    if all_passed:
+        step_report = step_report.tab_style(
+            style=[
+                style.text(color="#006400"),
+                style.fill(color="#4CA64C33"),
+            ],
+            locations=loc.body(columns="status"),
+        )
+    else:
+        step_report = step_report.tab_style(
+            style=[
+                style.text(color="#B22222"),
+                style.fill(color="#FFC1C159"),
+            ],
+            locations=loc.body(columns="status"),
+        )
+
+    # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing
+    if version("great_tables") >= "0.17.0":
+        step_report = step_report.tab_options(quarto_disable_processing=True)
+
+    # If no header requested, return the table as-is
+    if header is None:
+        return step_report
+
+    # Create header content
+    assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
+
+    # Wrap assertion text in styled code tag
+    assertion_code = (
+        f"<code style='color: #303030; font-family: monospace; font-size: smaller;'>"
+        f"{assertion_text}</code>"
+    )
+
+    if all_passed:
+        title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
+        result_stmt = STEP_REPORT_TEXT.get("agg_success_statement", {}).get(
+            lang,
+            f"The aggregate value for column <code>{column_display}</code> satisfies the condition.",
+        )
+        if isinstance(result_stmt, str) and "{column}" in result_stmt:
+            result_stmt = result_stmt.format(column=column_display)
+    else:
+        title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CROSS_MARK_SPAN
+        result_stmt = STEP_REPORT_TEXT.get("agg_failure_statement", {}).get(
+            lang,
+            f"The aggregate value for column <code>{column_display}</code> does not satisfy the condition.",
+        )
+        if isinstance(result_stmt, str) and "{column}" in result_stmt:
+            result_stmt = result_stmt.format(column=column_display)
+
+    details = (
+        f"<div style='font-size: 13.6px; {direction_rtl}'>"
+        "<div style='padding-top: 7px;'>"
+        f"{assertion_header_text} <span style='border-style: solid; border-width: thin; "
+        "border-color: lightblue; padding-left: 2px; padding-right: 2px;'>"
+        "<code style='color: #303030; background-color: transparent; "
+        f"position: relative; bottom: 1px;'>{assertion_code}</code></span>"
+        "</div>"
+        "<div style='padding-top: 7px;'>"
+        f"{result_stmt}"
+        "</div>"
+        "</div>"
+    )
+
+    # Generate the default template text for the header when `":default:"` is used
+    if header == ":default:":
+        header = "{title}{details}"
+
+    # Use commonmark to convert the header text to HTML
+    header = commonmark.commonmark(header)
+
+    # Place any templated text in the header
+    header = header.format(title=title, details=details)
+
+    # Create the header with `header` string
+    step_report = step_report.tab_header(title=md(header))
+
+    return step_report
+
+
 def _step_report_schema_in_order(
-    step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
-) -> GT |
+    step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
+) -> GT | Any:
     """
     This is the case for schema validation where the schema is supposed to have the same column
     order as the target table.
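The DIFFERENCE column in the new aggregate step report is measured against the tolerance-adjusted boundary rather than the raw target. A standalone sketch of that branch logic, using local names that are not part of the package API:

```python
# Sketch of the difference-from-boundary logic in _step_report_aggregate().
def boundary_difference(comp_type, actual, target, lower_bound, upper_bound):
    if comp_type == "eq":
        if lower_bound == upper_bound:
            return actual - target          # no tolerance band
        if actual < lower_bound:
            return actual - lower_bound     # undershoot
        if actual > upper_bound:
            return actual - upper_bound     # overshoot
        return 0                            # within the tolerance band
    if comp_type in ("gt", "ge"):
        return actual - lower_bound         # positive when passing
    if comp_type in ("lt", "le"):
        return actual - upper_bound         # negative when passing
    return actual - target


# SUM(a) == 15 with tol=1 -> bounds [14, 16]; an actual of 16.5 overshoots by 0.5
print(boundary_difference("eq", 16.5, 15, 14, 16))  # 0.5
```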
@@ -20195,22 +21678,22 @@ def _step_report_schema_in_order(

         # Check if this column exists in exp_columns_dict (it might not if it's a duplicate)
         # For duplicates, we need to handle them specially
-        if column_name_exp_i not in exp_columns_dict:
+        if column_name_exp_i not in exp_columns_dict:  # pragma: no cover
             # This is a duplicate or invalid column, mark it as incorrect
-            col_exp_correct.append(CROSS_MARK_SPAN)
+            col_exp_correct.append(CROSS_MARK_SPAN)  # pragma: no cover

             # For dtype, check if there's a dtype specified in the schema
-            if len(expect_schema[i]) > 1:
-                dtype_value = expect_schema[i][1]
-                if isinstance(dtype_value, list):
-                    dtype_exp.append(" | ".join(dtype_value))
-                else:
-                    dtype_exp.append(str(dtype_value))
-            else:
-                dtype_exp.append("—")
+            if len(expect_schema[i]) > 1:  # pragma: no cover
+                dtype_value = expect_schema[i][1]  # pragma: no cover
+                if isinstance(dtype_value, list):  # pragma: no cover
+                    dtype_exp.append(" | ".join(dtype_value))  # pragma: no cover
+                else:  # pragma: no cover
+                    dtype_exp.append(str(dtype_value))  # pragma: no cover
+            else:  # pragma: no cover
+                dtype_exp.append("—")  # pragma: no cover

-            dtype_exp_correct.append("—")
-            continue
+            dtype_exp_correct.append("—")  # pragma: no cover
+            continue  # pragma: no cover

     #
     # `col_exp_correct` values
@@ -20433,7 +21916,9 @@ def _step_report_schema_in_order(
     # Add a border below the row that terminates the target table schema
     step_report = step_report.tab_style(
         style=style.borders(sides="bottom", color="#6699CC80", style="solid", weight="1px"),
-        locations=loc.body(
+        locations=loc.body(
+            rows=len(colnames_tgt) - 1  # ty: ignore (bug in GT, should allow an int)
+        ),
     )

     # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing
@@ -20482,8 +21967,8 @@ def _step_report_schema_in_order(


 def _step_report_schema_any_order(
-    step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
-) -> GT |
+    step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
+) -> GT | pl.DataFrame:
     """
     This is the case for schema validation where the schema is permitted to not have to be in the
     same column order as the target table.
@@ -20902,9 +22387,7 @@ def _step_report_schema_any_order(
     header = header.format(title=title, details=details)

     # Create the header with `header` string
-
-
-    return step_report
+    return step_report.tab_header(title=md(header))


 def _create_label_text_html(
@@ -20993,3 +22476,321 @@ def _create_col_schema_match_params_html(
         f"{full_match_dtypes_text}"
         "</div>"
     )
+
+
+def _generate_agg_docstring(name: str) -> str:
+    """Generate a comprehensive docstring for an aggregation validation method.
+
+    This function creates detailed documentation for dynamically generated methods like
+    `col_sum_eq()`, `col_avg_gt()`, `col_sd_le()`, etc. The docstrings follow the same
+    structure and quality as manually written validation methods like `col_vals_gt()`.
+
+    Parameters
+    ----------
+    name
+        The method name (e.g., "col_sum_eq", "col_avg_gt", "col_sd_le").
+
+    Returns
+    -------
+    str
+        A complete docstring for the method.
+    """
+    # Parse the method name to extract aggregation type and comparison operator
+    # Format: col_{agg}_{comp} (e.g., col_sum_eq, col_avg_gt, col_sd_le)
+    parts = name.split("_")
+    agg_type = parts[1]  # sum, avg, sd
+    comp_type = parts[2]  # eq, gt, ge, lt, le
+
+    # Human-readable names for aggregation types
+    agg_names = {
+        "sum": ("sum", "summed"),
+        "avg": ("average", "averaged"),
+        "sd": ("standard deviation", "computed for standard deviation"),
+    }
+
+    # Human-readable descriptions for comparison operators (with article for title)
+    comp_descriptions = {
+        "eq": ("equal to", "equals", "an"),
+        "gt": ("greater than", "is greater than", "a"),
+        "ge": ("greater than or equal to", "is at least", "a"),
+        "lt": ("less than", "is less than", "a"),
+        "le": ("less than or equal to", "is at most", "a"),
+    }
+
+    # Mathematical symbols for comparison operators
+    comp_symbols = {
+        "eq": "==",
+        "gt": ">",
+        "ge": ">=",
+        "lt": "<",
+        "le": "<=",
+    }
+
+    agg_name, agg_verb = agg_names[agg_type]
+    comp_desc, comp_phrase, comp_article = comp_descriptions[comp_type]
+    comp_symbol = comp_symbols[comp_type]
+
+    # Determine the appropriate example values based on the aggregation and comparison
+    if agg_type == "sum":
+        example_value = "15"
+        example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
+        example_sum = "15"  # sum of a
+        example_ref_sum = "10"  # sum of b
+    elif agg_type == "avg":
+        example_value = "3"
+        example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
+        example_sum = "3.0"  # avg of a
+        example_ref_sum = "2.0"  # avg of b
+    else:  # sd
+        example_value = "2"
+        example_data = '{"a": [1, 2, 3, 4, 5], "b": [2, 2, 2, 2, 2]}'
+        example_sum = "~1.58"  # sd of a
+        example_ref_sum = "0.0"  # sd of b
+
+    # Build appropriate tolerance explanation based on comparison type
+    if comp_type == "eq":
+        tol_explanation = f"""The `tol=` parameter is particularly useful with `{name}()` since exact equality
+    comparisons on floating-point aggregations can be problematic due to numerical precision.
+    Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from
+    floating-point arithmetic."""
+    else:
+        tol_explanation = f"""The `tol=` parameter expands the acceptable range for the comparison. For
+    `{name}()`, a tolerance of `tol=0.5` would mean the {agg_name} can be within `0.5` of the
+    target value and still pass validation."""
+
+    docstring = f"""
+    Does the column {agg_name} satisfy {comp_article} {comp_desc} comparison?
+
+    The `{name}()` validation method checks whether the {agg_name} of values in a column
+    {comp_phrase} a specified `value=`. This is an aggregation-based validation where the entire
+    column is reduced to a single {agg_name} value that is then compared against the target. The
+    comparison used in this function is `{agg_name}(column) {comp_symbol} value`.
+
+    Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as
+    a single test unit. The validation either passes completely (if the aggregated value satisfies
+    the comparison) or fails completely.
+
+    Parameters
+    ----------
+    columns
+        A single column or a list of columns to validate. If multiple columns are supplied,
+        there will be a separate validation step generated for each column. The columns must
+        contain numeric data for the {agg_name} to be computed.
+    value
+        The value to compare the column {agg_name} against. This can be: (1) a numeric literal
+        (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column
+        whose {agg_name} will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object
+        referencing a column in reference data (when `Validate(reference=)` has been set), or (4)
+        `None` to automatically compare against the same column in reference data (shorthand for
+        `ref(column_name)` when reference data is set).
+    tol
+        A tolerance value for the comparison. The default is `0`, meaning exact comparison. When
+        set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`,
+        a {agg_name} that differs from the target by up to `0.5` will still pass. {tol_explanation}
+    thresholds
+        Failure threshold levels so that the validation step can react accordingly when
+        failing test units reach the set levels. Since this is an aggregation-based validation with
+        only one test unit, threshold values typically should be set as absolute counts (e.g., `1`)
+        to indicate pass/fail, or as proportions where any value less than `1.0` means failure is
+        acceptable.
+    brief
+        An optional brief description of the validation step that will be displayed in the
+        reporting table. You can use the templating elements like `"{{step}}"` to insert
+        the step number, or `"{{auto}}"` to include an automatically generated brief. If `True`
+        the entire brief will be automatically generated. If `None` (the default) then there
+        won't be a brief.
+    actions
+        Optional actions to take when the validation step meets or exceeds any set threshold
+        levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+        define the actions.
+    active
+        A boolean value indicating whether the validation step should be active. Using `False`
+        will make the validation step inactive (still reporting its presence and keeping indexes
+        for the steps unchanged).
+
+    Returns
+    -------
+    Validate
+        The `Validate` object with the added validation step.
+
+    Using Reference Data
+    --------------------
+    The `{name}()` method supports comparing column aggregations against reference data. This
+    is useful for validating that statistical properties remain consistent across different
+    versions of a dataset, or for comparing current data against historical baselines.
+
+    To use reference data, set the `reference=` parameter when creating the `Validate` object:
+
+    ```python
+    validation = (
+        pb.Validate(data=current_data, reference=baseline_data)
+        .{name}(columns="revenue")  # Compares sum(current.revenue) vs sum(baseline.revenue)
+        .interrogate()
+    )
+    ```
+
+    When `value=None` and reference data is set, the method automatically compares against the
+    same column in the reference data. You can also explicitly specify reference columns using
+    the `ref()` helper:
+
+    ```python
+    .{name}(columns="revenue", value=pb.ref("baseline_revenue"))
+    ```
+
+    Understanding Tolerance
+    -----------------------
+    The `tol=` parameter allows for fuzzy comparisons, which is especially important for
+    floating-point aggregations where exact equality is often unreliable.
+
+    {tol_explanation}
+
+    For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]`
+    within which the aggregation is considered valid. For inequality comparisons, the tolerance
+    shifts the comparison boundary.
+
+    Thresholds
+    ----------
+    The `thresholds=` parameter is used to set the failure-condition levels for the validation
+    step. If they are set here at the step level, these thresholds will override any thresholds
+    set at the global level in `Validate(thresholds=...)`.
+
+    There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation
+    validations operate on a single test unit (the aggregated value), threshold values are
+    typically set as absolute counts:
+
+    - `thresholds=1` means any failure triggers a 'warning'
+    - `thresholds=(1, 1, 1)` means any failure triggers all three levels
+
+    Thresholds can be defined using one of these input schemes:
+
+    1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+    thresholds)
+    2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+    the 'error' level, and position `2` is the 'critical' level
+    3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
+    'critical'
+    4. a single integer/float value denoting absolute number or fraction of failing test units
+    for the 'warning' level only
+
+    Examples
+    --------
+    ```{{python}}
+    #| echo: false
+    #| output: false
+    import pointblank as pb
+    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+    ```
+    For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is
+    shown below:
+
+    ```{{python}}
+    import pointblank as pb
+    import polars as pl
+
+    tbl = pl.DataFrame(
+        {{
+            "a": [1, 2, 3, 4, 5],
+            "b": [2, 2, 2, 2, 2],
+        }}
+    )
+
+    pb.preview(tbl)
+    ```
+
+    Let's validate that the {agg_name} of column `a` {comp_phrase} `{example_value}`:
+
+    ```{{python}}
+    validation = (
+        pb.Validate(data=tbl)
+        .{name}(columns="a", value={example_value})
+        .interrogate()
+    )
+
+    validation
+    ```
+
+    The validation result shows whether the {agg_name} comparison passed or failed. Since this
+    is an aggregation-based validation, there is exactly one test unit per column.
+
+    When validating multiple columns, each column gets its own validation step:
+
+    ```{{python}}
+    validation = (
+        pb.Validate(data=tbl)
+        .{name}(columns=["a", "b"], value={example_value})
+        .interrogate()
+    )
+
+    validation
+    ```
+
+    Using tolerance for flexible comparisons:
+
+    ```{{python}}
+    validation = (
+        pb.Validate(data=tbl)
+        .{name}(columns="a", value={example_value}, tol=1.0)
+        .interrogate()
+    )
+
+    validation
+    ```
+    """
+
+    return docstring.strip()
+
+
+def make_agg_validator(name: str):
+    """Factory for dynamically generated aggregate validation methods.
+
+    Why this exists:
+    Aggregate validators all share identical behavior. The only thing that differs
+    between them is the semantic assertion type (their name). The implementation
+    of each aggregate validator is fetched from `from_agg_validator`.
+
+    Instead of copy/pasting dozens of identical methods, we generate
+    them dynamically and attach them to the Validate class. The types are generated
+    at build time with `make pyi` to allow the methods to be visible to the type checker,
+    documentation builders and the IDEs/LSPs.
+
+    The returned function is a thin adapter that forwards all arguments to
+    `_add_agg_validation`, supplying the assertion type explicitly.
+    """
+
+    def agg_validator(
+        self: Validate,
+        columns: str | Collection[str],
+        value: float | int | Column | ReferenceColumn | None = None,
+        tol: float = 0,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
+        brief: str | bool | None = None,
+        actions: Actions | None = None,
+        active: bool = True,
+    ) -> Validate:
+        # Dynamically generated aggregate validator.
+        # This method is generated per assertion type and forwards all arguments
+        # to the shared aggregate validation implementation.
+        return self._add_agg_validation(
+            assertion_type=name,
+            columns=columns,
+            value=value,
+            tol=tol,
+            thresholds=thresholds,
+            brief=brief,
+            actions=actions,
+            active=active,
+        )
+
+    # Manually set function identity so this behaves like a real method.
+    # These must be set before attaching the function to the class.
+    agg_validator.__name__ = name
+    agg_validator.__qualname__ = f"Validate.{name}"
+    agg_validator.__doc__ = _generate_agg_docstring(name)
+
+    return agg_validator
+
+
+# Finally, we grab all the valid aggregation method names and attach them to
+# the Validate class, registering each one appropriately.
+for method in load_validation_method_grid():  # -> `col_sum_*`, `col_mean_*`, etc.
+    setattr(Validate, method, make_agg_validator(method))