pointblank 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +15 -1
- pointblank/_constants_translations.py +1302 -0
- pointblank/_interrogation.py +24 -0
- pointblank/_utils_llms_txt.py +1 -0
- pointblank/data/api-docs.txt +264 -3
- pointblank/validate.py +1350 -29
- pointblank/yaml.py +5 -2
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/METADATA +6 -1
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/RECORD +13 -13
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/WHEEL +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.16.0.dist-info → pointblank-0.17.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -12,6 +12,7 @@ import tempfile
 import threading
 from dataclasses import dataclass
 from enum import Enum
+from functools import partial
 from importlib.metadata import version
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Literal
@@ -54,6 +55,7 @@ from pointblank._interrogation import (
     SpeciallyValidation,
     col_count_match,
     col_exists,
+    col_pct_null,
     col_schema_match,
     col_vals_expr,
     conjointly_validation,
@@ -363,12 +365,16 @@ class PointblankConfig:

     report_incl_header: bool = True
     report_incl_footer: bool = True
+    report_incl_footer_timings: bool = True
+    report_incl_footer_notes: bool = True
     preview_incl_header: bool = True

     def __repr__(self):
         return (
             f"PointblankConfig(report_incl_header={self.report_incl_header}, "
             f"report_incl_footer={self.report_incl_footer}, "
+            f"report_incl_footer_timings={self.report_incl_footer_timings}, "
+            f"report_incl_footer_notes={self.report_incl_footer_notes}, "
             f"preview_incl_header={self.preview_incl_header})"
         )

@@ -380,6 +386,8 @@ global_config = PointblankConfig()
 def config(
     report_incl_header: bool = True,
     report_incl_footer: bool = True,
+    report_incl_footer_timings: bool = True,
+    report_incl_footer_notes: bool = True,
     preview_incl_header: bool = True,
 ) -> PointblankConfig:
     """
@@ -393,7 +401,13 @@ def config(
         threshold levels (if set).
     report_incl_footer
         Should the footer of the validation table report be displayed? The footer contains the
-        starting and ending times of the interrogation.
+        starting and ending times of the interrogation and any notes added to validation steps.
+    report_incl_footer_timings
+        Controls whether the validation timing information (start time, duration, and end time)
+        should be displayed in the footer. Only applies when `report_incl_footer=True`.
+    report_incl_footer_notes
+        Controls whether the notes from validation steps should be displayed in the footer. Only
+        applies when `report_incl_footer=True`.
     preview_incl_header
         Whether the header should be present in any preview table (generated via the
         [`preview()`](`pointblank.preview`) function).
@@ -407,6 +421,8 @@ def config(
     global global_config
     global_config.report_incl_header = report_incl_header  # pragma: no cover
     global_config.report_incl_footer = report_incl_footer  # pragma: no cover
+    global_config.report_incl_footer_timings = report_incl_footer_timings  # pragma: no cover
+    global_config.report_incl_footer_notes = report_incl_footer_notes  # pragma: no cover
     global_config.preview_incl_header = preview_incl_header  # pragma: no cover


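The two new flags only refine the existing `report_incl_footer=` switch; the footer as a whole is still governed by that option. A minimal sketch of how the `pb.config()` arguments added in this diff might be combined:

```python
import pointblank as pb

# Keep the report footer, but show only step notes and hide the interrogation timings
pb.config(
    report_incl_footer=True,
    report_incl_footer_timings=False,
    report_incl_footer_notes=True,
)
```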
@@ -9755,6 +9771,302 @@ class Validate:

         return self

+    def col_pct_null(
+        self,
+        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        p: float,
+        tol: Tolerance = 0,
+        thresholds: int | float | None | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether a column has a specific percentage of Null values.
+
+        The `col_pct_null()` validation method checks whether the percentage of Null values in a
+        column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
+        validation operates at the column level, generating a single validation step per column that
+        passes or fails based on whether the actual percentage of Null values falls within the
+        acceptable range defined by `p ± tol`.
+
+        Parameters
+        ----------
+        columns
+            A single column or a list of columns to validate. Can also use
+            [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+            multiple columns are supplied or resolved, there will be a separate validation step
+            generated for each column.
+        p
+            The expected percentage of Null values in the column, expressed as a decimal between
+            `0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
+        tol
+            The tolerance allowed when comparing the actual percentage of Null values to the
+            expected percentage `p=`. The validation passes if the actual percentage falls within
+            the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
+            the *Tolerance* section for details on all supported formats (absolute, relative,
+            symmetric, and asymmetric bounds).
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedences of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step(s) meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Tolerance
+        ---------
+        The `tol=` parameter accepts several different formats to specify the acceptable deviation
+        from the expected percentage `p=`. The tolerance can be expressed as:
+
+        1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
+        For example, `tol=2` means the actual count can differ from the expected count by up to 2
+        units in either direction.
+
+        2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
+        count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
+        45 to 55 (50 ± 10% of 50 = 50 ± 5).
+
+        3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
+        bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
+        1 unit below or 3 units above the expected count.
+
+        4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
+        and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
+        lower bound is 5% below and the upper bound is 15% above the expected count.
+
+        When using a single value (integer or float), the tolerance is applied symmetrically in both
+        directions. When using a tuple, you can specify asymmetric tolerances where the lower and
+        upper bounds differ.
+
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` to `1`),
+        or, the absolute number of failing test units (as integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+        set, you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
+        and `c`) that have different percentages of Null values. The table is shown below:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl = pl.DataFrame(
+            {
+                "a": [1, 2, 3, 4, 5, 6, 7, 8],
+                "b": [1, None, 3, None, 5, None, 7, None],
+                "c": [None, None, None, None, None, None, 1, 2],
+            }
+        )
+
+        pb.preview(tbl)
+        ```
+
+        Let's validate that column `a` has 0% Null values (i.e., no Null values at all).
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="a", p=0.0)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        Printing the `validation` object shows the validation table in an HTML viewing environment.
+        The validation table shows the single entry that corresponds to the validation step created
+        by using `col_pct_null()`. The validation passed since column `a` has no Null values.
+
+        Now, let's check that column `b` has exactly 50% Null values.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.5)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
+
+        Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
+        we'll check if it's approximately 70% Null with a tolerance of 10%.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="c", p=0.70, tol=0.10)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This validation passes because the actual percentage (75%) falls within the acceptable
+        range of 60% to 80% (70% ± 10%).
+
+        The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
+        different ways to specify tolerance using column `b`, which has exactly 50% Null values
+        (4 out of 8 values).
+
+        *Using an absolute tolerance (integer)*: Specify the exact number of rows that can
+        deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=1)  # Expect 3 nulls, allow ±1 (range: 2-4)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because column `b` has 4 Null values, which falls within the acceptable range
+        of 2 to 4 (3 ± 1).
+
+        *Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
+        expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=0.25)  # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because 4 Null values falls within the acceptable range (3 ± 0.75 calculates
+        to 2.25 to 3.75, which rounds down to 2 to 3 rows).
+
+        *Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
+        upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
+        to 2 rows above the expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.25, tol=(0, 2))  # Expect 2 Nulls, allow +0/-2 (range: 2-4)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because 4 Null values falls within the acceptable range of 2 to 4.
+
+        *Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
+        bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
+        expected count.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3))  # Expect 3 Nulls, allow -10%/+30%
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This passes because 4 Null values falls within the acceptable range (3 - 0.3 to 3 + 0.9
+        calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
+        """
+        assertion_type = _get_fn_name()
+
+        _check_column(column=columns)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+        # resolve the columns
+        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+            columns = col(columns)
+
+        # If `columns` is Column value or a string, place it in a list for iteration
+        if isinstance(columns, (Column, str)):
+            columns = [columns]
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        bound_finder: Callable[[int], AbsoluteBounds] = partial(_derive_bounds, tol=tol)
+
+        # Iterate over the columns and create a validation step for each
+        for column in columns:
+            val_info = _ValidationInfo(
+                assertion_type=assertion_type,
+                column=column,
+                values={"p": p, "bound_finder": bound_finder},
+                thresholds=thresholds,
+                actions=actions,
+                brief=brief,
+                active=active,
+            )
+
+            self._add_validation(validation_info=val_info)
+
+        return self
+
     def rows_distinct(
         self,
         columns_subset: str | list[str] | None = None,
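The tolerance travels into interrogation as `values={"p": p, "bound_finder": bound_finder}`, with `bound_finder` being a `partial(_derive_bounds, tol=tol)`. Since `_derive_bounds()` itself is not shown in this diff, the following is only a sketch of the tolerance semantics the docstring describes (values below 1 are relative to the expected count; values of 1 or more are absolute row counts):

```python
# Hypothetical helper mirroring the documented `p ± tol` semantics; this is not
# the library's _derive_bounds() implementation.
def acceptable_null_count_range(p: float, tol, n_rows: int) -> tuple[float, float]:
    expected = p * n_rows
    lower_tol, upper_tol = tol if isinstance(tol, tuple) else (tol, tol)
    lower = lower_tol * expected if lower_tol < 1 else lower_tol
    upper = upper_tol * expected if upper_tol < 1 else upper_tol
    return expected - lower, expected + upper

# Column "b" from the docstring example: 8 rows, p=0.375, tol=0.25 -> (2.25, 3.75)
print(acceptable_null_count_range(0.375, 0.25, 8))
```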
@@ -12282,12 +12594,19 @@ class Validate:
             # Generate the autobrief description for the validation step; it's important to perform
             # that here since text components like the column and the value(s) have been resolved
             # at this point
+            # Get row count for col_pct_null to properly calculate absolute tolerance percentages
+            n_rows = None
+            if assertion_type == "col_pct_null":
+                n_rows = get_row_count(data_tbl)
+
             autobrief = _create_autobrief_or_failure_text(
                 assertion_type=assertion_type,
                 lang=self.lang,
                 column=column,
                 values=value,
                 for_failure=False,
+                locale=self.locale,
+                n_rows=n_rows,
             )

             validation.autobrief = autobrief
@@ -12315,6 +12634,12 @@ class Validate:
             # This prevents modifications from one validation step affecting others
             data_tbl_step = _copy_dataframe(data_tbl)

+            # Capture original table dimensions and columns before preprocessing
+            # (only if preprocessing is present - we'll set these inside the preprocessing block)
+            original_rows = None
+            original_cols = None
+            original_column_names = None
+
             # ------------------------------------------------
             # Preprocessing stage
             # ------------------------------------------------
@@ -12322,6 +12647,16 @@ class Validate:
             # Determine whether any preprocessing functions are to be applied to the table
             if validation.pre is not None:
                 try:
+                    # Capture original table dimensions before preprocessing
+                    # Use get_row_count() instead of len() for compatibility with PySpark, etc.
+                    original_rows = get_row_count(data_tbl_step)
+                    original_cols = get_column_count(data_tbl_step)
+                    original_column_names = set(
+                        data_tbl_step.columns
+                        if hasattr(data_tbl_step, "columns")
+                        else list(data_tbl_step.columns)
+                    )
+
                     # Read the text of the preprocessing function
                     pre_text = _pre_processing_funcs_to_str(validation.pre)

@@ -12354,6 +12689,62 @@ class Validate:
                     elif isinstance(validation.pre, Callable):
                         data_tbl_step = validation.pre(data_tbl_step)

+                    # After successful preprocessing, check dimensions and create notes
+                    # Use get_row_count() and get_column_count() for compatibility
+                    processed_rows = get_row_count(data_tbl_step)
+                    processed_cols = get_column_count(data_tbl_step)
+
+                    # Always add a note when preprocessing is applied
+                    if original_rows != processed_rows or original_cols != processed_cols:
+                        # Dimensions changed - show the change
+                        note_html = _create_preprocessing_note_html(
+                            original_rows=original_rows,
+                            original_cols=original_cols,
+                            processed_rows=processed_rows,
+                            processed_cols=processed_cols,
+                            locale=self.locale,
+                        )
+                        note_text = _create_preprocessing_note_text(
+                            original_rows=original_rows,
+                            original_cols=original_cols,
+                            processed_rows=processed_rows,
+                            processed_cols=processed_cols,
+                        )
+                    else:
+                        # No dimension change - just indicate preprocessing was applied
+                        note_html = _create_preprocessing_no_change_note_html(locale=self.locale)
+                        note_text = _create_preprocessing_no_change_note_text()
+
+                    validation._add_note(
+                        key="pre_applied",
+                        markdown=note_html,
+                        text=note_text,
+                    )
+
+                    # Check if target column is synthetic (exists in processed but not original)
+                    # Only check for single column names (not lists used in rows_distinct, etc.)
+                    if column is not None and isinstance(column, str):
+                        processed_column_names = set(
+                            data_tbl_step.columns
+                            if hasattr(data_tbl_step, "columns")
+                            else list(data_tbl_step.columns)
+                        )
+
+                        # Check if the target column is in the processed table but not in original
+                        if column in processed_column_names and column not in original_column_names:
+                            note_html = _create_synthetic_target_column_note_html(
+                                column_name=column,
+                                locale=self.locale,
+                            )
+                            note_text = _create_synthetic_target_column_note_text(
+                                column_name=column,
+                            )
+                            validation._add_note(
+                                key="syn_target_col",
+                                markdown=note_html,
+                                text=note_text,
+                            )
+
                 except Exception:
                     # If preprocessing fails, mark the validation as having an eval_error
                     validation.eval_error = True
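In practice this means every step that uses a `pre=` function now carries a footer note keyed `"pre_applied"`. A hedged sketch of a step that would trigger the dimension-change variant (the `pre=` argument is part of pointblank's existing validation-method API; the note text itself comes from the helper functions added later in this diff):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})

validation = (
    pb.Validate(data=tbl)
    # The pre= function filters the table from [4 rows, 2 columns] down to
    # [2 rows, 2 columns], so this step should pick up the "pre_applied" note
    # describing the dimension change.
    .col_vals_gt(columns="b", value=15, pre=lambda df: df.filter(pl.col("a") > 2))
    .interrogate()
)
```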
@@ -12543,6 +12934,21 @@ class Validate:
                         tbl=tbl, column=column, values=value, na_pass=na_pass
                     )

+                elif assertion_type == "col_pct_null":
+                    result_bool = col_pct_null(
+                        data_tbl=data_tbl_step,
+                        column=column,
+                        p=value["p"],
+                        bound_finder=value["bound_finder"],
+                    )
+
+                    validation.all_passed = result_bool
+                    validation.n = 1
+                    validation.n_passed = int(result_bool)
+                    validation.n_failed = 1 - int(result_bool)
+
+                    results_tbl = None
+
                 elif assertion_type == "col_vals_expr":
                     results_tbl = col_vals_expr(
                         data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
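Because the whole column is treated as a single test unit here (`validation.n = 1`), downstream accessors see exactly one passing or failing unit per `col_pct_null()` step. A small sketch using the existing `Validate.n_passed()` accessor (assumed unchanged by this diff):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"b": [1, None, 3, None]})  # 50% Null

validation = pb.Validate(data=tbl).col_pct_null(columns="b", p=0.5).interrogate()

# One column-level test unit: 1 if the percentage check passed, 0 otherwise
validation.n_passed(i=1, scalar=True)
```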
@@ -12602,10 +13008,21 @@ class Validate:
                     # Add the schema validation info to the validation object
                     validation.val_info = schema_validation_info

+                    # Add a note with the schema expectation and results
+                    schema_note_html = _create_col_schema_match_note_html(
+                        schema_info=schema_validation_info, locale=self.locale
+                    )
+                    schema_note_text = _create_col_schema_match_note_text(
+                        schema_info=schema_validation_info
+                    )
+                    validation._add_note(
+                        key="schema_check", markdown=schema_note_html, text=schema_note_text
+                    )
+
                     validation.all_passed = result_bool
                     validation.n = 1
                     validation.n_passed = int(result_bool)
-                    validation.n_failed = 1 - result_bool
+                    validation.n_failed = 1 - int(result_bool)

                     results_tbl = None

@@ -12620,7 +13037,7 @@ class Validate:
                     validation.all_passed = result_bool
                     validation.n = 1
                     validation.n_passed = int(result_bool)
-                    validation.n_failed = 1 - result_bool
+                    validation.n_failed = 1 - int(result_bool)

                     results_tbl = None

@@ -12632,7 +13049,7 @@ class Validate:
                     validation.all_passed = result_bool
                     validation.n = 1
                     validation.n_passed = int(result_bool)
-                    validation.n_failed = 1 - result_bool
+                    validation.n_failed = 1 - int(result_bool)

                     results_tbl = None

@@ -12651,7 +13068,7 @@ class Validate:
                     validation.all_passed = result_bool
                     validation.n = 1
                     validation.n_passed = int(result_bool)
-                    validation.n_failed = 1 - result_bool
+                    validation.n_failed = 1 - int(result_bool)

                     results_tbl = None

@@ -12669,8 +13086,9 @@ class Validate:
                     )  # pragma: no cover

             except Exception as e:
-                #
+                # Catch data quality errors and column not found errors
                 error_msg = str(e).lower()
+
                 is_comparison_error = (
                     "boolean value of na is ambiguous" in error_msg
                     or "cannot compare" in error_msg
@@ -12681,20 +13099,101 @@ class Validate:
                     or ("dtype" in error_msg and "compare" in error_msg)
                 )

-
-
-
+                is_column_not_found = "column" in error_msg and "not found" in error_msg
+
+                is_comparison_column_not_found = (
+                    "unable to find column" in error_msg and "valid columns" in error_msg
+                )
+
+                if (
+                    is_comparison_error or is_column_not_found or is_comparison_column_not_found
+                ):  # pragma: no cover
+                    # If data quality comparison fails or column not found, mark as eval_error
+                    validation.eval_error = True  # pragma: no cover
+
+                    # Add a note for column not found errors (target column)
+                    if is_column_not_found:
+                        note_html = _create_column_not_found_note_html(
+                            column_name=column,
+                            available_columns=list(data_tbl_step.columns)
+                            if hasattr(data_tbl_step, "columns")
+                            else [],
+                            locale=self.locale,
+                        )
+                        note_text = _create_column_not_found_note_text(
+                            column_name=column,
+                            available_columns=list(data_tbl_step.columns)
+                            if hasattr(data_tbl_step, "columns")
+                            else [],
+                        )
+                        validation._add_note(
+                            key="column_not_found",
+                            markdown=note_html,
+                            text=note_text,
+                        )
+
+                    # Add a note for comparison column not found errors
+                    elif is_comparison_column_not_found:
+                        # Extract column name from error message
+                        # Error format: 'unable to find column "col_name"; valid columns: ...'
+                        match = re.search(r'unable to find column "([^"]+)"', str(e))
+
+                        if match:
+                            missing_col_name = match.group(1)
+
+                            # Determine position for between/outside validations
+                            position = None
+                            if assertion_type in ["col_vals_between", "col_vals_outside"]:
+                                # Check if missing column is in left or right position
+                                from pointblank.column import Column
+
+                                if (
+                                    isinstance(value[0], Column)
+                                    and value[0].exprs == missing_col_name
+                                ):
+                                    position = "left"
+                                elif (
+                                    isinstance(value[1], Column)
+                                    and value[1].exprs == missing_col_name
+                                ):
+                                    position = "right"
+
+                            note_html = _create_comparison_column_not_found_note_html(
+                                column_name=missing_col_name,
+                                position=position,
+                                available_columns=list(data_tbl_step.columns)
+                                if hasattr(data_tbl_step, "columns")
+                                else [],
+                                locale=self.locale,
+                            )
+                            note_text = _create_comparison_column_not_found_note_text(
+                                column_name=missing_col_name,
+                                position=position,
+                                available_columns=list(data_tbl_step.columns)
+                                if hasattr(data_tbl_step, "columns")
+                                else [],
+                            )
+                            validation._add_note(
+                                key="comparison_column_not_found",
+                                markdown=note_html,
+                                text=note_text,
+                            )
+
                     end_time = datetime.datetime.now(datetime.timezone.utc)  # pragma: no cover
+
                     validation.proc_duration_s = (
                         end_time - start_time
                     ).total_seconds()  # pragma: no cover
+
                     validation.time_processed = end_time.isoformat(
                         timespec="milliseconds"
                     )  # pragma: no cover
+
                     validation.active = False  # pragma: no cover
+
                     continue  # pragma: no cover
                 else:
-                    # For other errors
+                    # For other unexpected errors, let them propagate
                     raise

             else:
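The new notes make these eval_error steps self-explanatory in the report footer. As a rough illustration (the exact error text depends on the DataFrame backend, so treat this as a hedged sketch), a comparison against a column that is not in the table is the kind of step this branch now annotates with a `"comparison_column_not_found"` note:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [1, 2, 3]})

validation = (
    pb.Validate(data=tbl)
    # pb.col("b") names a comparison column that does not exist in `tbl`; if the
    # backend raises an 'unable to find column "b"' error, the step is marked as
    # an eval_error and gains the footer note added in the hunk above.
    .col_vals_gt(columns="a", value=pb.col("b"))
    .interrogate()
)
```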
@@ -12792,6 +13291,7 @@ class Validate:
                     markdown=threshold_note_html,
                     text=threshold_note_text,
                 )
+
             elif self.thresholds != Thresholds():
                 # Thresholds explicitly reset to empty when global thresholds exist
                 reset_note_html = _create_threshold_reset_note_html(locale=self.locale)
@@ -12814,6 +13314,8 @@ class Validate:
                 column=column,
                 values=value,
                 for_failure=True,
+                locale=self.locale,
+                n_rows=n_rows,
             )

             # Set the failure text in the validation step
@@ -14892,7 +15394,12 @@ class Validate:
         return None

     def get_tabular_report(
-        self,
+        self,
+        title: str | None = ":default:",
+        incl_header: bool = None,
+        incl_footer: bool = None,
+        incl_footer_timings: bool = None,
+        incl_footer_notes: bool = None,
     ) -> GT:
         """
         Validation report as a GT table.
@@ -14915,6 +15422,20 @@ class Validate:
             name of the table as the title for the report. If no title is wanted, then `":none:"`
             can be used. Aside from keyword options, text can be provided for the title. This will
             be interpreted as Markdown text and transformed internally to HTML.
+        incl_header
+            Controls whether the header section should be displayed. If `None`, uses the global
+            configuration setting. The header contains the table name, label, and threshold
+            information.
+        incl_footer
+            Controls whether the footer section should be displayed. If `None`, uses the global
+            configuration setting. The footer can contain validation timing information and notes.
+        incl_footer_timings
+            Controls whether validation timing information (start time, duration, end time) should
+            be displayed in the footer. If `None`, uses the global configuration setting. Only
+            applies when `incl_footer=True`.
+        incl_footer_notes
+            Controls whether notes from validation steps should be displayed in the footer. If
+            `None`, uses the global configuration setting. Only applies when `incl_footer=True`.

         Returns
         -------
@@ -14974,6 +15495,10 @@ class Validate:
             incl_header = global_config.report_incl_header
         if incl_footer is None:
             incl_footer = global_config.report_incl_footer
+        if incl_footer_timings is None:
+            incl_footer_timings = global_config.report_incl_footer_timings
+        if incl_footer_notes is None:
+            incl_footer_notes = global_config.report_incl_footer_notes

         # Do we have a DataFrame library to work with?
         _check_any_df_lib(method_used="get_tabular_report")
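With the fallback above, per-call arguments compose with the global `pb.config()` settings. A minimal sketch of a per-report override using the signature added in this diff (assuming `validation` is an interrogated `Validate` object):

```python
report = validation.get_tabular_report(
    title="Nightly data checks",
    incl_footer=True,
    incl_footer_timings=False,  # hide start time, duration, and end time
    incl_footer_notes=True,     # keep step notes such as "pre_applied" or schema checks
)
```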
@@ -15212,30 +15737,53 @@ class Validate:
        columns_upd = []

        columns = validation_info_dict["column"]
+        notes = validation_info_dict["notes"]

        assertion_type = validation_info_dict["assertion_type"]

        # Iterate over the values in the `column` entry
        for i, column in enumerate(columns):
+            # Check if this validation has a synthetic target column note
+            has_synthetic_column = (
+                notes[i] is not None and isinstance(notes[i], dict) and "syn_target_col" in notes[i]
+            )
+
+            column_text = None
+
            if assertion_type[i] in [
                "col_schema_match",
                "row_count_match",
                "col_count_match",
                "col_vals_expr",
            ]:
-
+                column_text = "—"
            elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
                if not column:
                    # If there is no column subset, then all columns are used
-
+                    column_text = "ALL COLUMNS"
                else:
                    # With a column subset list, format with commas between the column names
-
-
+                    column_text = ", ".join(column)
            elif assertion_type[i] in ["conjointly", "specially"]:
-
+                column_text = ""
            else:
-
+                column_text = str(column)
+
+            # Apply underline styling for synthetic columns (using the purple color from the icon)
+            # Only apply styling if column_text is not empty and not a special marker
+            if (
+                has_synthetic_column
+                and column_text
+                and column_text not in ["—", "ALL COLUMNS", ""]
+            ):
+                column_text = (
+                    f'<span style="text-decoration: underline; '
+                    f"text-decoration-color: #9A7CB4; text-decoration-thickness: 1px; "
+                    f'text-underline-offset: 3px;">'
+                    f"{column_text}</span>"
+                )
+
+            columns_upd.append(column_text)

        # Add the `columns_upd` entry to the dictionary
        validation_info_dict["columns_upd"] = columns_upd
@@ -15291,6 +15839,15 @@ class Validate:
            ]:
                values_upd.append("—")

+            elif assertion_type[i] in ["col_pct_null"]:
+                # Extract p and tol from the values dict for nice formatting
+                p_value = value["p"]
+
+                # Extract tol from the bound_finder partial function
+                bound_finder = value.get("bound_finder")
+                tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
+                values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
+
            elif assertion_type[i] in ["col_schema_match"]:
                values_upd.append("SCHEMA")

@@ -15766,13 +16323,15 @@ class Validate:
        gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))

        if incl_footer:
-            # Add table time as HTML source note
-
+            # Add table time as HTML source note if enabled
+            if incl_footer_timings:
+                gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))

-            # Create notes markdown from validation steps and add as separate source note
-
-
-
+            # Create notes markdown from validation steps and add as separate source note if enabled
+            if incl_footer_notes:
+                notes_markdown = _create_notes_html(self.validation_info)
+                if notes_markdown:
+                    gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))

        # If the interrogation has not been performed, then style the table columns dealing with
        # interrogation data as grayed out
@@ -16189,6 +16748,12 @@ class Validate:

        except Exception:  # pragma: no cover
            validation.eval_error = True
+            columns_resolved = []
+            # Store columns list for note generation
+            try:
+                columns = list(table.columns) if "table" in locals() else []
+            except Exception:
+                columns = []

        # If no columns were resolved, then create a patched validation step with the
        # `eval_error` and `column` attributes set
@@ -16196,6 +16761,22 @@ class Validate:
            validation.eval_error = True
            validation.column = str(column_expr)

+            # Add a helpful note explaining that no columns were resolved
+            note_html = _create_no_columns_resolved_note_html(
+                column_expr=str(column_expr),
+                available_columns=columns,
+                locale=self.locale,
+            )
+            note_text = _create_no_columns_resolved_note_text(
+                column_expr=str(column_expr),
+                available_columns=columns,
+            )
+            validation._add_note(
+                key="no_columns_resolved",
+                markdown=note_html,
+                text=note_text,
+            )
+
            expanded_validation_info.append(validation)
            continue

@@ -16754,7 +17335,13 @@ def _process_action_str(


 def _create_autobrief_or_failure_text(
-    assertion_type: str,
+    assertion_type: str,
+    lang: str,
+    column: str | None,
+    values: str | None,
+    for_failure: bool,
+    locale: str | None = None,
+    n_rows: int | None = None,
 ) -> str:
     if assertion_type in [
         "col_vals_gt",
@@ -16878,6 +17465,16 @@ def _create_autobrief_or_failure_text(
             for_failure=for_failure,
         )

+    if assertion_type == "col_pct_null":
+        return _create_text_col_pct_null(
+            lang=lang,
+            column=column,
+            value=values,
+            for_failure=for_failure,
+            locale=locale if locale else lang,
+            n_rows=n_rows,
+        )
+
     if assertion_type == "conjointly":
         return _create_text_conjointly(lang=lang, for_failure=for_failure)

@@ -17100,6 +17697,115 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = False
     return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)


+def _create_text_col_pct_null(
+    lang: str,
+    column: str | None,
+    value: dict,
+    for_failure: bool = False,
+    locale: str | None = None,
+    n_rows: int | None = None,
+) -> str:
+    """Create text for col_pct_null validation with tolerance handling."""
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    column_text = _prep_column_text(column=column)
+
+    # Use locale for number formatting, defaulting to lang if not provided
+    fmt_locale = locale if locale else lang
+
+    # Extract p and tol from the values dict
+    p_value = value.get("p", 0) * 100  # Convert to percentage
+    p_value_original = value.get("p", 0)  # Keep original value for deviation format
+
+    # Extract tol from the bound_finder partial function
+    bound_finder = value.get("bound_finder")
+    tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
+
+    # Handle different tolerance types
+    has_tolerance = False
+    is_asymmetric = False
+
+    if isinstance(tol_value, tuple):
+        # Tuple tolerance: can be (lower, upper) in absolute or relative terms
+        tol_lower, tol_upper = tol_value
+
+        # Check if we have any non-zero tolerance
+        has_tolerance = tol_lower != 0 or tol_upper != 0
+        is_asymmetric = tol_lower != tol_upper
+
+        # For relative tolerances (floats < 1), we can compute exact percentage bounds
+        # For absolute tolerances (ints >= 1), calculate based on actual row count if available
+        if tol_lower < 1:
+            # Relative tolerance (float)
+            lower_pct_delta = tol_lower * 100
+        else:
+            # Absolute tolerance (int); uses actual row count if available
+            if n_rows is not None and n_rows > 0:
+                lower_pct_delta = (tol_lower / n_rows) * 100
+            else:
+                lower_pct_delta = tol_lower  # Fallback approximation
+
+        if tol_upper < 1:
+            # Relative tolerance (float)
+            upper_pct_delta = tol_upper * 100
+        else:
+            # Absolute tolerance (int); uses actual row count if available
+            if n_rows is not None and n_rows > 0:
+                upper_pct_delta = (tol_upper / n_rows) * 100
+            else:
+                upper_pct_delta = tol_upper  # Fallback approximation
+    else:
+        # Single value tolerance: symmetric
+        has_tolerance = tol_value != 0
+
+        if tol_value < 1:
+            # Relative tolerance (float)
+            tol_pct = tol_value * 100
+        else:
+            # Absolute tolerance (int) - use actual row count if available
+            if n_rows is not None and n_rows > 0:
+                tol_pct = (tol_value / n_rows) * 100
+            else:
+                tol_pct = tol_value  # Fallback approximation
+
+        lower_pct_delta = tol_pct
+        upper_pct_delta = tol_pct
+
+    # Format numbers with locale-aware formatting
+    p_formatted = _format_number_safe(p_value, decimals=1, locale=fmt_locale)
+    p_original_formatted = _format_number_safe(p_value_original, decimals=2, locale=fmt_locale)
+
+    # Choose the appropriate translation key based on tolerance
+    if not has_tolerance:
+        # No tolerance - use simple text
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text"][lang].format(
+            column_text=column_text,
+            p=p_formatted,
+        )
+    elif is_asymmetric or isinstance(tol_value, tuple):
+        # Use deviation format for tuple tolerances (including symmetric ones)
+        # Format the deviation values with signs (using proper minus sign U+2212)
+        lower_dev = f"−{_format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)}%"
+        upper_dev = f"+{_format_number_safe(upper_pct_delta, decimals=1, locale=fmt_locale)}%"
+
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol_deviation"][lang].format(
+            column_text=column_text,
+            lower_dev=lower_dev,
+            upper_dev=upper_dev,
+            p=p_original_formatted,
+        )
+    else:
+        # Single value tolerance - use the symmetric ± format
+        tol_formatted = _format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)
+        text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol"][lang].format(
+            column_text=column_text,
+            p=p_formatted,
+            tol=tol_formatted,
+        )
+
+    return text
+
+
 def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
     type_ = _expect_failure_type(for_failure=for_failure)

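To make the branching above concrete, here is a rough sketch (not the library function itself) of how a `tol` value ends up as the percentage deltas that feed the `_tol` / `_tol_deviation` translation templates:

```python
def tol_to_pct_deltas(tol, n_rows=None):
    # Mirrors the branching in _create_text_col_pct_null(): tuples give asymmetric
    # bounds, floats below 1 are relative percentages, and ints are absolute row
    # counts converted via n_rows when it is known.
    lower, upper = tol if isinstance(tol, tuple) else (tol, tol)

    def to_pct(t):
        if t < 1:
            return t * 100
        return (t / n_rows) * 100 if n_rows else t  # fallback approximation

    return to_pct(lower), to_pct(upper)

print(tol_to_pct_deltas((0.1, 0.3)))   # (10.0, 30.0) -> rendered as "−10.0% / +30.0%"
print(tol_to_pct_deltas(2, n_rows=8))  # (25.0, 25.0) -> rendered as "±25.0%"
```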
@@ -17498,6 +18204,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:


 def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
     # For each icon, get the assertion icon SVG test from SVG_ICONS_FOR_ASSERTION_TYPES dictionary
+    # TODO: No point in using `get` if we can't handle missing keys anyways
     icon_svg = [SVG_ICONS_FOR_ASSERTION_TYPES.get(icon) for icon in icon]

     # Replace the width and height in the SVG string
@@ -18433,6 +19140,603 @@ def _create_threshold_reset_note_text() -> str:
|
|
|
18433
19140
|
return "Global thresholds explicitly not used for this step."
|
|
18434
19141
|
|
|
18435
19142
|
|
|
19143
|
+
def _create_no_columns_resolved_note_html(
|
|
19144
|
+
column_expr: str, available_columns: list[str], locale: str = "en"
|
|
19145
|
+
) -> str:
|
|
19146
|
+
"""
|
|
19147
|
+
Create an HTML note explaining that a column expression resolved to no columns.
|
|
19148
|
+
|
|
19149
|
+
Parameters
|
|
19150
|
+
----------
|
|
19151
|
+
column_expr
|
|
19152
|
+
The column expression that failed to resolve columns (as a string).
|
|
19153
|
+
available_columns
|
|
19154
|
+
List of available column names in the table.
|
|
19155
|
+
locale
|
|
19156
|
+
The locale string (e.g., 'en', 'fr').
|
|
19157
|
+
|
|
19158
|
+
Returns
|
|
19159
|
+
-------
|
|
19160
|
+
str
|
|
19161
|
+
HTML-formatted note text.
|
|
19162
|
+
"""
|
|
19163
|
+
# Get translated strings
|
|
19164
|
+
intro = NOTES_TEXT.get("column_not_found_intro", {}).get(
|
|
19165
|
+
locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression")
|
|
19166
|
+
)
|
|
19167
|
+
no_resolve = NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
|
|
19168
|
+
locale,
|
|
19169
|
+
NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
|
|
19170
|
+
"en", "does not resolve to any columns"
|
|
19171
|
+
),
|
|
19172
|
+
)
|
|
19173
|
+
|
|
19174
|
+
# Format the column expression with monospace font
|
|
19175
|
+
col_expr_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_expr}</code>"
|
|
19176
|
+
|
|
19177
|
+
# Build the HTML note
|
|
19178
|
+
html = f"{intro} {col_expr_html} {no_resolve}."
|
|
19179
|
+
|
|
19180
|
+
return html
|
|
19181
|
+
|
|
19182
|
+
|
|
19183
|
+
def _create_no_columns_resolved_note_text(column_expr: str, available_columns: list[str]) -> str:
|
|
19184
|
+
"""
|
|
19185
|
+
Create a plain text note explaining that a column expression resolved to no columns.
|
|
19186
|
+
|
|
19187
|
+
Parameters
|
|
19188
|
+
----------
|
|
19189
|
+
column_expr
|
|
19190
|
+
The column expression that failed to resolve columns (as a string).
|
|
19191
|
+
available_columns
|
|
19192
|
+
List of available column names in the table.
|
|
19193
|
+
|
|
19194
|
+
Returns
|
|
19195
|
+
-------
|
|
19196
|
+
str
|
|
19197
|
+
Plain text note.
|
|
19198
|
+
"""
|
|
19199
|
+
return f"The column expression `{column_expr}` does not resolve to any columns."
|
|
19200
|
+
|
|
19201
|
+
|
|
19202
|
+
def _create_column_not_found_note_html(
|
|
19203
|
+
column_name: str, available_columns: list[str], locale: str = "en"
|
|
19204
|
+
) -> str:
|
|
19205
|
+
"""
|
|
19206
|
+
Create an HTML note explaining that a specific column was not found.
|
|
19207
|
+
|
|
19208
|
+
Parameters
|
|
19209
|
+
----------
|
|
19210
|
+
column_name
|
|
19211
|
+
The column name that was not found.
|
|
19212
|
+
available_columns
|
|
19213
|
+
List of available column names in the table.
|
|
19214
|
+
locale
|
|
19215
|
+
The locale string (e.g., 'en', 'fr').
|
|
19216
|
+
|
|
19217
|
+
Returns
|
|
19218
|
+
-------
|
|
19219
|
+
str
|
|
19220
|
+
HTML-formatted note text.
|
|
19221
|
+
"""
|
|
19222
|
+
# Get translated strings
|
|
19223
|
+
intro = NOTES_TEXT.get("target_column_provided", {}).get(
|
|
19224
|
+
locale, NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided")
|
|
19225
|
+
)
|
|
19226
|
+
not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
|
|
19227
|
+
locale,
|
|
19228
|
+
NOTES_TEXT.get("does_not_match_any_columns", {}).get(
|
|
19229
|
+
"en", "does not match any columns in the table"
|
|
19230
|
+
),
|
|
19231
|
+
)
|
|
19232
|
+
|
|
19233
|
+
# Format the column name with monospace font
|
|
19234
|
+
col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
|
|
19235
|
+
|
|
19236
|
+
# Build the HTML note
|
|
19237
|
+
html = f"{intro} ({col_name_html}) {not_found}."
|
|
19238
|
+
|
|
19239
|
+
return html
|
|
19240
|
+
|
|
19241
|
+
|
|
19242
|
+
def _create_column_not_found_note_text(column_name: str, available_columns: list[str]) -> str:
|
|
19243
|
+
"""
|
|
19244
|
+
Create a plain text note explaining that a specific column was not found.
|
|
19245
|
+
|
|
19246
|
+
Parameters
|
|
19247
|
+
----------
|
|
19248
|
+
column_name
|
|
19249
|
+
The column name that was not found.
|
|
19250
|
+
available_columns
|
|
19251
|
+
List of available column names in the table.
|
|
19252
|
+
|
|
19253
|
+
Returns
|
|
19254
|
+
-------
|
|
19255
|
+
str
|
|
19256
|
+
Plain text note.
|
|
19257
|
+
"""
|
|
19258
|
+
return f"The target column provided ({column_name}) does not match any columns in the table."
|
|
19259
|
+
|
|
19260
|
+
|
|
19261
|
+
def _create_comparison_column_not_found_note_html(
|
|
19262
|
+
column_name: str, position: str | None, available_columns: list[str], locale: str = "en"
|
|
19263
|
+
) -> str:
|
|
19264
|
+
"""
|
|
19265
|
+
Create an HTML note explaining that a comparison column was not found.
|
|
19266
|
+
|
|
19267
|
+
Parameters
|
|
19268
|
+
----------
|
|
19269
|
+
column_name
|
|
19270
|
+
The comparison column name that was not found.
|
|
19271
|
+
position
|
|
19272
|
+
Optional position indicator ("left", "right") for between/outside validations.
|
|
19273
|
+
available_columns
|
|
19274
|
+
List of available column names in the table.
|
|
19275
|
+
locale
|
|
19276
|
+
The locale string (e.g., 'en', 'fr').
|
|
19277
|
+
|
|
19278
|
+
Returns
|
|
19279
|
+
-------
|
|
19280
|
+
str
|
|
19281
|
+
HTML-formatted note text.
|
|
19282
|
+
"""
|
|
19283
|
+
# Get translated strings
|
|
19284
|
+
intro = NOTES_TEXT.get("comparison_column_provided", {}).get(
|
|
19285
|
+
locale,
|
|
19286
|
+
NOTES_TEXT.get("comparison_column_provided", {}).get(
|
|
19287
|
+
"en", "The comparison column provided"
|
|
19288
|
+
),
|
|
19289
|
+
)
|
|
19290
|
+
intro_with_for = NOTES_TEXT.get("comparison_column_for", {}).get(
|
|
19291
|
+
locale,
|
|
19292
|
+
NOTES_TEXT.get("comparison_column_for", {}).get("en", "The comparison column provided for"),
|
|
19293
|
+
)
|
|
19294
|
+
not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
|
|
19295
|
+
locale,
|
|
19296
|
+
NOTES_TEXT.get("does_not_match_any_columns", {}).get(
|
|
19297
|
+
"en", "does not match any columns in the table"
|
|
19298
|
+
),
|
|
19299
|
+
)
|
|
19300
|
+
|
|
19301
|
+
# Format the column name with monospace font
|
|
19302
|
+
col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
|
|
19303
|
+
|
|
19304
|
+
# Add position if provided (for between/outside validations)
|
|
19305
|
+
if position:
|
|
19306
|
+
# Format position parameter with monospace font (e.g., "left=", "right=")
|
|
19307
|
+
position_param = (
|
|
19308
|
+
f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{position}=</code>"
|
|
19309
|
+
)
|
|
19310
|
+
# Use the "for" version of the intro text
|
|
19311
|
+
html = f"{intro_with_for} {position_param} ({col_name_html}) {not_found}."
|
|
19312
|
+
else:
|
|
19313
|
+
# Use the standard intro text without "for"
|
|
19314
|
+
html = f"{intro} ({col_name_html}) {not_found}."
|
|
19315
|
+
|
|
19316
|
+
return html
|
|
19317
|
+
|
|
19318
|
+
|
|
19319 + def _create_comparison_column_not_found_note_text(
19320 +     column_name: str, position: str | None, available_columns: list[str]
19321 + ) -> str:
19322 +     """
19323 +     Create a plain text note explaining that a comparison column was not found.
19324 +
19325 +     Parameters
19326 +     ----------
19327 +     column_name
19328 +         The comparison column name that was not found.
19329 +     position
19330 +         Optional position indicator ("left", "right") for between/outside validations.
19331 +     available_columns
19332 +         List of available column names in the table.
19333 +
19334 +     Returns
19335 +     -------
19336 +     str
19337 +         Plain text note.
19338 +     """
19339 +     if position:
19340 +         position_text = f" for {position}="
19341 +     else:
19342 +         position_text = ""
19343 +
19344 +     return (
19345 +         f"The comparison column provided{position_text} ({column_name}) "
19346 +         f"does not match any columns in the table."
19347 +     )
19348 +
19349 +
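The plain-text variant above hard-codes its English wording, so the output follows directly from the arguments; for example, with invented column names:

result = _create_comparison_column_not_found_note_text(
    column_name="sale_price", position="left", available_columns=["price", "quantity"]
)
# result == "The comparison column provided for left= (sale_price) does not match any columns in the table."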
19350 + def _create_preprocessing_note_html(
19351 +     original_rows: int,
19352 +     original_cols: int,
19353 +     processed_rows: int,
19354 +     processed_cols: int,
19355 +     locale: str = "en",
19356 + ) -> str:
19357 +     """
19358 +     Create an HTML note showing table dimension changes from preprocessing.
19359 +
19360 +     Parameters
19361 +     ----------
19362 +     original_rows
19363 +         Number of rows in the original table.
19364 +     original_cols
19365 +         Number of columns in the original table.
19366 +     processed_rows
19367 +         Number of rows after preprocessing.
19368 +     processed_cols
19369 +         Number of columns after preprocessing.
19370 +     locale
19371 +         The locale string (e.g., 'en', 'fr').
19372 +
19373 +     Returns
19374 +     -------
19375 +     str
19376 +         HTML-formatted note text.
19377 +     """
19378 +     # Get translated strings
19379 +     precondition_text = NOTES_TEXT.get("precondition_applied", {}).get(
19380 +         locale, NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied")
19381 +     )
19382 +     table_dims_text = NOTES_TEXT.get("table_dimensions", {}).get(
19383 +         locale, NOTES_TEXT.get("table_dimensions", {}).get("en", "table dimensions")
19384 +     )
19385 +
19386 +     # Helper function to get singular or plural form
19387 +     def get_row_text(count: int) -> str:
19388 +         if count == 1:
19389 +             return NOTES_TEXT.get("row", {}).get(locale, NOTES_TEXT.get("row", {}).get("en", "row"))
19390 +         return NOTES_TEXT.get("rows", {}).get(locale, NOTES_TEXT.get("rows", {}).get("en", "rows"))
19391 +
19392 +     def get_col_text(count: int) -> str:
19393 +         if count == 1:
19394 +             return NOTES_TEXT.get("column", {}).get(
19395 +                 locale, NOTES_TEXT.get("column", {}).get("en", "column")
19396 +             )
19397 +         return NOTES_TEXT.get("columns", {}).get(
19398 +             locale, NOTES_TEXT.get("columns", {}).get("en", "columns")
19399 +         )
19400 +
19401 +     # Determine which dimensions changed
19402 +     rows_changed = original_rows != processed_rows
19403 +     cols_changed = original_cols != processed_cols
19404 +
19405 +     # Format original dimensions
19406 +     original_rows_text = get_row_text(original_rows)
19407 +     original_cols_text = get_col_text(original_cols)
19408 +     original_dim = (
19409 +         f'<span style="font-family: monospace;">'
19410 +         f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}]"
19411 +         f"</span>"
19412 +     )
19413 +
19414 +     # Format processed dimensions with bold for changed values
19415 +     processed_rows_text = get_row_text(processed_rows)
19416 +     processed_cols_text = get_col_text(processed_cols)
19417 +
19418 +     if rows_changed:
19419 +         rows_display = f"<strong>{processed_rows:,}</strong> {processed_rows_text}"
19420 +     else:
19421 +         rows_display = f"{processed_rows:,} {processed_rows_text}"
19422 +
19423 +     if cols_changed:
19424 +         cols_display = f"<strong>{processed_cols}</strong> {processed_cols_text}"
19425 +     else:
19426 +         cols_display = f"{processed_cols} {processed_cols_text}"
19427 +
19428 +     processed_dim = f'<span style="font-family: monospace;">[{rows_display}, {cols_display}]</span>'
19429 +
19430 +     # Build the HTML note
19431 +     html = f"{precondition_text}: {table_dims_text} {original_dim} → {processed_dim}."
19432 +
19433 +     return html
19434 +
19435 +
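Assuming the English NOTES_TEXT entries resolve to the defaults shown in the function above, only the dimension that actually changed is wrapped in <strong> tags; a sketch with invented dimensions:

# A 1,000 x 8 table reduced to 250 rows by a precondition; column count unchanged.
note = _create_preprocessing_note_html(original_rows=1000, original_cols=8, processed_rows=250, processed_cols=8)
# Expected shape of the result (English defaults assumed):
#   Precondition applied: table dimensions
#   <span style="font-family: monospace;">[1,000 rows, 8 columns]</span> →
#   <span style="font-family: monospace;">[<strong>250</strong> rows, 8 columns]</span>.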
19436 + def _create_preprocessing_note_text(
19437 +     original_rows: int,
19438 +     original_cols: int,
19439 +     processed_rows: int,
19440 +     processed_cols: int,
19441 + ) -> str:
19442 +     """
19443 +     Create a plain text note showing table dimension changes from preprocessing.
19444 +
19445 +     Parameters
19446 +     ----------
19447 +     original_rows
19448 +         Number of rows in the original table.
19449 +     original_cols
19450 +         Number of columns in the original table.
19451 +     processed_rows
19452 +         Number of rows after preprocessing.
19453 +     processed_cols
19454 +         Number of columns after preprocessing.
19455 +
19456 +     Returns
19457 +     -------
19458 +     str
19459 +         Plain text note.
19460 +     """
19461 +     # Get singular or plural forms
19462 +     original_rows_text = "row" if original_rows == 1 else "rows"
19463 +     original_cols_text = "column" if original_cols == 1 else "columns"
19464 +     processed_rows_text = "row" if processed_rows == 1 else "rows"
19465 +     processed_cols_text = "column" if processed_cols == 1 else "columns"
19466 +
19467 +     return (
19468 +         f"Precondition applied: table dimensions "
19469 +         f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}] → "
19470 +         f"[{processed_rows:,} {processed_rows_text}, {processed_cols} {processed_cols_text}]."
19471 +     )
19472 +
19473 +
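The text counterpart uses fixed English wording, so with the same invented dimensions as above:

note = _create_preprocessing_note_text(1000, 8, 250, 8)
# note == "Precondition applied: table dimensions [1,000 rows, 8 columns] → [250 rows, 8 columns]."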
19474 + def _create_preprocessing_no_change_note_html(locale: str = "en") -> str:
19475 +     """
19476 +     Create an HTML note indicating preprocessing was applied with no dimension change.
19477 +
19478 +     Parameters
19479 +     ----------
19480 +     locale
19481 +         The locale string (e.g., 'en', 'fr').
19482 +
19483 +     Returns
19484 +     -------
19485 +     str
19486 +         HTML-formatted note text.
19487 +     """
19488 +     # Get translated string
19489 +     note_text = NOTES_TEXT.get("precondition_applied_no_change", {}).get(
19490 +         locale,
19491 +         NOTES_TEXT.get("precondition_applied_no_change", {}).get(
19492 +             "en", "Precondition applied: no table dimension change"
19493 +         ),
19494 +     )
19495 +
19496 +     return f"{note_text}."
19497 +
19498 +
19499 + def _create_preprocessing_no_change_note_text() -> str:
19500 +     """
19501 +     Create a plain text note indicating preprocessing was applied with no dimension change.
19502 +
19503 +     Returns
19504 +     -------
19505 +     str
19506 +         Plain text note.
19507 +     """
19508 +     return "Precondition applied: no table dimension change."
19509 +
19510 +
19511 + def _create_synthetic_target_column_note_html(column_name: str, locale: str = "en") -> str:
19512 +     """
19513 +     Create an HTML note indicating that the target column was created via preprocessing.
19514 +
19515 +     Parameters
19516 +     ----------
19517 +     column_name
19518 +         The name of the synthetic target column.
19519 +     locale
19520 +         The locale string (e.g., 'en', 'fr').
19521 +
19522 +     Returns
19523 +     -------
19524 +     str
19525 +         HTML-formatted note text.
19526 +     """
19527 +     # Get translated strings
19528 +     synthetic_text = NOTES_TEXT.get("synthetic_target_column", {}).get(
19529 +         locale, NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column")
19530 +     )
19531 +     created_via_text = NOTES_TEXT.get("created_via_preprocessing", {}).get(
19532 +         locale,
19533 +         NOTES_TEXT.get("created_via_preprocessing", {}).get("en", "created via preprocessing"),
19534 +     )
19535 +
19536 +     # Format the column name with monospace font
19537 +     col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19538 +
19539 +     # Build the HTML note
19540 +     html = f"{synthetic_text} {col_name_html} {created_via_text}."
19541 +
19542 +     return html
19543 +
19544 +
19545 + def _create_synthetic_target_column_note_text(column_name: str) -> str:
19546 +     """
19547 +     Create a plain text note indicating that the target column was created via preprocessing.
19548 +
19549 +     Parameters
19550 +     ----------
19551 +     column_name
19552 +         The name of the synthetic target column.
19553 +
19554 +     Returns
19555 +     -------
19556 +     str
19557 +         Plain text note.
19558 +     """
19559 +     return f"Synthetic target column ({column_name}) created via preprocessing."
19560 +
19561 +
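As with the other *_note_text helpers, the synthetic-column note is a fixed English template around the column name; with an invented column name:

note = _create_synthetic_target_column_note_text("row_total")
# note == "Synthetic target column (row_total) created via preprocessing."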
19562 + def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> str:
19563 +     """
19564 +     Create an HTML note with collapsible schema expectation and results.
19565 +
19566 +     This generates a disclosure-style note showing:
19567 +     1. A summary of what failed (if anything)
19568 +     2. The full step report table (collapsible)
19569 +
19570 +     Parameters
19571 +     ----------
19572 +     schema_info
19573 +         The schema validation information dictionary from interrogation.
19574 +     locale
19575 +         The locale string (e.g., 'en', 'fr').
19576 +
19577 +     Returns
19578 +     -------
19579 +     str
19580 +         HTML-formatted note with collapsible schema details.
19581 +     """
19582 +     passed = schema_info["passed"]
19583 +     expect_schema = schema_info["expect_schema"]
19584 +     target_schema = schema_info["target_schema"]
19585 +     params = schema_info["params"]
19586 +     columns_dict = schema_info["columns"]
19587 +     in_order = params["in_order"]
19588 +
19589 +     # Get translations for the locale
19590 +     passed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_passed"].get(
19591 +         locale, VALIDATION_REPORT_TEXT["note_schema_comparison_passed"]["en"]
19592 +     )
19593 +     failed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_failed"].get(
19594 +         locale, VALIDATION_REPORT_TEXT["note_schema_comparison_failed"]["en"]
19595 +     )
19596 +     disclosure_text = VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"].get(
19597 +         locale, VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"]["en"]
19598 +     )
19599 +     settings_title_text = VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"].get(
19600 +         locale, VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"]
19601 +     )
19602 +
19603 +     # Build summary message
19604 +     if passed:
19605 +         summary = f'<span style="color:#4CA64C;">✓</span> {passed_text}.'
19606 +     else:
19607 +         # Analyze what failed
19608 +         failures = []
19609 +
19610 +         # Check column count mismatch
19611 +         n_expect = len(expect_schema)
19612 +         n_target = len(target_schema)
19613 +         if n_expect != n_target:
19614 +             count_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"].get(
19615 +                 locale, VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"]
19616 +             )
19617 +             failures.append(count_mismatch_text.format(n_expect=n_expect, n_target=n_target))
19618 +
19619 +         # Check for unmatched columns
19620 +         unmatched_cols = [col for col, info in columns_dict.items() if not info["colname_matched"]]
19621 +         if unmatched_cols:
19622 +             unmatched_text = VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"].get(
19623 +                 locale, VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"]["en"]
19624 +             )
19625 +             failures.append(unmatched_text.format(n=len(unmatched_cols)))
19626 +
19627 +         # Check for wrong order (if in_order=True)
19628 +         if params["in_order"]:
19629 +             wrong_order = [
19630 +                 col
19631 +                 for col, info in columns_dict.items()
19632 +                 if info["colname_matched"] and not info["index_matched"]
19633 +             ]
19634 +             if wrong_order:
19635 +                 wrong_order_text = VALIDATION_REPORT_TEXT["note_schema_wrong_order"].get(
19636 +                     locale, VALIDATION_REPORT_TEXT["note_schema_wrong_order"]["en"]
19637 +                 )
19638 +                 failures.append(wrong_order_text.format(n=len(wrong_order)))
19639 +
19640 +         # Check for dtype mismatches
19641 +         dtype_mismatches = [
19642 +             col
19643 +             for col, info in columns_dict.items()
19644 +             if info["colname_matched"] and info["dtype_present"] and not info["dtype_matched"]
19645 +         ]
19646 +         if dtype_mismatches:
19647 +             dtype_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"].get(
19648 +                 locale, VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"]["en"]
19649 +             )
19650 +             failures.append(dtype_mismatch_text.format(n=len(dtype_mismatches)))
19651 +
19652 +         if failures:
19653 +             summary = (
19654 +                 f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
19655 +             )
19656 +         else:
19657 +             summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.'
19658 +
19659 +     # Generate the step report table using the existing function
19660 +     # We'll call either _step_report_schema_in_order or _step_report_schema_any_order
19661 +     # depending on the in_order parameter
19662 +     if in_order:
19663 +         step_report_gt = _step_report_schema_in_order(
19664 +             step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19665 +         )
19666 +     else:
19667 +         step_report_gt = _step_report_schema_any_order(
19668 +             step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19669 +         )
19670 +
19671 +     # Generate the settings HTML using the existing function
19672 +     settings_html = _create_col_schema_match_params_html(
19673 +         lang=locale,
19674 +         complete=params["complete"],
19675 +         in_order=params["in_order"],
19676 +         case_sensitive_colnames=params["case_sensitive_colnames"],
19677 +         case_sensitive_dtypes=params["case_sensitive_dtypes"],
19678 +         full_match_dtypes=params["full_match_dtypes"],
19679 +     )
19680 +
19681 +     # Remove the inner div containing column_schema_match_str
19682 +     settings_html = re.sub(r'<div style="margin-right: 5px;">.*?</div>', "", settings_html, count=1)
19683 +
19684 +     # Change padding-top from 7px to 2px
19685 +     settings_html = settings_html.replace("padding-top: 7px;", "padding-top: 2px;")
19686 +
19687 +     # Create new source note HTML that includes both settings and schema
19688 +     source_note_html = f"""
19689 +     <div style='padding-bottom: 2px;'>{settings_title_text}</div>
19690 +     <div style='padding-bottom: 4px;'>{settings_html}</div>
19691 +     """
19692 +
19693 +     # Add the settings as an additional source note to the step report
19694 +     step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html))
19695 +
19696 +     # Extract the HTML from the GT object
19697 +     step_report_html = step_report_gt._repr_html_()
19698 +
19699 +     # Create collapsible section with the step report
19700 +     note_html = f"""
19701 +     {summary}
19702 +
19703 +     <details style="margin-top: 2px; margin-bottom: 8px; font-size: 12px; text-indent: 12px;">
19704 +     <summary style="cursor: pointer; font-weight: bold; color: #555; margin-bottom: -5px;">{disclosure_text}</summary>
19705 +     <div style="margin-top: 6px; padding-left: 15px; padding-right: 15px;">
19706 +
19707 +     {step_report_html}
19708 +
19709 +     </div>
19710 +     </details>
19711 +     """
19712 +
19713 +     return note_html.strip()
19714 +
19715 +
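When the comparison fails, the individual failure messages collected above are joined with commas after the failed-comparison label; schematically, with placeholder strings standing in for the real VALIDATION_REPORT_TEXT entries:

failed_text = "(schema comparison failed label)"
failures = ["(column count message)", "(dtype mismatch message)"]
summary = f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
# -> ✗ (schema comparison failed label): (column count message), (dtype mismatch message).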
19716 + def _create_col_schema_match_note_text(schema_info: dict) -> str:
19717 +     """
19718 +     Create a plain text note for schema validation.
19719 +
19720 +     Parameters
19721 +     ----------
19722 +     schema_info
19723 +         The schema validation information dictionary from interrogation.
19724 +
19725 +     Returns
19726 +     -------
19727 +     str
19728 +         Plain text note.
19729 +     """
19730 +     passed = schema_info["passed"]
19731 +     expect_schema = schema_info["expect_schema"]
19732 +     target_schema = schema_info["target_schema"]
19733 +
19734 +     if passed:
19735 +         return f"Schema validation passed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
19736 +     else:
19737 +         return f"Schema validation failed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
19738 +
19739 +
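The plain-text schema note reports only pass/fail plus the expected and observed column counts; with a toy schema_info carrying just the keys this helper reads:

# Toy schema_info for illustration; real dictionaries from interrogation carry more fields.
schema_info = {
    "passed": False,
    "expect_schema": [("a", "Int64"), ("b", "String"), ("c", "Float64")],
    "target_schema": [("a", "Int64"), ("b", "String"), ("c", "Float64"), ("d", "Boolean")],
}
_create_col_schema_match_note_text(schema_info)
# -> "Schema validation failed. Expected 3 column(s), found 4."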
18436 19740   def _step_report_row_based(
18437 19741       assertion_type: str,
18438 19742       i: int,

@@ -18880,16 +20184,33 @@ def _step_report_schema_in_order(

18880 20184       dtype_exp = []
18881 20185       dtype_exp_correct = []
18882 20186
18883 -     for i in range(len(
20187 +     for i in range(len(expect_schema)):
18884 20188           #
18885 20189           # `col_name_exp` values
18886 20190           #
18887 20191
18888 -         #
18889 -
18890 -         col_name_exp.append(
20192 +         # Get the column name from expect_schema (which can have duplicates)
20193 +         column_name_exp_i = expect_schema[i][0]
20194 +         col_name_exp.append(column_name_exp_i)
20195 +
20196 +         # Check if this column exists in exp_columns_dict (it might not if it's a duplicate)
20197 +         # For duplicates, we need to handle them specially
20198 +         if column_name_exp_i not in exp_columns_dict:
20199 +             # This is a duplicate or invalid column, mark it as incorrect
20200 +             col_exp_correct.append(CROSS_MARK_SPAN)
18891 20201
18892 -
20202 +             # For dtype, check if there's a dtype specified in the schema
20203 +             if len(expect_schema[i]) > 1:
20204 +                 dtype_value = expect_schema[i][1]
20205 +                 if isinstance(dtype_value, list):
20206 +                     dtype_exp.append(" | ".join(dtype_value))
20207 +                 else:
20208 +                     dtype_exp.append(str(dtype_value))
20209 +             else:
20210 +                 dtype_exp.append("—")
20211 +
20212 +             dtype_exp_correct.append("—")
20213 +             continue
18893 20214
18894 20215           #
18895 20216           # `col_exp_correct` values
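In the new branch above, a schema entry's dtype may be a single string or a list of acceptable dtypes, and lists are rendered joined with " | ". The display logic, isolated from the loop with invented entries:

# Invented (name, dtype) entries; the one-element tuple has no dtype specified.
for entry in [("col_a", "Int64"), ("col_b", ["Int64", "Float64"]), ("col_c",)]:
    if len(entry) > 1:
        dtype_value = entry[1]
        display = " | ".join(dtype_value) if isinstance(dtype_value, list) else str(dtype_value)
    else:
        display = "—"
    print(display)
# Int64
# Int64 | Float64
# —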