pointblank-0.13.2-py3-none-any.whl → pointblank-0.13.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +16 -1
- pointblank/_utils.py +40 -0
- pointblank/validate.py +385 -159
- pointblank/yaml.py +154 -44
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/METADATA +2 -2
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/RECORD +11 -12
- pointblank/_constants_docs.py +0 -40
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/WHEEL +0 -0
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -10,6 +10,7 @@ import re
 import tempfile
 import threading
 from dataclasses import dataclass
+from enum import Enum
 from importlib.metadata import version
 from typing import TYPE_CHECKING, Any, Callable, Literal
 from zipfile import ZipFile
@@ -74,6 +75,7 @@ from pointblank._utils import (
     _check_any_df_lib,
     _check_invalid_fields,
     _column_test_prep,
+    _copy_dataframe,
     _count_null_values_in_column,
     _count_true_values_in_column,
     _derive_bounds,
@@ -2006,9 +2008,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
             # Apply the appropriate conversion method
             if use_polars_conversion:
-                null_sum_converted = null_sum.to_polars()
+                null_sum_converted = null_sum.to_polars()  # pragma: no cover
             else:
-                null_sum_converted = null_sum.to_pandas()
+                null_sum_converted = null_sum.to_pandas()  # pragma: no cover
 
             missing_prop = (null_sum_converted / sector_size) * 100
             col_missing_props.append(missing_prop)
@@ -2025,9 +2027,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
             # Apply the appropriate conversion method
             if use_polars_conversion:
-                null_sum_converted = null_sum.to_polars()
+                null_sum_converted = null_sum.to_polars()  # pragma: no cover
             else:
-                null_sum_converted = null_sum.to_pandas()
+                null_sum_converted = null_sum.to_pandas()  # pragma: no cover
 
             missing_prop = (null_sum_converted / sector_size) * 100
             col_missing_props.append(missing_prop)
@@ -2040,9 +2042,13 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
     # Use the helper function based on the DataFrame library
    if df_lib_name_gt == "polars":
-        missing_vals = _calculate_missing_proportions(use_polars_conversion=True)
+        missing_vals = _calculate_missing_proportions(
+            use_polars_conversion=True
+        )  # pragma: no cover
    else:
-        missing_vals = _calculate_missing_proportions(use_polars_conversion=False)
+        missing_vals = _calculate_missing_proportions(
+            use_polars_conversion=False
+        )  # pragma: no cover
 
    # Pivot the `missing_vals` dictionary to create a table with the missing value proportions
    missing_vals = {
@@ -2055,9 +2061,13 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
    # Get a dictionary of counts of missing values in each column
    if df_lib_name_gt == "polars":
-        missing_val_counts = {col: data[col].isnull().sum().to_polars() for col in data.columns}
+        missing_val_counts = {
+            col: data[col].isnull().sum().to_polars() for col in data.columns
+        }  # pragma: no cover
    else:
-        missing_val_counts = {col: data[col].isnull().sum().to_pandas() for col in data.columns}
+        missing_val_counts = {
+            col: data[col].isnull().sum().to_pandas() for col in data.columns
+        }  # pragma: no cover
 
    if pl_pb_tbl:
        # Get the column names from the table
@@ -2429,10 +2439,10 @@ def _get_column_names_safe(data: Any) -> list[str]:
        if hasattr(df_nw, "collect_schema"):
            return list(df_nw.collect_schema().keys())
        else:
-            return list(df_nw.columns)
-    except Exception:
+            return list(df_nw.columns)  # pragma: no cover
+    except Exception:  # pragma: no cover
        # Fallback to direct column access
-        return list(data.columns)
+        return list(data.columns)  # pragma: no cover
 
 
 def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
@@ -2633,7 +2643,7 @@ def get_column_count(data: FrameT | Any) -> int:
        if hasattr(df_nw, "collect_schema"):
            return len(df_nw.collect_schema())
        else:
-            return len(df_nw.columns)
+            return len(df_nw.columns)  # pragma: no cover
    except Exception:
        # Fallback for unsupported types
        if "pandas" in str(type(data)):
@@ -2642,6 +2652,48 @@ def get_column_count(data: FrameT | Any) -> int:
        raise ValueError("The input table type supplied in `data=` is not supported.")
 
 
+def _extract_enum_values(set_values: Any) -> list[Any]:
+    """
+    Extract values from Enum classes or collections containing Enum instances.
+
+    This helper function handles:
+    1. Enum classes: extracts all enum values
+    2. Collections containing Enum instances: extracts their values
+    3. Regular collections: returns as-is
+
+    Parameters
+    ----------
+    set_values
+        The input collection that may contain Enum class or Enum instances.
+
+    Returns
+    -------
+    list[Any]
+        A list of extracted values
+    """
+    from collections.abc import Collection
+
+    # Check if set_values is an Enum class (not an instance)
+    if inspect.isclass(set_values) and issubclass(set_values, Enum):
+        # Extract all values from the Enum class
+        return [enum_member.value for enum_member in set_values]
+
+    # Check if set_values is a collection
+    if isinstance(set_values, Collection) and not isinstance(set_values, (str, bytes)):
+        extracted_values = []
+        for item in set_values:
+            if isinstance(item, Enum):
+                # If item is an Enum instance, extract its value
+                extracted_values.append(item.value)
+            else:
+                # If item is not an Enum instance, keep as-is
+                extracted_values.append(item)
+        return extracted_values
+
+    # If set_values is neither an Enum class nor a collection, return as list
+    return [set_values]
+
+
 def get_row_count(data: FrameT | Any) -> int:
    """
    Get the number of rows in a table.
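The `_extract_enum_values()` helper added above is self-contained, so its behavior is easy to check in isolation. A minimal sketch, condensed from the hunk (the `Color` enum is invented for illustration):

```python
import inspect
from collections.abc import Collection
from enum import Enum


def extract_enum_values(set_values):
    # Condensed copy of the _extract_enum_values() logic shown above
    if inspect.isclass(set_values) and issubclass(set_values, Enum):
        return [m.value for m in set_values]
    if isinstance(set_values, Collection) and not isinstance(set_values, (str, bytes)):
        return [v.value if isinstance(v, Enum) else v for v in set_values]
    return [set_values]


class Color(Enum):  # invented for illustration
    RED = "red"
    GREEN = "green"
    BLUE = "blue"


# An Enum class yields every member's value, in definition order
assert extract_enum_values(Color) == ["red", "green", "blue"]

# Enum instances mixed with plain values are unwrapped in place
assert extract_enum_values([Color.RED, "yellow"]) == ["red", "yellow"]

# Strings are not treated as collections; scalars are wrapped in a list
assert extract_enum_values("red") == ["red"]
```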
@@ -2806,11 +2858,11 @@ def get_row_count(data: FrameT | Any) -> int:
        # Try different ways to get row count
        if hasattr(df_nw, "shape"):
            return df_nw.shape[0]
-        elif hasattr(df_nw, "height"):
+        elif hasattr(df_nw, "height"):  # pragma: no cover
            return df_nw.height  # pragma: no cover
        else:  # pragma: no cover
            raise ValueError("Unable to determine row count from Narwhals DataFrame")
-    except Exception:
+    except Exception:  # pragma: no cover
        # Fallback for types that don't work with Narwhals
        if "pandas" in str(type(data)):  # pragma: no cover
            return data.shape[0]
@@ -6324,7 +6376,10 @@ class Validate:
            multiple columns are supplied or resolved, there will be a separate validation step
            generated for each column.
        set
-            A
+            A collection of values to compare against. Can be a list of values, a Python Enum class,
+            or a collection containing Enum instances. When an Enum class is provided, all enum
+            values will be used. When a collection contains Enum instances, their values will be
+            extracted automatically.
        pre
            An optional preprocessing function or lambda to apply to the data table during
            interrogation. This function should take a table as input and return a modified table.
@@ -6501,12 +6556,69 @@ class Validate:
 
        The validation table reports two failing test units. The specific failing cases are for the
        column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
+
+        **Using Python Enums**
+
+        The `col_vals_in_set()` method also supports Python Enum classes and instances, which can
+        make validations more readable and maintainable:
+
+        ```{python}
+        from enum import Enum
+
+        class Color(Enum):
+            RED = "red"
+            GREEN = "green"
+            BLUE = "blue"
+
+        # Create a table with color data
+        tbl_colors = pl.DataFrame({
+            "product": ["shirt", "pants", "hat", "shoes"],
+            "color": ["red", "blue", "green", "yellow"]
+        })
+
+        # Validate using an Enum class (all enum values are allowed)
+        validation = (
+            pb.Validate(data=tbl_colors)
+            .col_vals_in_set(columns="color", set=Color)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This validation will fail for the `"yellow"` value since it's not in the `Color` enum.
+
+        You can also use specific Enum instances or mix them with regular values:
+
+        ```{python}
+        # Validate using specific Enum instances
+        validation = (
+            pb.Validate(data=tbl_colors)
+            .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE])
+            .interrogate()
+        )
+
+        # Mix Enum instances with regular values
+        validation = (
+            pb.Validate(data=tbl_colors)
+            .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE, "yellow"])
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        In this case, the `"green"` value will cause a failing test unit since it's not part of the
+        specified set.
        """
 
        assertion_type = _get_fn_name()
 
        _check_column(column=columns)
 
+        # Extract values from Enum classes or Enum instances if present
+        set = _extract_enum_values(set)
+
        for val in set:
            if val is None:
                continue
@@ -6557,7 +6669,7 @@ class Validate:
    def col_vals_not_in_set(
        self,
        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
-        set:
+        set: Collection[Any],
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
        thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -6581,7 +6693,10 @@ class Validate:
            multiple columns are supplied or resolved, there will be a separate validation step
            generated for each column.
        set
-            A
+            A collection of values to compare against. Can be a list of values, a Python Enum class,
+            or a collection containing Enum instances. When an Enum class is provided, all enum
+            values will be used. When a collection contains Enum instances, their values will be
+            extracted automatically.
        pre
            An optional preprocessing function or lambda to apply to the data table during
            interrogation. This function should take a table as input and return a modified table.
@@ -6759,11 +6874,45 @@ class Validate:
 
        The validation table reports two failing test units. The specific failing cases are for the
        column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
+
+        **Using Python Enums**
+
+        Like `col_vals_in_set()`, this method also supports Python Enum classes and instances:
+
+        ```{python}
+        from enum import Enum
+
+        class InvalidStatus(Enum):
+            DELETED = "deleted"
+            ARCHIVED = "archived"
+
+        # Create a table with status data
+        status_table = pl.DataFrame({
+            "product": ["widget", "gadget", "tool", "device"],
+            "status": ["active", "pending", "deleted", "active"]
+        })
+
+        # Validate that no values are in the invalid status set
+        validation = (
+            pb.Validate(data=status_table)
+            .col_vals_not_in_set(columns="status", set=InvalidStatus)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The `"deleted"` value in the `status` column will fail since it matches one of the invalid
+        statuses in the `InvalidStatus` enum.
        """
 
        assertion_type = _get_fn_name()
 
        _check_column(column=columns)
+
+        # Extract values from Enum classes or Enum instances if present
+        set = _extract_enum_values(set)
+
        _check_set_types(set=set)
        _check_pre(pre=pre)
        # TODO: add check for segments
@@ -7297,6 +7446,7 @@ class Validate:
        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
        pattern: str,
        na_pass: bool = False,
+        inverse: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
        thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7324,6 +7474,9 @@ class Validate:
        na_pass
            Should any encountered None, NA, or Null values be considered as passing test units? By
            default, this is `False`. Set to `True` to pass test units with missing values.
+        inverse
+            Should the validation step be inverted? If `True`, then the expectation is that column
+            values should *not* match the specified `pattern=` regex.
        pre
            An optional preprocessing function or lambda to apply to the data table during
            interrogation. This function should take a table as input and return a modified table.
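The new `inverse=` flag reads naturally in a validation plan. A hedged sketch of its intended use, based on the docstring above (the table and pattern here are invented for illustration):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"code": ["AB-100", "CD-200", "tmp-1", "EF-300"]})

# Expect that no value matches the "temporary" pattern: with inverse=True,
# rows that *do* match `^tmp-` count as failing test units
validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(columns="code", pattern=r"^tmp-", inverse=True)
    .interrogate()
)

validation
```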
@@ -7510,6 +7663,7 @@ class Validate:
        # _check_segments(segments=segments)
        _check_thresholds(thresholds=thresholds)
        _check_boolean_input(param=na_pass, param_name="na_pass")
+        _check_boolean_input(param=inverse, param_name="inverse")
        _check_boolean_input(param=active, param_name="active")
 
        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
@@ -7529,12 +7683,15 @@ class Validate:
        # Determine brief to use (global or local) and transform any shorthands of `brief=`
        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
 
+        # Package up the `pattern=` and boolean params into a dictionary for later interrogation
+        values = {"pattern": pattern, "inverse": inverse}
+
        # Iterate over the columns and create a validation step for each
        for column in columns:
            val_info = _ValidationInfo(
                assertion_type=assertion_type,
                column=column,
-                values=pattern,
+                values=values,
                na_pass=na_pass,
                pre=pre,
                segments=segments,
@@ -8401,8 +8558,8 @@ class Validate:
            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
        )
 
-        if columns_subset is not None and isinstance(columns_subset, str):
-            columns_subset = [columns_subset]
+        if columns_subset is not None and isinstance(columns_subset, str):  # pragma: no cover
+            columns_subset = [columns_subset]  # pragma: no cover
 
        # TODO: incorporate Column object
 
@@ -9830,8 +9987,9 @@ class Validate:
                validation.active = False
                continue
 
-            # Make a copy of the table for this step
-
+            # Make a deep copy of the table for this step to ensure proper isolation
+            # This prevents modifications from one validation step affecting others
+            data_tbl_step = _copy_dataframe(data_tbl)
 
            # ------------------------------------------------
            # Preprocessing stage
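`_copy_dataframe()` is imported from `pointblank/_utils.py` (also touched in this release), so its body is not shown in this diff. A sketch of the isolation problem the per-step deep copy addresses, using `pandas.DataFrame.copy(deep=True)` as a stand-in for the helper:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

def bad_pre(tbl):
    # In-place mutation: without per-step copies, this change would leak
    # into every later validation step that reuses the same table object
    tbl["a"] = tbl["a"] * 10
    return tbl

step_tbl = df.copy(deep=True)  # stand-in for _copy_dataframe(data_tbl)
bad_pre(step_tbl)

assert df["a"].tolist() == [1, 2, 3]           # original is untouched
assert step_tbl["a"].tolist() == [10, 20, 30]  # only the step's copy changed
```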
@@ -9998,7 +10156,7 @@ class Validate:
 
                elif assertion_type == "col_vals_regex":
                    results_tbl = interrogate_regex(
-                        tbl=tbl, column=column,
+                        tbl=tbl, column=column, values=value, na_pass=na_pass
                    )
 
                elif assertion_type == "col_vals_expr":
@@ -10096,7 +10254,9 @@ class Validate:
                    )
 
                else:
-                    raise ValueError(f"Unknown assertion type: {assertion_type}")
+                    raise ValueError(
+                        f"Unknown assertion type: {assertion_type}"
+                    )  # pragma: no cover
 
            except Exception as e:
                # Only catch specific data quality comparison errors, not programming errors
@@ -10111,14 +10271,18 @@ class Validate:
                    or ("dtype" in error_msg and "compare" in error_msg)
                )
 
-                if is_comparison_error:
+                if is_comparison_error:  # pragma: no cover
                    # If data quality comparison fails, mark the validation as having an eval_error
-                    validation.eval_error = True
-                    end_time = datetime.datetime.now(datetime.timezone.utc)
-                    validation.proc_duration_s = (end_time - start_time).total_seconds()
-                    validation.time_processed = end_time.isoformat(timespec="milliseconds")
-                    validation.active = False
-                    continue
+                    validation.eval_error = True  # pragma: no cover
+                    end_time = datetime.datetime.now(datetime.timezone.utc)  # pragma: no cover
+                    validation.proc_duration_s = (
+                        end_time - start_time
+                    ).total_seconds()  # pragma: no cover
+                    validation.time_processed = end_time.isoformat(
+                        timespec="milliseconds"
+                    )  # pragma: no cover
+                    validation.active = False  # pragma: no cover
+                    continue  # pragma: no cover
                else:
                    # For other errors (like missing columns), let them propagate
                    raise
@@ -10363,32 +10527,46 @@ class Validate:
                except AttributeError:
                    # For LazyFrames without sample method, collect first then sample
                    validation_extract_native = validation_extract_nw.collect().to_native()
-                    if hasattr(validation_extract_native, "sample"):
+                    if hasattr(validation_extract_native, "sample"):  # pragma: no cover
                        # PySpark DataFrame has sample method
-                        validation_extract_native = validation_extract_native.sample(
-                            fraction=min(1.0, sample_n / validation_extract_native.count())
-                        ).limit(sample_n)
-                        validation_extract_nw = nw.from_native(validation_extract_native)
+                        validation_extract_native = (
+                            validation_extract_native.sample(  # pragma: no cover
+                                fraction=min(
+                                    1.0, sample_n / validation_extract_native.count()
+                                )  # pragma: no cover
+                            ).limit(sample_n)
+                        )  # pragma: no cover
+                        validation_extract_nw = nw.from_native(
+                            validation_extract_native
+                        )  # pragma: no cover
                    else:
                        # Fallback: just take first n rows after collecting
-                        validation_extract_nw = validation_extract_nw.collect().head(sample_n)
+                        validation_extract_nw = validation_extract_nw.collect().head(
+                            sample_n
+                        )  # pragma: no cover
            elif sample_frac is not None:
                try:
                    validation_extract_nw = validation_extract_nw.sample(fraction=sample_frac)
-                except AttributeError:
+                except AttributeError:  # pragma: no cover
                    # For LazyFrames without sample method, collect first then sample
-                    validation_extract_native = validation_extract_nw.collect().to_native()
-                    if hasattr(validation_extract_native, "sample"):
+                    validation_extract_native = (
+                        validation_extract_nw.collect().to_native()
+                    )  # pragma: no cover
+                    if hasattr(validation_extract_native, "sample"):  # pragma: no cover
                        # PySpark DataFrame has sample method
                        validation_extract_native = validation_extract_native.sample(
                            fraction=sample_frac
-                        )
-                        validation_extract_nw = nw.from_native(validation_extract_native)
+                        )  # pragma: no cover
+                        validation_extract_nw = nw.from_native(
+                            validation_extract_native
+                        )  # pragma: no cover
                    else:
                        # Fallback: use fraction to calculate head size
-                        collected = validation_extract_nw.collect()
-                        sample_size = max(1, int(len(collected) * sample_frac))
-                        validation_extract_nw = collected.head(sample_size)
+                        collected = validation_extract_nw.collect()  # pragma: no cover
+                        sample_size = max(
+                            1, int(len(collected) * sample_frac)
+                        )  # pragma: no cover
+                        validation_extract_nw = collected.head(sample_size)  # pragma: no cover
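The fraction-based fallback above is easy to exercise directly with Polars. A minimal sketch of the collect-then-head pattern:

```python
import polars as pl

lf = pl.LazyFrame({"x": range(100)})
sample_frac = 0.1

# LazyFrames have no sample() in this path, so the code collects first,
# then derives a head size from the requested fraction (minimum of 1 row)
collected = lf.collect()
sample_size = max(1, int(len(collected) * sample_frac))
extract = collected.head(sample_size)

assert len(extract) == 10
```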
@@ -10398,9 +10576,9 @@ class Validate:
            # For LazyFrames, collect to get length (or use a reasonable default)
            try:
                extract_length = len(validation_extract_nw.collect())
-            except Exception:
+            except Exception:  # pragma: no cover
                # If collection fails, apply limit anyway as a safety measure
-                extract_length = extract_limit + 1  #
+                extract_length = extract_limit + 1  # pragma: no cover
 
        if extract_length > extract_limit:
            validation_extract_nw = validation_extract_nw.head(extract_limit)
@@ -12065,10 +12243,12 @@ class Validate:
        try:
            # Try without order_by first (for DataFrames)
            data_nw = data_nw.with_row_index(name=index_name)
-        except TypeError:
+        except TypeError:  # pragma: no cover
            # LazyFrames require order_by parameter - use first column for ordering
-            first_col = data_nw.columns[0]
-            data_nw = data_nw.with_row_index(name=index_name, order_by=first_col)
+            first_col = data_nw.columns[0]  # pragma: no cover
+            data_nw = data_nw.with_row_index(
+                name=index_name, order_by=first_col
+            )  # pragma: no cover
 
        # Get all validation step result tables and join together the `pb_is_good_` columns
        # ensuring that the columns are named uniquely (e.g., `pb_is_good_1`, `pb_is_good_2`, ...)
@@ -12080,10 +12260,12 @@ class Validate:
            try:
                # Try without order_by first (for DataFrames)
                results_tbl = results_tbl.with_row_index(name=index_name)
-            except TypeError:
+            except TypeError:  # pragma: no cover
                # LazyFrames require order_by parameter - use first column for ordering
-                first_col = results_tbl.columns[0]
-                results_tbl = results_tbl.with_row_index(name=index_name, order_by=first_col)
+                first_col = results_tbl.columns[0]  # pragma: no cover
+                results_tbl = results_tbl.with_row_index(
+                    name=index_name, order_by=first_col
+                )  # pragma: no cover
 
            # Add numerical suffix to the `pb_is_good_` column to make it unique
            results_tbl = results_tbl.select([index_name, "pb_is_good_"]).rename(
@@ -12215,15 +12397,15 @@ class Validate:
        # If the table is a Polars one, determine if it's a LazyFrame
        if tbl_info == "polars":
            if _is_lazy_frame(self.data):
-                tbl_info = "polars-lazy"
+                tbl_info = "polars-lazy"  # pragma: no cover
 
        # Determine if the input table is a Narwhals DF
        if _is_narwhals_table(self.data):
            # Determine if the Narwhals table is a LazyFrame
-            if _is_lazy_frame(self.data):
-                tbl_info = "narwhals-lazy"
+            if _is_lazy_frame(self.data):  # pragma: no cover
+                tbl_info = "narwhals-lazy"  # pragma: no cover
            else:
-                tbl_info = "narwhals"
+                tbl_info = "narwhals"  # pragma: no cover
 
        # Get the thresholds object
        thresholds = self.thresholds
@@ -12388,7 +12570,7 @@ class Validate:
        if lang in RTL_LANGUAGES:
            gt_tbl = gt_tbl.tab_style(
                style=style.css("direction: rtl;"), locations=loc.source_notes()
-            )
+            )  # pragma: no cover
 
        if incl_header:
            gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
@@ -12537,6 +12719,11 @@ class Validate:
            elif assertion_type[i] in ["specially"]:
                values_upd.append("EXPR")
 
+            elif assertion_type[i] in ["col_vals_regex"]:
+                pattern = value["pattern"]
+
+                values_upd.append(str(pattern))
+
            # If the assertion type is not recognized, add the value as a string
            else:
                values_upd.append(str(value))
@@ -12705,9 +12892,11 @@ class Validate:
            # Get the number of rows in the extract (safe for LazyFrames)
            try:
                n_rows = len(extract_nw)
-            except TypeError:
+            except TypeError:  # pragma: no cover
                # For LazyFrames, collect() first to get length
-                n_rows = len(extract_nw.collect()) if hasattr(extract_nw, "collect") else 0
+                n_rows = (
+                    len(extract_nw.collect()) if hasattr(extract_nw, "collect") else 0
+                )  # pragma: no cover
 
            # If the number of rows is zero, then produce an em dash then go to the next iteration
            if n_rows == 0:
@@ -12715,7 +12904,7 @@ class Validate:
                continue
 
            # Write the CSV text (ensure LazyFrames are collected first)
-            if hasattr(extract_nw, "collect"):
+            if hasattr(extract_nw, "collect"):  # pragma: no cover
                extract_nw = extract_nw.collect()
            csv_text = extract_nw.write_csv()
 
@@ -13217,7 +13406,7 @@ class Validate:
            elif isinstance(column, list):
                column_position = [list(self.data.columns).index(col) + 1 for col in column]
            else:
-                column_position = None
+                column_position = None  # pragma: no cover
        else:
            column_position = None
 
@@ -13309,7 +13498,7 @@ class Validate:
            )
 
        else:
-            step_report = None
+            step_report = None  # pragma: no cover
 
        return step_report
 
@@ -13797,7 +13986,7 @@ def _conditional_string_date_dttm_conversion(
    elif not allow_regular_strings:
        raise ValueError(
            "If `value=` is provided as a string it must be a date or datetime string."
-        )
+        )  # pragma: no cover
    # If allow_regular_strings is True, regular strings pass through unchanged
 
    return value
@@ -13851,12 +14040,33 @@ def _process_brief(
 
    if segment is not None:
        # The segment is always a tuple of the form ("{column}", "{value}")
+        # Handle both regular lists and Segment objects (from seg_group())
+
+        segment_column = segment[0]
+        segment_value = segment[1]
+
+        # If segment_value is a Segment object (from seg_group()), format it appropriately
+        if isinstance(segment_value, Segment):
+            # For Segment objects, format the segments as a readable string
+            segments = segment_value.segments
+            if len(segments) == 1:
+                # Single segment: join the values with commas
+                segment_value_str = ", ".join(str(v) for v in segments[0])
+            else:
+                # Multiple segments: join each segment with commas, separate segments with " | "
+                segment_value_str = " | ".join([", ".join(str(v) for v in seg) for seg in segments])
+        else:
+            # For regular lists or other types, convert to string
+            if isinstance(segment_value, list):
+                segment_value_str = ", ".join(str(v) for v in segment_value)
+            else:
+                segment_value_str = str(segment_value)
 
-        segment_fmt = f"{
+        segment_fmt = f"{segment_column} / {segment_value_str}"
 
        brief = brief.replace("{segment}", segment_fmt)
-        brief = brief.replace("{segment_column}",
-        brief = brief.replace("{segment_value}",
+        brief = brief.replace("{segment_column}", segment_column)
+        brief = brief.replace("{segment_value}", segment_value_str)
 
    return brief
 
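A standalone sketch of the `{segment}` formatting rules introduced above; the `Segment` dataclass here is a stand-in, since the real class is defined elsewhere in the package (only its `segments` attribute is assumed, as used in the hunk):

```python
from dataclasses import dataclass


@dataclass
class Segment:  # stand-in for pointblank's Segment (real class lives elsewhere)
    segments: list


def format_segment_value(segment_value) -> str:
    # Mirrors the joining rules in _process_brief() above
    if isinstance(segment_value, Segment):
        segs = segment_value.segments
        if len(segs) == 1:
            return ", ".join(str(v) for v in segs[0])
        return " | ".join(", ".join(str(v) for v in seg) for seg in segs)
    if isinstance(segment_value, list):
        return ", ".join(str(v) for v in segment_value)
    return str(segment_value)


assert format_segment_value("east") == "east"
assert format_segment_value(["east", "west"]) == "east, west"
assert format_segment_value(Segment(segments=[["east", "west"]])) == "east, west"
assert format_segment_value(Segment(segments=[["east"], ["west", "north"]])) == "east | west, north"
```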
@@ -13890,7 +14100,7 @@ def _process_action_str(
    if col is not None:
        # If a list of columns is provided, then join the columns into a comma-separated string
        if isinstance(col, list):
-            col = ", ".join(col)
+            col = ", ".join(col)  # pragma: no cover
 
        action_str = action_str.replace("{col}", col)
        action_str = action_str.replace("{column}", col)
@@ -14163,15 +14373,30 @@ def _create_text_null(
 
 
 def _create_text_regex(
-    lang: str, column: str | None, pattern: str, for_failure: bool = False
+    lang: str, column: str | None, pattern: str | dict, for_failure: bool = False
 ) -> str:
    type_ = _expect_failure_type(for_failure=for_failure)
 
    column_text = _prep_column_text(column=column)
 
-    return EXPECT_FAIL_TEXT[f"regex_{type_}_text"][lang].format(
+    # Handle case where pattern is a dictionary containing `pattern` and `inverse`
+    if isinstance(pattern, dict):
+        pattern_str = pattern["pattern"]
+        inverse = pattern.get("inverse", False)
+    else:
+        # For backward compatibility, assume it's just the pattern string
+        pattern_str = pattern
+        inverse = False
+
+    # Use inverse-specific translations if inverse=True
+    if inverse:
+        text_key = f"regex_inverse_{type_}_text"
+    else:
+        text_key = f"regex_{type_}_text"
+
+    return EXPECT_FAIL_TEXT[text_key][lang].format(
        column_text=column_text,
-        values_text=pattern,
+        values_text=pattern_str,
    )
 
 
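The two payload shapes `_create_text_regex()` now accepts can be summarized in a small normalization sketch mirroring the branch above:

```python
def normalize_pattern(pattern):
    # Mirrors the dict/str branch in _create_text_regex() above
    if isinstance(pattern, dict):
        return pattern["pattern"], pattern.get("inverse", False)
    # Bare strings are the legacy shape, treated as a non-inverted pattern
    return pattern, False


assert normalize_pattern(r"^\d+$") == (r"^\d+$", False)
assert normalize_pattern({"pattern": r"^\d+$", "inverse": True}) == (r"^\d+$", True)
```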
@@ -14287,7 +14512,7 @@ def _prep_values_text(
    length_values = len(values)
 
    if length_values == 0:
-        return ""
+        return ""  # pragma: no cover
 
    if length_values > limit:
        num_omitted = length_values - limit
@@ -14296,7 +14521,7 @@ def _prep_values_text(
    formatted_values = []
    for value in values[:limit]:
        if isinstance(value, (datetime.datetime, datetime.date)):
-            formatted_values.append(f"`{value.isoformat()}`")
+            formatted_values.append(f"`{value.isoformat()}`")  # pragma: no cover
        else:
            formatted_values.append(f"`{value}`")
 
@@ -14486,8 +14711,8 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
        if len(segment_str) == 10 and segment_str.count("-") == 2:
            try:
                parsed_value = date.fromisoformat(segment_str)
-            except ValueError:
-                pass
+            except ValueError:  # pragma: no cover
+                pass  # pragma: no cover
 
        # Format 2: Datetime strings with UTC timezone like
        # "2016-01-04 00:00:01 UTC.strict_cast(...)"
@@ -14499,27 +14724,28 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
                parsed_dt = datetime.fromisoformat(datetime_part)
                # Convert midnight datetimes to dates for consistency
                if parsed_dt.time() == datetime.min.time():
-                    parsed_value = parsed_dt.date()
+                    parsed_value = parsed_dt.date()  # pragma: no cover
                else:
                    parsed_value = parsed_dt
-            except (ValueError, IndexError):
-                pass
+            except (ValueError, IndexError):  # pragma: no cover
+                pass  # pragma: no cover
 
        # Format 3: Bracketed expressions like ['2016-01-04']
        elif segment_str.startswith("[") and segment_str.endswith("]"):
-            try:
-                content = segment_str[2:-2]
+            try:  # pragma: no cover
+                # Remove [' and ']
+                content = segment_str[2:-2]  # pragma: no cover
 
                # Try parsing as date first
-                if len(content) == 10 and content.count("-") == 2:
-                    try:
-                        parsed_value = date.fromisoformat(content)
-                    except ValueError:
-                        pass
+                if len(content) == 10 and content.count("-") == 2:  # pragma: no cover
+                    try:  # pragma: no cover
+                        parsed_value = date.fromisoformat(content)  # pragma: no cover
+                    except ValueError:  # pragma: no cover
+                        pass  # pragma: no cover
 
                # Try parsing as datetime
-                if parsed_value is None:
-                    try:
+                if parsed_value is None:  # pragma: no cover
+                    try:  # pragma: no cover
                        parsed_dt = datetime.fromisoformat(content.replace(" UTC", ""))
                        if parsed_dt.time() == datetime.min.time():
                            parsed_value = parsed_dt.date()
@@ -14528,8 +14754,8 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
                        except ValueError:
                            pass
 
-            except (ValueError, IndexError):
-                pass
+            except (ValueError, IndexError):  # pragma: no cover
+                pass  # pragma: no cover
 
        # Handle `pl.datetime()` expressions with .alias("datetime")
        elif "datetime" in segment_str and '.alias("datetime")' in segment_str:
@@ -14540,10 +14766,10 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
                if parsed_dt.time() == datetime.min.time():
                    parsed_value = parsed_dt.date()
                else:
-                    parsed_value = parsed_dt
+                    parsed_value = parsed_dt  # pragma: no cover
 
-            except (ValueError, AttributeError):
-                pass
+            except (ValueError, AttributeError):  # pragma: no cover
+                pass  # pragma: no cover
 
        # If we successfully parsed a value, use it; otherwise leave segment as is
        if parsed_value is not None:
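The three parsing heuristics above rely only on the standard library and can be checked directly. A minimal sketch:

```python
from datetime import date, datetime

# Format 1: a bare ISO date string
assert date.fromisoformat("2016-01-04") == date(2016, 1, 4)

# Format 2: midnight datetimes collapse to dates for consistency
dt = datetime.fromisoformat("2016-01-04 00:00:00")
value = dt.date() if dt.time() == datetime.min.time() else dt
assert value == date(2016, 1, 4)

# Format 3: a bracketed literal like "['2016-01-04']" is unwrapped first
segment_str = "['2016-01-04']"
content = segment_str[2:-2]  # remove [' and ']
assert date.fromisoformat(content) == date(2016, 1, 4)
```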
@@ -14567,9 +14793,9 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
    # Filter the data table based on the column name and segment
    # Use the new Ibis API methods to avoid deprecation warnings
    if segment is None:
-        data_tbl = data_tbl.filter(data_tbl[column].isnull())
+        data_tbl = data_tbl.filter(data_tbl[column].isnull())  # pragma: no cover
    elif isinstance(segment, list):
-        data_tbl = data_tbl.filter(data_tbl[column].isin(segment))
+        data_tbl = data_tbl.filter(data_tbl[column].isin(segment))  # pragma: no cover
    else:
        data_tbl = data_tbl.filter(data_tbl[column] == segment)
 
@@ -14690,7 +14916,7 @@ def _get_title_text(
        "</span>"
        f'<span style="float: right;">{title}</span>'
        "</div>"
-    )
+    )  # pragma: no cover
 
    return html_str
 
@@ -14768,24 +14994,6 @@ def _transform_eval(
    return symbol_list
 
 
-def _format_numbers_with_gt(
-    values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
-) -> list[str]:
-    """Format numbers using Great Tables GT object to avoid pandas dependency."""
-    import polars as pl
-
-    # Create a single-column DataFrame with all values
-    df = pl.DataFrame({"values": values})
-
-    # Create GT object and format the column
-    gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
-
-    # Extract the formatted values using _get_column_of_values
-    formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
-
-    return formatted_values
-
-
 def _format_single_number_with_gt(
    value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
 ) -> str:
@@ -14796,12 +15004,14 @@ def _format_single_number_with_gt(
        import polars as pl
 
        df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError(
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
    # Create a single-row, single-column DataFrame using the specified library
    df = df_lib.DataFrame({"value": [value]})
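A hedged sketch of the Polars-first formatting approach these helpers share, using only the public great_tables API (`GT.fmt_number()` and `vals.fmt_number()`, both referenced in this file); exact formatted output may vary by great_tables version:

```python
import polars as pl
from great_tables import GT, vals

# Fallback path seen in _transform_passed_failed(): vals.fmt_number() formats
# a list of values and returns a list of strings (e.g. ["0.12"])
print(vals.fmt_number([0.123456], decimals=2, locale="en"))

# GT-based path: build a one-cell table and format its only column; the
# helpers above then pull the rendered string back out of the GT object
gt_obj = GT(pl.DataFrame({"value": [12345]})).fmt_number(
    columns="value", n_sigfig=3, compact=True, locale="en"
)
```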
@@ -14867,12 +15077,14 @@ def _format_single_float_with_gt(
        import polars as pl
 
        df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError(
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
    # Create a single-row, single-column DataFrame using the specified library
    df = df_lib.DataFrame({"value": [value]})
@@ -14904,7 +15116,7 @@ def _transform_passed_failed(
        return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
    else:
        # Fallback to the original behavior
-        return vals.fmt_number(value, decimals=2, locale=locale)[0]
+        return vals.fmt_number(value, decimals=2, locale=locale)[0]  # pragma: no cover
 
    passed_failed = [
        (
@@ -15044,7 +15256,7 @@ def _get_callable_source(fn: Callable) -> str:
            return pre_arg
        except (OSError, TypeError):  # pragma: no cover
            return fn.__name__
-    return fn
+    return fn  # pragma: no cover
 
 
 def _extract_pre_argument(source: str) -> str:
@@ -15128,12 +15340,14 @@ def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None)
        import polars as pl
 
        df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError(
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
    # Create a single-row, single-column DataFrame using the specified library
    df = df_lib.DataFrame({"value": [value]})
@@ -15161,12 +15375,14 @@ def _format_single_float_with_gt_custom(
        import polars as pl
 
        df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError(
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
    # Create a single-row, single-column DataFrame using the specified library
    df = df_lib.DataFrame({"value": [value]})
@@ -15201,7 +15417,7 @@ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) ->
            # Fallback to the original behavior
            return fmt_number(
                value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
-            )[0]
+            )[0]  # pragma: no cover
 
    def _format_integer_safe(value: int) -> str:
        if df_lib is not None and value is not None:
@@ -15333,7 +15549,8 @@ def _step_report_row_based(
        elements = ", ".join(values)
        text = f"{column} ∉ {{{elements}}}"
    elif assertion_type == "col_vals_regex":
-
+        pattern = values["pattern"]
+        text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=pattern)
    elif assertion_type == "col_vals_null":
        text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
    elif assertion_type == "col_vals_not_null":
@@ -15386,9 +15603,12 @@ def _step_report_row_based(
    title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
    assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
 
-    # Use success_statement_no_column for col_vals_expr since it doesn't target
+    # Use 'success_statement_no_column' for col_vals_expr() since it doesn't target
+    # a specific column
    if assertion_type == "col_vals_expr":
-        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(n=n)
+        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(
+            n=n
+        )  # pragma: no cover
    else:
        success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
            n=n,
@@ -16101,14 +16321,14 @@ def _step_report_schema_any_order(
        if exp_columns_dict[column_name_exp_i]["colname_matched"]:
            col_exp_correct.append(CHECK_MARK_SPAN)
        else:
-            col_exp_correct.append(CROSS_MARK_SPAN)
+            col_exp_correct.append(CROSS_MARK_SPAN)  # pragma: no cover
 
        #
        # `dtype_exp` values
        #
 
        if not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp.append("")
+            dtype_exp.append("")  # pragma: no cover
 
        elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1:
            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]
@@ -16143,9 +16363,9 @@ def _step_report_schema_any_order(
        #
 
        if not exp_columns_dict[column_name_exp_i]["colname_matched"]:
-            dtype_exp_correct.append("—")
+            dtype_exp_correct.append("—")  # pragma: no cover
        elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp_correct.append("")
+            dtype_exp_correct.append("")  # pragma: no cover
        elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:
            dtype_exp_correct.append(CHECK_MARK_SPAN)
        else:
@@ -16191,13 +16411,17 @@ def _step_report_schema_any_order(
        #
 
        if not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp.append("")
+            dtype_exp.append("")  # pragma: no cover
 
        elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1:
-            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]
+            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]  # pragma: no cover
 
-            if
-
+            if (
+                exp_columns_dict[column_name_exp_i]["dtype_matched_pos"] is not None
+            ):  # pragma: no cover
+                pos = exp_columns_dict[column_name_exp_i][
+                    "dtype_matched_pos"
+                ]  # pragma: no cover
 
                # Combine the dtypes together with pipes but underline the matched dtype in
                # green with an HTML span tag and style attribute
@@ -16209,13 +16433,13 @@ def _step_report_schema_any_order(
                        else dtype[i]
                    )
                    for i in range(len(dtype))
-                ]
-                dtype = " | ".join(dtype)
-                dtype_exp.append(dtype)
+                ]  # pragma: no cover
+                dtype = " | ".join(dtype)  # pragma: no cover
+                dtype_exp.append(dtype)  # pragma: no cover
 
            else:
-                dtype = " | ".join(dtype)
-                dtype_exp.append(dtype)
+                dtype = " | ".join(dtype)  # pragma: no cover
+                dtype_exp.append(dtype)  # pragma: no cover
 
        else:
            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"][0]
@@ -16227,12 +16451,12 @@ def _step_report_schema_any_order(
 
        if not exp_columns_dict[column_name_exp_i]["colname_matched"]:
            dtype_exp_correct.append("—")
-        elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp_correct.append("")
-        elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:
-            dtype_exp_correct.append(CHECK_MARK_SPAN)
-        else:
-            dtype_exp_correct.append(CROSS_MARK_SPAN)
+        elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:  # pragma: no cover
+            dtype_exp_correct.append("")  # pragma: no cover
+        elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:  # pragma: no cover
+            dtype_exp_correct.append(CHECK_MARK_SPAN)  # pragma: no cover
+        else:  # pragma: no cover
+            dtype_exp_correct.append(CROSS_MARK_SPAN)  # pragma: no cover
 
    if len(columns_found) > 0:
        # Get the last index of the columns found
@@ -16248,7 +16472,9 @@ def _step_report_schema_any_order(
        ]
 
    else:
-        index_exp = [
+        index_exp = [
+            str(i) for i in range(1, len(colnames_exp_unmatched) + 1)
+        ]  # pragma: no cover
 
    schema_exp_unmatched = pl.DataFrame(
        {