pointblank 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +13 -0
- pointblank/_constants_translations.py +216 -0
- pointblank/_interrogation.py +182 -0
- pointblank/_utils.py +2 -0
- pointblank/column.py +352 -4
- pointblank/data/api-docs.txt +270 -4
- pointblank/validate.py +462 -5
- pointblank-0.8.6.dist-info/METADATA +312 -0
- {pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/RECORD +13 -13
- pointblank-0.8.4.dist-info/METADATA +0 -269
- {pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/WHEEL +0 -0
- {pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
```diff
@@ -52,6 +52,7 @@ from pointblank._interrogation import (
     ColValsCompareTwo,
     ColValsExpr,
     ColValsRegex,
+    ConjointlyValidation,
     NumberOfTestUnits,
     RowCountMatch,
     RowsDistinct,
```
````diff
@@ -6462,6 +6463,250 @@ class Validate:
 
         return self
 
+    def conjointly(
+        self,
+        *exprs: Callable,
+        pre: Callable | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Perform multiple row-wise validations for joint validity.
+
+        The `conjointly()` validation method checks whether each row in the table passes multiple
+        validation conditions simultaneously. This enables compound validation logic where a test
+        unit (typically a row) must satisfy all specified conditions to pass the validation.
+
+        This method accepts multiple validation expressions as callables, which should return
+        boolean expressions when applied to the data. You can use lambdas that incorporate
+        Polars/Pandas/Ibis expressions (based on the target table type) or create more complex
+        validation functions. The validation will operate over the number of test units that is
+        equal to the number of rows in the table (determined after any `pre=` mutation has been
+        applied).
+
+        Parameters
+        ----------
+        *exprs
+            Multiple validation expressions provided as callable functions. Each callable should
+            accept a table as its single argument and return a boolean expression or Series/Column
+            that evaluates to boolean values for each row.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedances of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Regarding the lifetime of the
+        transformed table, it only exists during the validation step and is not stored in the
+        `Validate` object or used in subsequent validation steps.
+
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion of failing test units among all test units (a value
+        between `0` and `1`) or as the absolute number of failing test units (an integer that's
+        `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting the absolute number or fraction of failing test
+        units for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. Not all threshold levels need to be set;
+        you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`,
+        `b`, and `c`). The table is shown below:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl = pl.DataFrame(
+            {
+                "a": [5, 7, 1, 3, 9, 4],
+                "b": [6, 3, 0, 5, 8, 2],
+                "c": [10, 4, 8, 9, 10, 5],
+            }
+        )
+
+        pb.preview(tbl)
+        ```
+
+        Let's validate that the values in each row satisfy multiple conditions simultaneously:
+
+        1. Column `a` should be greater than 2
+        2. Column `b` should be less than 7
+        3. The sum of `a` and `b` should be less than the value in column `c`
+
+        We'll use `conjointly()` to check all these conditions together:
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .conjointly(
+                lambda df: pl.col("a") > 2,
+                lambda df: pl.col("b") < 7,
+                lambda df: pl.col("a") + pl.col("b") < pl.col("c")
+            )
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table shows that not all rows satisfy all three conditions together. For a
+        row to pass the conjoint validation, all three conditions must be true for that row.
+
+        We can also use preprocessing to filter the data before applying the conjoint validation:
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .conjointly(
+                lambda df: pl.col("a") > 2,
+                lambda df: pl.col("b") < 7,
+                lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
+                pre=lambda df: df.filter(pl.col("c") > 5)
+            )
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This allows for more complex validation scenarios where the data is first prepared and then
+        validated against multiple conditions simultaneously.
+
+        Or, you can use the backend-agnostic column expression helper
+        [`expr_col()`](`pointblank.expr_col`) to write expressions that work across different table
+        backends:
+
+        ```{python}
+        tbl = pl.DataFrame(
+            {
+                "a": [5, 7, 1, 3, 9, 4],
+                "b": [6, 3, 0, 5, 8, 2],
+                "c": [10, 4, 8, 9, 10, 5],
+            }
+        )
+
+        # Using backend-agnostic syntax with expr_col()
+        validation = (
+            pb.Validate(data=tbl)
+            .conjointly(
+                lambda df: pb.expr_col("a") > 2,
+                lambda df: pb.expr_col("b") < 7,
+                lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c")
+            )
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        Using [`expr_col()`](`pointblank.expr_col`) allows your validation code to work consistently
+        across Pandas, Polars, and Ibis table backends without changes, making your validation
+        pipelines more portable.
+
+        See Also
+        --------
+        Look at the documentation of the [`expr_col()`](`pointblank.expr_col`) function for more
+        information on how to use it with different table backends.
+        """
+
+        assertion_type = _get_fn_name()
+
+        if len(exprs) == 0:
+            raise ValueError("At least one validation expression must be provided")
+
+        _check_pre(pre=pre)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        # Package the validation expressions for later evaluation
+        values = {"expressions": exprs}
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            column=None,  # This is a rowwise validation, not specific to any column
+            values=values,
+            pre=pre,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+
     def interrogate(
         self,
         collect_extracts: bool = True,
````
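The docstring above lists four threshold input schemes, but none of its examples exercise them. A minimal sketch pairing `conjointly()` with scheme 2 (a tuple where positions `0`, `1`, and `2` are the 'warning', 'error', and 'critical' levels), reusing the same toy table:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame(
    {
        "a": [5, 7, 1, 3, 9, 4],
        "b": [6, 3, 0, 5, 8, 2],
        "c": [10, 4, 8, 9, 10, 5],
    }
)

validation = (
    pb.Validate(data=tbl)
    .conjointly(
        lambda df: pl.col("a") > 2,
        lambda df: pl.col("b") < 7,
        # 'warning' at 1 failing row, 'error' at 20% failing, 'critical' at 40%
        thresholds=(1, 0.2, 0.4),
    )
    .interrogate()
)

validation
```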
```diff
@@ -6841,6 +7086,14 @@ class Validate:
 
             results_tbl = None
 
+            if assertion_category == "CONJOINTLY":
+                results_tbl = ConjointlyValidation(
+                    data_tbl=data_tbl_step,
+                    expressions=value["expressions"],
+                    threshold=threshold,
+                    tbl_type=tbl_type,
+                ).get_test_results()
+
             if assertion_category not in [
                 "COL_EXISTS_HAS_TYPE",
                 "COL_SCHEMA_MATCH",
```
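`ConjointlyValidation` lives in `pointblank/_interrogation.py` (also changed in this release), so its internals aren't visible here. The technique it names — evaluate each callable against the table and AND the resulting boolean masks into a single `pb_is_good_` column — can be sketched with a hypothetical Polars-only helper (`conjoin_masks` is not part of pointblank):

```python
from functools import reduce

import polars as pl


def conjoin_masks(df: pl.DataFrame, *exprs) -> pl.DataFrame:
    # Evaluate each callable against the table, then AND the boolean masks
    masks = [expr(df) for expr in exprs]
    combined = reduce(lambda left, right: left & right, masks)
    return df.with_columns(pb_is_good_=combined)


out = conjoin_masks(
    pl.DataFrame({"a": [5, 1], "b": [6, 8]}),
    lambda df: pl.col("a") > 2,
    lambda df: pl.col("b") < 7,
)
print(out["pb_is_good_"].to_list())  # [True, False]
```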
```diff
@@ -6849,9 +7102,18 @@ class Validate:
             ]:
                 # Extract the `pb_is_good_` column from the table as a results list
                 if tbl_type in IBIS_BACKENDS:
-
-
-
+                    # Select the DataFrame library to use for getting the results list
+                    df_lib = _select_df_lib(preference="polars")
+                    df_lib_name = df_lib.__name__
+
+                    if df_lib_name == "pandas":
+                        results_list = (
+                            results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
+                        )
+                    else:
+                        results_list = (
+                            results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
+                        )
 
                 else:
                     results_list = nw.from_native(results_tbl)["pb_is_good_"].to_list()
```
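The two new branches mirror each other: an Ibis results table is materialized through whichever DataFrame library is available, preferring Polars. The same chains work on a standalone Ibis table, assuming a recent Ibis version that provides `to_polars()` and `to_pandas()`:

```python
import ibis

results_tbl = ibis.memtable({"pb_is_good_": [True, False, True]})

# Preferred path: materialize through Polars
via_polars = results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()

# Fallback path: materialize through pandas
via_pandas = results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()

assert via_polars == via_pandas == [True, False, True]
```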
```diff
@@ -7012,7 +7274,7 @@ class Validate:
             # TODO: Add support for extraction of rows for Ibis backends
             if (
                 collect_extracts
-                and assertion_type in ROW_BASED_VALIDATION_TYPES
+                and assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_distinct"]
                 and tbl_type not in IBIS_BACKENDS
             ):
                 # Add row numbers to the results table
```
```diff
@@ -7038,6 +7300,32 @@ class Validate:
                 if len(validation_extract_nw) > extract_limit:
                     validation_extract_nw = validation_extract_nw.head(extract_limit)
 
+                # If a 'rows_distinct' validation step, then the extract should have the
+                # duplicate rows arranged together
+                if assertion_type == "rows_distinct":
+                    # Get the list of column names in the extract, excluding the `_row_num_` column
+                    column_names = validation_extract_nw.columns
+                    column_names.remove("_row_num_")
+
+                    # Only include the columns that were defined in `rows_distinct(columns_subset=)`
+                    # (stored here in `column`), if supplied
+                    if column is not None:
+                        column_names = column
+                        column_names_subset = ["_row_num_"] + column
+                        validation_extract_nw = validation_extract_nw.select(column_names_subset)
+
+                    validation_extract_nw = (
+                        validation_extract_nw.with_columns(
+                            group_min_row=nw.min("_row_num_").over(*column_names)
+                        )
+                        # First sort by the columns to group duplicates and by row numbers
+                        # within groups; this type of sorting will preserve the original order in a
+                        # single operation
+                        .sort(by=["group_min_row"] + column_names + ["_row_num_"])
+                        .drop("group_min_row")
+                    )
+
+                # Ensure that the extract is set to its native format
                 validation.extract = nw.to_native(validation_extract_nw)
 
                 # Get the end time for this step
```
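The `group_min_row` trick is the heart of this hunk: every row is tagged with the smallest `_row_num_` in its duplicate group, so one sort puts the groups in order of first appearance with their members adjacent. A standalone sketch of the same narwhals calls, with made-up extract data:

```python
import narwhals as nw
import polars as pl

extract = pl.DataFrame(
    {"_row_num_": [2, 5, 9], "x": ["a", "b", "a"], "y": [1, 2, 1]}
)

column_names = ["x", "y"]
arranged = (
    nw.from_native(extract)
    .with_columns(group_min_row=nw.min("_row_num_").over(*column_names))
    .sort(by=["group_min_row"] + column_names + ["_row_num_"])
    .drop("group_min_row")
)
# Rows 2 and 9 (duplicates on x/y) are now adjacent, in first-appearance order
print(nw.to_native(arranged))
```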
```diff
@@ -7976,6 +8264,7 @@ class Validate:
         - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
         - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
         - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+        - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
 
         An extracted row means that a test unit failed for that row in the validation step. The
         extracted rows are a subset of the original table and are useful for further analysis or for
```
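With `rows_distinct()` added to that list, duplicate rows can be pulled from an interrogated validation like any other row-based extract; a short sketch:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"x": ["a", "b", "a"], "y": [1, 2, 1]})

validation = pb.Validate(data=tbl).rows_distinct().interrogate()

# Rows 1 and 3 form a duplicate group and appear together in the extract
validation.get_data_extracts(i=1, frame=True)
```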
```diff
@@ -8357,6 +8646,7 @@ class Validate:
         # Do we have a DataFrame library to work with?
         _check_any_df_lib(method_used="get_tabular_report")
 
+        # Select the DataFrame library
         df_lib = _select_df_lib(preference="polars")
 
         # Get information on the input data table
```
```diff
@@ -8586,6 +8876,9 @@ class Validate:
                 else:
                     # With a column subset list, format with commas between the column names
                     columns_upd.append(", ".join(column))
+
+            elif assertion_type[i] in ["conjointly"]:
+                columns_upd.append("")
             else:
                 columns_upd.append(str(column))
 
```
```diff
@@ -8657,6 +8950,9 @@ class Validate:
 
                 values_upd.append(str(count))
 
+            elif assertion_type[i] in ["conjointly"]:
+                values_upd.append("COLUMN EXPR")
+
             # If the assertion type is not recognized, add the value as a string
             else:
                 values_upd.append(str(value))
```
```diff
@@ -9330,6 +9626,24 @@ class Validate:
                 lang=lang,
             )
 
+        elif assertion_type == "rows_distinct":
+            extract = self.get_data_extracts(i=i, frame=True)
+
+            step_report = _step_report_rows_distinct(
+                i=i,
+                column=column,
+                column_position=column_position,
+                columns_subset=columns_subset,
+                n=n,
+                n_failed=n_failed,
+                all_passed=all_passed,
+                extract=extract,
+                tbl_preview=tbl_preview,
+                header=header,
+                limit=limit,
+                lang=lang,
+            )
+
         elif assertion_type == "col_schema_match":
             # Get the parameters for column-schema matching
             values_dict = validation_step["values"]
```
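This new branch gives `rows_distinct` steps a dedicated step report. Continuing the extract sketch from earlier, the report for the failing step would be requested like this:

```python
# Reusing `validation` from the rows_distinct sketch above: a GT table with
# the duplicate-row extract and a header summarizing the failure rate
validation.get_step_report(i=1)
```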
```diff
@@ -9925,6 +10239,9 @@ def _create_autobrief_or_failure_text(
             for_failure=for_failure,
         )
 
+    if assertion_type == "conjointly":
+        return _create_text_conjointly(lang=lang, for_failure=for_failure)
+
     return None  # pragma: no cover
 
 
```
```diff
@@ -10099,6 +10416,12 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
     return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
 
 
+def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    return EXPECT_FAIL_TEXT[f"conjointly_{type_}_text"][lang]
+
+
 def _prep_column_text(column: str | list[str]) -> str:
     if isinstance(column, list):
         return "`" + str(column[0]) + "`"
```
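`EXPECT_FAIL_TEXT` is the translations table that grew by 216 lines in `pointblank/_constants_translations.py` (see the file list at the top). Neither its `conjointly_*` entries nor the suffixes returned by `_expect_failure_type()` are visible in this diff, so the following mock of the lookup is entirely hypothetical:

```python
# Hypothetical entries and suffixes; the real table and helpers live in
# pointblank/_constants_translations.py and pointblank/validate.py
EXPECT_FAIL_TEXT = {
    "conjointly_expect_text": {"en": "Expect conjoint validity across all rows."},
    "conjointly_fail_text": {"en": "Conjoint validity failed for some rows."},
}

def _expect_failure_type(for_failure: bool = False) -> str:
    return "fail" if for_failure else "expect"

def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
    type_ = _expect_failure_type(for_failure=for_failure)
    return EXPECT_FAIL_TEXT[f"conjointly_{type_}_text"][lang]

print(_create_text_conjointly("en", for_failure=True))
```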
```diff
@@ -10672,7 +10995,7 @@ def _step_report_row_based(
     header: str,
     limit: int | None,
     lang: str,
-):
+) -> GT:
     # Get the length of the extracted data for the step
     extract_length = get_row_count(extract)
 
```
```diff
@@ -10889,6 +11212,140 @@ def _step_report_row_based(
     return step_report
 
 
+def _step_report_rows_distinct(
+    i: int,
+    column: list[str],
+    column_position: list[int],
+    columns_subset: list[str] | None,
+    n: int,
+    n_failed: int,
+    all_passed: bool,
+    extract: any,
+    tbl_preview: GT,
+    header: str,
+    limit: int | None,
+    lang: str,
+) -> GT:
+    # Get the length of the extracted data for the step
+    extract_length = get_row_count(extract)
+
+    # Determine whether the `lang` value represents a right-to-left language
+    is_rtl_lang = lang in RTL_LANGUAGES
+    direction_rtl = " direction: rtl;" if is_rtl_lang else ""
+
+    if column is None:
+        text = STEP_REPORT_TEXT["rows_distinct_all"][lang].format(column=column)
+    else:
+        columns_list = ", ".join(column)
+        text = STEP_REPORT_TEXT["rows_distinct_subset"][lang].format(columns_subset=columns_list)
+
+    if all_passed:
+        step_report = tbl_preview
+
+        if header is None:
+            return step_report
+
+        title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
+
+        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(
+            n=n,
+            column_position=column_position,
+        )
+        preview_stmt = STEP_REPORT_TEXT["preview_statement"][lang]
+
+        details = (
+            f"<div style='font-size: 13.6px; {direction_rtl}'>"
+            "<div style='padding-top: 7px;'>"
+            f"{text}"
+            "</div>"
+            "<div style='padding-top: 7px;'>"
+            f"{success_stmt}"
+            "</div>"
+            f"{preview_stmt}"
+            "</div>"
+        )
+
+        # Generate the default template text for the header when `":default:"` is used
+        if header == ":default:":
+            header = "{title}{details}"
+
+        # Use commonmark to convert the header text to HTML
+        header = commonmark.commonmark(header)
+
+        # Place any templated text in the header
+        header = header.format(title=title, details=details)
+
+        # Create the header with `header` string
+        step_report = step_report.tab_header(title=md(header))
+
+    else:
+        if limit is None:
+            limit = extract_length
+
+        # Create a preview of the extracted data
+        step_report = _generate_display_table(
+            data=extract,
+            columns_subset=columns_subset,
+            n_head=limit,
+            n_tail=0,
+            limit=limit,
+            min_tbl_width=600,
+            incl_header=False,
+            mark_missing_values=False,
+        )
+
+        title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i)
+        failure_rate_metrics = f"<strong>{n_failed}</strong> / <strong>{n}</strong>"
+
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_rows_distinct"][lang].format(
+            failure_rate=failure_rate_metrics,
+            column_position=column_position,
+        )
+
+        if limit < extract_length:  # pragma: no cover
+            extract_length_resolved = limit
+            extract_text = STEP_REPORT_TEXT["extract_text_first_rows_distinct"][lang].format(
+                extract_length_resolved=extract_length_resolved
+            )
+
+        else:
+            extract_length_resolved = extract_length
+            extract_text = STEP_REPORT_TEXT["extract_text_all_rows_distinct"][lang].format(
+                extract_length_resolved=extract_length_resolved
+            )
+
+        details = (
+            f"<div style='font-size: 13.6px; {direction_rtl}'>"
+            "<div style='padding-top: 7px;'>"
+            f"{text}"
+            "</div>"
+            "<div style='padding-top: 7px;'>"
+            f"{failure_rate_stmt}"
+            "</div>"
+            f"{extract_text}"
+            "</div>"
+        )
+
+        # If `header` is None then don't add a header and just return the step report
+        if header is None:
+            return step_report
+
+        # Generate the default template text for the header when `":default:"` is used
+        if header == ":default:":
+            header = "{title}{details}"
+
+        # Use commonmark to convert the header text to HTML
+        header = commonmark.commonmark(header)
+
+        # Place any templated text in the header
+        header = header.format(title=title, details=details)
+
+        # Create the header with `header` string
+        step_report = step_report.tab_header(title=md(header))
+
+    return step_report
+
+
 def _step_report_schema_in_order(
     step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
 ) -> GT | any:
```