pointblank 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -12,6 +12,7 @@ import tempfile
12
12
  import threading
13
13
  from dataclasses import dataclass
14
14
  from enum import Enum
15
+ from functools import partial
15
16
  from importlib.metadata import version
16
17
  from pathlib import Path
17
18
  from typing import TYPE_CHECKING, Any, Callable, Literal
@@ -54,6 +55,7 @@ from pointblank._interrogation import (
54
55
  SpeciallyValidation,
55
56
  col_count_match,
56
57
  col_exists,
58
+ col_pct_null,
57
59
  col_schema_match,
58
60
  col_vals_expr,
59
61
  conjointly_validation,
@@ -363,12 +365,16 @@ class PointblankConfig:
363
365
 
364
366
  report_incl_header: bool = True
365
367
  report_incl_footer: bool = True
368
+ report_incl_footer_timings: bool = True
369
+ report_incl_footer_notes: bool = True
366
370
  preview_incl_header: bool = True
367
371
 
368
372
  def __repr__(self):
369
373
  return (
370
374
  f"PointblankConfig(report_incl_header={self.report_incl_header}, "
371
375
  f"report_incl_footer={self.report_incl_footer}, "
376
+ f"report_incl_footer_timings={self.report_incl_footer_timings}, "
377
+ f"report_incl_footer_notes={self.report_incl_footer_notes}, "
372
378
  f"preview_incl_header={self.preview_incl_header})"
373
379
  )
374
380
 
@@ -380,6 +386,8 @@ global_config = PointblankConfig()
380
386
  def config(
381
387
  report_incl_header: bool = True,
382
388
  report_incl_footer: bool = True,
389
+ report_incl_footer_timings: bool = True,
390
+ report_incl_footer_notes: bool = True,
383
391
  preview_incl_header: bool = True,
384
392
  ) -> PointblankConfig:
385
393
  """
@@ -393,7 +401,13 @@ def config(
393
401
  threshold levels (if set).
394
402
  report_incl_footer
395
403
  Should the footer of the validation table report be displayed? The footer contains the
396
- starting and ending times of the interrogation.
404
+ starting and ending times of the interrogation and any notes added to validation steps.
405
+ report_incl_footer_timings
406
+ Controls whether the validation timing information (start time, duration, and end time)
407
+ should be displayed in the footer. Only applies when `report_incl_footer=True`.
408
+ report_incl_footer_notes
409
+ Controls whether the notes from validation steps should be displayed in the footer. Only
410
+ applies when `report_incl_footer=True`.
397
411
  preview_incl_header
398
412
  Whether the header should be present in any preview table (generated via the
399
413
  [`preview()`](`pointblank.preview`) function).
@@ -407,6 +421,8 @@ def config(
407
421
  global global_config
408
422
  global_config.report_incl_header = report_incl_header # pragma: no cover
409
423
  global_config.report_incl_footer = report_incl_footer # pragma: no cover
424
+ global_config.report_incl_footer_timings = report_incl_footer_timings # pragma: no cover
425
+ global_config.report_incl_footer_notes = report_incl_footer_notes # pragma: no cover
410
426
  global_config.preview_incl_header = preview_incl_header # pragma: no cover
411
427
 
412
428
 
@@ -9755,6 +9771,302 @@ class Validate:
9755
9771
 
9756
9772
  return self
9757
9773
 
9774
+ def col_pct_null(
9775
+ self,
9776
+ columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
9777
+ p: float,
9778
+ tol: Tolerance = 0,
9779
+ thresholds: int | float | None | bool | tuple | dict | Thresholds = None,
9780
+ actions: Actions | None = None,
9781
+ brief: str | bool | None = None,
9782
+ active: bool = True,
9783
+ ) -> Validate:
9784
+ """
9785
+ Validate whether a column has a specific percentage of Null values.
9786
+
9787
+ The `col_pct_null()` validation method checks whether the percentage of Null values in a
9788
+ column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
9789
+ validation operates at the column level, generating a single validation step per column that
9790
+ passes or fails based on whether the actual percentage of Null values falls within the
9791
+ acceptable range defined by `p ± tol`.
9792
+
9793
+ Parameters
9794
+ ----------
9795
+ columns
9796
+ A single column or a list of columns to validate. Can also use
9797
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
9798
+ multiple columns are supplied or resolved, there will be a separate validation step
9799
+ generated for each column.
9800
+ p
9801
+ The expected percentage of Null values in the column, expressed as a decimal between
9802
+ `0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
9803
+ tol
9804
+ The tolerance allowed when comparing the actual percentage of Null values to the
9805
+ expected percentage `p=`. The validation passes if the actual percentage falls within
9806
+ the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
9807
+ the *Tolerance* section for details on all supported formats (absolute, relative,
9808
+ symmetric, and asymmetric bounds).
9809
+ thresholds
9810
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
9811
+ The thresholds are set at the step level and will override any global thresholds set in
9812
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
9813
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
9814
+ section for information on how to set threshold levels.
9815
+ actions
9816
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
9817
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
9818
+ define the actions.
9819
+ brief
9820
+ An optional brief description of the validation step that will be displayed in the
9821
+ reporting table. You can use the templating elements like `"{step}"` to insert
9822
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
9823
+ the entire brief will be automatically generated. If `None` (the default) then there
9824
+ won't be a brief.
9825
+ active
9826
+ A boolean value indicating whether the validation step should be active. Using `False`
9827
+ will make the validation step inactive (still reporting its presence and keeping indexes
9828
+ for the steps unchanged).
9829
+
9830
+ Returns
9831
+ -------
9832
+ Validate
9833
+ The `Validate` object with the added validation step.
9834
+
9835
+ Tolerance
9836
+ ---------
9837
+ The `tol=` parameter accepts several different formats to specify the acceptable deviation
9838
+ from the expected percentage `p=`. The tolerance can be expressed as:
9839
+
9840
+ 1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
9841
+ For example, `tol=2` means the actual count can differ from the expected count by up to 2
9842
+ units in either direction.
9843
+
9844
+ 2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
9845
+ count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
9846
+ 45 to 55 (50 ± 10% of 50 = 50 ± 5).
9847
+
9848
+ 3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
9849
+ bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
9850
+ 1 unit below or 3 units above the expected count.
9851
+
9852
+ 4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
9853
+ and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
9854
+ lower bound is 5% below and the upper bound is 15% above the expected count.
9855
+
9856
+ When using a single value (integer or float), the tolerance is applied symmetrically in both
9857
+ directions. When using a tuple, you can specify asymmetric tolerances where the lower and
9858
+ upper bounds differ.
9859
+
9860
+ Thresholds
9861
+ ----------
9862
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
9863
+ step. If they are set here at the step level, these thresholds will override any thresholds
9864
+ set at the global level in `Validate(thresholds=...)`.
9865
+
9866
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
9867
+ can either be set as a proportion failing of all test units (a value between `0` to `1`),
9868
+ or, the absolute number of failing test units (as integer that's `1` or greater).
9869
+
9870
+ Thresholds can be defined using one of these input schemes:
9871
+
9872
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
9873
+ thresholds)
9874
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
9875
+ the 'error' level, and position `2` is the 'critical' level
9876
+ 3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
9877
+ 'critical'
9878
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
9879
+ for the 'warning' level only
9880
+
9881
+ If the number of failing test units exceeds set thresholds, the validation step will be
9882
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
9883
+ set; you're free to set any combination of them.
9884
+
9885
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
9886
+ take for each level of failure (using the `actions=` parameter).
9887
+
9888
+ Examples
9889
+ --------
9890
+ ```{python}
9891
+ #| echo: false
9892
+ #| output: false
9893
+ import pointblank as pb
9894
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
9895
+ ```
9896
+ For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
9897
+ and `c`) that have different percentages of Null values. The table is shown below:
9898
+
9899
+ ```{python}
9900
+ import pointblank as pb
9901
+ import polars as pl
9902
+
9903
+ tbl = pl.DataFrame(
9904
+ {
9905
+ "a": [1, 2, 3, 4, 5, 6, 7, 8],
9906
+ "b": [1, None, 3, None, 5, None, 7, None],
9907
+ "c": [None, None, None, None, None, None, 1, 2],
9908
+ }
9909
+ )
9910
+
9911
+ pb.preview(tbl)
9912
+ ```
9913
+
9914
+ Let's validate that column `a` has 0% Null values (i.e., no Null values at all).
9915
+
9916
+ ```{python}
9917
+ validation = (
9918
+ pb.Validate(data=tbl)
9919
+ .col_pct_null(columns="a", p=0.0)
9920
+ .interrogate()
9921
+ )
9922
+
9923
+ validation
9924
+ ```
9925
+
9926
+ Printing the `validation` object shows the validation table in an HTML viewing environment.
9927
+ The validation table shows the single entry that corresponds to the validation step created
9928
+ by using `col_pct_null()`. The validation passed since column `a` has no Null values.
9929
+
9930
+ Now, let's check that column `b` has exactly 50% Null values.
9931
+
9932
+ ```{python}
9933
+ validation = (
9934
+ pb.Validate(data=tbl)
9935
+ .col_pct_null(columns="b", p=0.5)
9936
+ .interrogate()
9937
+ )
9938
+
9939
+ validation
9940
+ ```
9941
+
9942
+ This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
9943
+
9944
+ Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
9945
+ we'll check if it's approximately 70% Null with a tolerance of 10%.
9946
+
9947
+ ```{python}
9948
+ validation = (
9949
+ pb.Validate(data=tbl)
9950
+ .col_pct_null(columns="c", p=0.70, tol=0.10)
9951
+ .interrogate()
9952
+ )
9953
+
9954
+ validation
9955
+ ```
9956
+
9957
+ This validation passes because the actual percentage (75%) falls within the acceptable
9958
+ range of 60% to 80% (70% ± 10%).
9959
+
9960
+ The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
9961
+ different ways to specify tolerance using column `b`, which has exactly 50% Null values
9962
+ (4 out of 8 values).
9963
+
9964
+ *Using an absolute tolerance (integer)*: Specify the exact number of rows that can
9965
+ deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
9966
+
9967
+ ```{python}
9968
+ validation = (
9969
+ pb.Validate(data=tbl)
9970
+ .col_pct_null(columns="b", p=0.375, tol=1) # Expect 3 nulls, allow ±1 (range: 2-4)
9971
+ .interrogate()
9972
+ )
9973
+
9974
+ validation
9975
+ ```
9976
+
9977
+ This passes because column `b` has 4 Null values, which falls within the acceptable range
9978
+ of 2 to 4 (3 ± 1).
9979
+
9980
+ *Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
9981
+ expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
9982
+
9983
+ ```{python}
9984
+ validation = (
9985
+ pb.Validate(data=tbl)
9986
+ .col_pct_null(columns="b", p=0.375, tol=0.25) # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
9987
+ .interrogate()
9988
+ )
9989
+
9990
+ validation
9991
+ ```
9992
+
9993
+ This passes because 4 Null values falls within the acceptable range (3 ± 0.75 calculates
9994
+ to 2.25 to 3.75, which rounds outward to 2 to 4 rows).
9995
+
9996
+ *Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
9997
+ upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
9998
+ to 2 rows above the expected count.
9999
+
10000
+ ```{python}
10001
+ validation = (
10002
+ pb.Validate(data=tbl)
10003
+ .col_pct_null(columns="b", p=0.25, tol=(0, 2)) # Expect 2 Nulls, allow -0/+2 (range: 2-4)
10004
+ .interrogate()
10005
+ )
10006
+
10007
+ validation
10008
+ ```
10009
+
10010
+ This passes because 4 Null values falls within the acceptable range of 2 to 4.
10011
+
10012
+ *Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
10013
+ bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
10014
+ expected count.
10015
+
10016
+ ```{python}
10017
+ validation = (
10018
+ pb.Validate(data=tbl)
10019
+ .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30%
10020
+ .interrogate()
10021
+ )
10022
+
10023
+ validation
10024
+ ```
10025
+
10026
+ This passes because 4 Null values falls within the acceptable range (3 - 0.3 to 3 + 0.9
10027
+ calculates to 2.7 to 3.9, which rounds outward to 2 to 4 rows).
10028
+ """
10029
+ assertion_type = _get_fn_name()
10030
+
10031
+ _check_column(column=columns)
10032
+ _check_thresholds(thresholds=thresholds)
10033
+ _check_boolean_input(param=active, param_name="active")
10034
+
10035
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10036
+ thresholds = (
10037
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10038
+ )
10039
+
10040
+ # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
10041
+ # resolve the columns
10042
+ if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
10043
+ columns = col(columns)
10044
+
10045
+ # If `columns` is Column value or a string, place it in a list for iteration
10046
+ if isinstance(columns, (Column, str)):
10047
+ columns = [columns]
10048
+
10049
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
10050
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
10051
+
10052
+ bound_finder: Callable[[int], AbsoluteBounds] = partial(_derive_bounds, tol=tol)
10053
+
10054
+ # Iterate over the columns and create a validation step for each
10055
+ for column in columns:
10056
+ val_info = _ValidationInfo(
10057
+ assertion_type=assertion_type,
10058
+ column=column,
10059
+ values={"p": p, "bound_finder": bound_finder},
10060
+ thresholds=thresholds,
10061
+ actions=actions,
10062
+ brief=brief,
10063
+ active=active,
10064
+ )
10065
+
10066
+ self._add_validation(validation_info=val_info)
10067
+
10068
+ return self
10069
+
9758
10070
  def rows_distinct(
9759
10071
  self,
9760
10072
  columns_subset: str | list[str] | None = None,
@@ -12282,12 +12594,19 @@ class Validate:
12282
12594
  # Generate the autobrief description for the validation step; it's important to perform
12283
12595
  # that here since text components like the column and the value(s) have been resolved
12284
12596
  # at this point
12597
+ # Get row count for col_pct_null to properly calculate absolute tolerance percentages
12598
+ n_rows = None
12599
+ if assertion_type == "col_pct_null":
12600
+ n_rows = get_row_count(data_tbl)
12601
+
12285
12602
  autobrief = _create_autobrief_or_failure_text(
12286
12603
  assertion_type=assertion_type,
12287
12604
  lang=self.lang,
12288
12605
  column=column,
12289
12606
  values=value,
12290
12607
  for_failure=False,
12608
+ locale=self.locale,
12609
+ n_rows=n_rows,
12291
12610
  )
12292
12611
 
12293
12612
  validation.autobrief = autobrief
@@ -12315,6 +12634,12 @@ class Validate:
12315
12634
  # This prevents modifications from one validation step affecting others
12316
12635
  data_tbl_step = _copy_dataframe(data_tbl)
12317
12636
 
12637
+ # Capture original table dimensions and columns before preprocessing
12638
+ # (only if preprocessing is present - we'll set these inside the preprocessing block)
12639
+ original_rows = None
12640
+ original_cols = None
12641
+ original_column_names = None
12642
+
12318
12643
  # ------------------------------------------------
12319
12644
  # Preprocessing stage
12320
12645
  # ------------------------------------------------
@@ -12322,6 +12647,16 @@ class Validate:
12322
12647
  # Determine whether any preprocessing functions are to be applied to the table
12323
12648
  if validation.pre is not None:
12324
12649
  try:
12650
+ # Capture original table dimensions before preprocessing
12651
+ # Use get_row_count() instead of len() for compatibility with PySpark, etc.
12652
+ original_rows = get_row_count(data_tbl_step)
12653
+ original_cols = get_column_count(data_tbl_step)
12654
+ original_column_names = set(
12655
+ data_tbl_step.columns
12656
+ if hasattr(data_tbl_step, "columns")
12657
+ else list(data_tbl_step.columns)
12658
+ )
12659
+
12325
12660
  # Read the text of the preprocessing function
12326
12661
  pre_text = _pre_processing_funcs_to_str(validation.pre)
12327
12662
 
@@ -12354,6 +12689,62 @@ class Validate:
12354
12689
  elif isinstance(validation.pre, Callable):
12355
12690
  data_tbl_step = validation.pre(data_tbl_step)
12356
12691
 
12692
+ # After successful preprocessing, check dimensions and create notes
12693
+ # Use get_row_count() and get_column_count() for compatibility
12694
+ processed_rows = get_row_count(data_tbl_step)
12695
+ processed_cols = get_column_count(data_tbl_step)
12696
+
12697
+ # Always add a note when preprocessing is applied
12698
+ if original_rows != processed_rows or original_cols != processed_cols:
12699
+ # Dimensions changed - show the change
12700
+ note_html = _create_preprocessing_note_html(
12701
+ original_rows=original_rows,
12702
+ original_cols=original_cols,
12703
+ processed_rows=processed_rows,
12704
+ processed_cols=processed_cols,
12705
+ locale=self.locale,
12706
+ )
12707
+ note_text = _create_preprocessing_note_text(
12708
+ original_rows=original_rows,
12709
+ original_cols=original_cols,
12710
+ processed_rows=processed_rows,
12711
+ processed_cols=processed_cols,
12712
+ )
12713
+ else:
12714
+ # No dimension change - just indicate preprocessing was applied
12715
+ note_html = _create_preprocessing_no_change_note_html(locale=self.locale)
12716
+ note_text = _create_preprocessing_no_change_note_text()
12717
+
12718
+ validation._add_note(
12719
+ key="pre_applied",
12720
+ markdown=note_html,
12721
+ text=note_text,
12722
+ )
12723
+
12724
+ # Check if target column is synthetic (exists in processed but not original)
12725
+ # Only check for single column names (not lists used in rows_distinct, etc.)
12726
+ if column is not None and isinstance(column, str):
12727
+ processed_column_names = set(
12728
+ data_tbl_step.columns
12729
+ if hasattr(data_tbl_step, "columns")
12730
+ else list(data_tbl_step.columns)
12731
+ )
12732
+
12733
+ # Check if the target column is in the processed table but not in original
12734
+ if column in processed_column_names and column not in original_column_names:
12735
+ note_html = _create_synthetic_target_column_note_html(
12736
+ column_name=column,
12737
+ locale=self.locale,
12738
+ )
12739
+ note_text = _create_synthetic_target_column_note_text(
12740
+ column_name=column,
12741
+ )
12742
+ validation._add_note(
12743
+ key="syn_target_col",
12744
+ markdown=note_html,
12745
+ text=note_text,
12746
+ )
12747
+
12357
12748
  except Exception:
12358
12749
  # If preprocessing fails, mark the validation as having an eval_error
12359
12750
  validation.eval_error = True
@@ -12543,6 +12934,21 @@ class Validate:
12543
12934
  tbl=tbl, column=column, values=value, na_pass=na_pass
12544
12935
  )
12545
12936
 
12937
+ elif assertion_type == "col_pct_null":
12938
+ result_bool = col_pct_null(
12939
+ data_tbl=data_tbl_step,
12940
+ column=column,
12941
+ p=value["p"],
12942
+ bound_finder=value["bound_finder"],
12943
+ )
12944
+
12945
+ validation.all_passed = result_bool
12946
+ validation.n = 1
12947
+ validation.n_passed = int(result_bool)
12948
+ validation.n_failed = 1 - int(result_bool)
12949
+
12950
+ results_tbl = None
12951
+
12546
12952
  elif assertion_type == "col_vals_expr":
12547
12953
  results_tbl = col_vals_expr(
12548
12954
  data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
@@ -12602,10 +13008,21 @@ class Validate:
12602
13008
  # Add the schema validation info to the validation object
12603
13009
  validation.val_info = schema_validation_info
12604
13010
 
13011
+ # Add a note with the schema expectation and results
13012
+ schema_note_html = _create_col_schema_match_note_html(
13013
+ schema_info=schema_validation_info, locale=self.locale
13014
+ )
13015
+ schema_note_text = _create_col_schema_match_note_text(
13016
+ schema_info=schema_validation_info
13017
+ )
13018
+ validation._add_note(
13019
+ key="schema_check", markdown=schema_note_html, text=schema_note_text
13020
+ )
13021
+
12605
13022
  validation.all_passed = result_bool
12606
13023
  validation.n = 1
12607
13024
  validation.n_passed = int(result_bool)
12608
- validation.n_failed = 1 - result_bool
13025
+ validation.n_failed = 1 - int(result_bool)
12609
13026
 
12610
13027
  results_tbl = None
12611
13028
 
@@ -12620,7 +13037,7 @@ class Validate:
12620
13037
  validation.all_passed = result_bool
12621
13038
  validation.n = 1
12622
13039
  validation.n_passed = int(result_bool)
12623
- validation.n_failed = 1 - result_bool
13040
+ validation.n_failed = 1 - int(result_bool)
12624
13041
 
12625
13042
  results_tbl = None
12626
13043
 
@@ -12632,7 +13049,7 @@ class Validate:
12632
13049
  validation.all_passed = result_bool
12633
13050
  validation.n = 1
12634
13051
  validation.n_passed = int(result_bool)
12635
- validation.n_failed = 1 - result_bool
13052
+ validation.n_failed = 1 - int(result_bool)
12636
13053
 
12637
13054
  results_tbl = None
12638
13055
 
@@ -12651,7 +13068,7 @@ class Validate:
12651
13068
  validation.all_passed = result_bool
12652
13069
  validation.n = 1
12653
13070
  validation.n_passed = int(result_bool)
12654
- validation.n_failed = 1 - result_bool
13071
+ validation.n_failed = 1 - int(result_bool)
12655
13072
 
12656
13073
  results_tbl = None
12657
13074
 
@@ -12669,8 +13086,9 @@ class Validate:
12669
13086
  ) # pragma: no cover
12670
13087
 
12671
13088
  except Exception as e:
12672
- # Only catch specific data quality comparison errors, not programming errors
13089
+ # Catch data quality errors and column not found errors
12673
13090
  error_msg = str(e).lower()
13091
+
12674
13092
  is_comparison_error = (
12675
13093
  "boolean value of na is ambiguous" in error_msg
12676
13094
  or "cannot compare" in error_msg
@@ -12681,20 +13099,101 @@ class Validate:
12681
13099
  or ("dtype" in error_msg and "compare" in error_msg)
12682
13100
  )
12683
13101
 
12684
- if is_comparison_error: # pragma: no cover
12685
- # If data quality comparison fails, mark the validation as having an eval_error
12686
- validation.eval_error = True # pragma: no cover
13102
+ is_column_not_found = "column" in error_msg and "not found" in error_msg
13103
+
13104
+ is_comparison_column_not_found = (
13105
+ "unable to find column" in error_msg and "valid columns" in error_msg
13106
+ )
13107
+
13108
+ if (
13109
+ is_comparison_error or is_column_not_found or is_comparison_column_not_found
13110
+ ): # pragma: no cover
13111
+ # If data quality comparison fails or column not found, mark as eval_error
13112
+ validation.eval_error = True # pragma: no cover
13113
+
13114
+ # Add a note for column not found errors (target column)
13115
+ if is_column_not_found:
13116
+ note_html = _create_column_not_found_note_html(
13117
+ column_name=column,
13118
+ available_columns=list(data_tbl_step.columns)
13119
+ if hasattr(data_tbl_step, "columns")
13120
+ else [],
13121
+ locale=self.locale,
13122
+ )
13123
+ note_text = _create_column_not_found_note_text(
13124
+ column_name=column,
13125
+ available_columns=list(data_tbl_step.columns)
13126
+ if hasattr(data_tbl_step, "columns")
13127
+ else [],
13128
+ )
13129
+ validation._add_note(
13130
+ key="column_not_found",
13131
+ markdown=note_html,
13132
+ text=note_text,
13133
+ )
13134
+
13135
+ # Add a note for comparison column not found errors
13136
+ elif is_comparison_column_not_found:
13137
+ # Extract column name from error message
13138
+ # Error format: 'unable to find column "col_name"; valid columns: ...'
13139
+ match = re.search(r'unable to find column "([^"]+)"', str(e))
13140
+
13141
+ if match:
13142
+ missing_col_name = match.group(1)
13143
+
13144
+ # Determine position for between/outside validations
13145
+ position = None
13146
+ if assertion_type in ["col_vals_between", "col_vals_outside"]:
13147
+ # Check if missing column is in left or right position
13148
+ from pointblank.column import Column
13149
+
13150
+ if (
13151
+ isinstance(value[0], Column)
13152
+ and value[0].exprs == missing_col_name
13153
+ ):
13154
+ position = "left"
13155
+ elif (
13156
+ isinstance(value[1], Column)
13157
+ and value[1].exprs == missing_col_name
13158
+ ):
13159
+ position = "right"
13160
+
13161
+ note_html = _create_comparison_column_not_found_note_html(
13162
+ column_name=missing_col_name,
13163
+ position=position,
13164
+ available_columns=list(data_tbl_step.columns)
13165
+ if hasattr(data_tbl_step, "columns")
13166
+ else [],
13167
+ locale=self.locale,
13168
+ )
13169
+ note_text = _create_comparison_column_not_found_note_text(
13170
+ column_name=missing_col_name,
13171
+ position=position,
13172
+ available_columns=list(data_tbl_step.columns)
13173
+ if hasattr(data_tbl_step, "columns")
13174
+ else [],
13175
+ )
13176
+ validation._add_note(
13177
+ key="comparison_column_not_found",
13178
+ markdown=note_html,
13179
+ text=note_text,
13180
+ )
13181
+
12687
13182
  end_time = datetime.datetime.now(datetime.timezone.utc) # pragma: no cover
13183
+
12688
13184
  validation.proc_duration_s = (
12689
13185
  end_time - start_time
12690
13186
  ).total_seconds() # pragma: no cover
13187
+
12691
13188
  validation.time_processed = end_time.isoformat(
12692
13189
  timespec="milliseconds"
12693
13190
  ) # pragma: no cover
13191
+
12694
13192
  validation.active = False # pragma: no cover
13193
+
12695
13194
  continue # pragma: no cover
12696
13195
  else:
12697
- # For other errors (like missing columns), let them propagate
13196
+ # For other unexpected errors, let them propagate
12698
13197
  raise
12699
13198
 
12700
13199
  else:
@@ -12792,6 +13291,7 @@ class Validate:
12792
13291
  markdown=threshold_note_html,
12793
13292
  text=threshold_note_text,
12794
13293
  )
13294
+
12795
13295
  elif self.thresholds != Thresholds():
12796
13296
  # Thresholds explicitly reset to empty when global thresholds exist
12797
13297
  reset_note_html = _create_threshold_reset_note_html(locale=self.locale)
@@ -12814,6 +13314,8 @@ class Validate:
12814
13314
  column=column,
12815
13315
  values=value,
12816
13316
  for_failure=True,
13317
+ locale=self.locale,
13318
+ n_rows=n_rows,
12817
13319
  )
12818
13320
 
12819
13321
  # Set the failure text in the validation step
@@ -14892,7 +15394,12 @@ class Validate:
14892
15394
  return None
14893
15395
 
14894
15396
  def get_tabular_report(
14895
- self, title: str | None = ":default:", incl_header: bool = None, incl_footer: bool = None
15397
+ self,
15398
+ title: str | None = ":default:",
15399
+ incl_header: bool = None,
15400
+ incl_footer: bool = None,
15401
+ incl_footer_timings: bool = None,
15402
+ incl_footer_notes: bool = None,
14896
15403
  ) -> GT:
14897
15404
  """
14898
15405
  Validation report as a GT table.
@@ -14915,6 +15422,20 @@ class Validate:
14915
15422
  name of the table as the title for the report. If no title is wanted, then `":none:"`
14916
15423
  can be used. Aside from keyword options, text can be provided for the title. This will
14917
15424
  be interpreted as Markdown text and transformed internally to HTML.
15425
+ incl_header
15426
+ Controls whether the header section should be displayed. If `None`, uses the global
15427
+ configuration setting. The header contains the table name, label, and threshold
15428
+ information.
15429
+ incl_footer
15430
+ Controls whether the footer section should be displayed. If `None`, uses the global
15431
+ configuration setting. The footer can contain validation timing information and notes.
15432
+ incl_footer_timings
15433
+ Controls whether validation timing information (start time, duration, end time) should
15434
+ be displayed in the footer. If `None`, uses the global configuration setting. Only
15435
+ applies when `incl_footer=True`.
15436
+ incl_footer_notes
15437
+ Controls whether notes from validation steps should be displayed in the footer. If
15438
+ `None`, uses the global configuration setting. Only applies when `incl_footer=True`.
14918
15439
 
14919
15440
  Returns
14920
15441
  -------
@@ -14974,6 +15495,10 @@ class Validate:
14974
15495
  incl_header = global_config.report_incl_header
14975
15496
  if incl_footer is None:
14976
15497
  incl_footer = global_config.report_incl_footer
15498
+ if incl_footer_timings is None:
15499
+ incl_footer_timings = global_config.report_incl_footer_timings
15500
+ if incl_footer_notes is None:
15501
+ incl_footer_notes = global_config.report_incl_footer_notes
14977
15502
 
14978
15503
  # Do we have a DataFrame library to work with?
14979
15504
  _check_any_df_lib(method_used="get_tabular_report")
@@ -15212,30 +15737,53 @@ class Validate:
15212
15737
  columns_upd = []
15213
15738
 
15214
15739
  columns = validation_info_dict["column"]
15740
+ notes = validation_info_dict["notes"]
15215
15741
 
15216
15742
  assertion_type = validation_info_dict["assertion_type"]
15217
15743
 
15218
15744
  # Iterate over the values in the `column` entry
15219
15745
  for i, column in enumerate(columns):
15746
+ # Check if this validation has a synthetic target column note
15747
+ has_synthetic_column = (
15748
+ notes[i] is not None and isinstance(notes[i], dict) and "syn_target_col" in notes[i]
15749
+ )
15750
+
15751
+ column_text = None
15752
+
15220
15753
  if assertion_type[i] in [
15221
15754
  "col_schema_match",
15222
15755
  "row_count_match",
15223
15756
  "col_count_match",
15224
15757
  "col_vals_expr",
15225
15758
  ]:
15226
- columns_upd.append("—")
15759
+ column_text = "—"
15227
15760
  elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
15228
15761
  if not column:
15229
15762
  # If there is no column subset, then all columns are used
15230
- columns_upd.append("ALL COLUMNS")
15763
+ column_text = "ALL COLUMNS"
15231
15764
  else:
15232
15765
  # With a column subset list, format with commas between the column names
15233
- columns_upd.append(", ".join(column))
15234
-
15766
+ column_text = ", ".join(column)
15235
15767
  elif assertion_type[i] in ["conjointly", "specially"]:
15236
- columns_upd.append("")
15768
+ column_text = ""
15237
15769
  else:
15238
- columns_upd.append(str(column))
15770
+ column_text = str(column)
15771
+
15772
+ # Apply underline styling for synthetic columns (using the purple color from the icon)
15773
+ # Only apply styling if column_text is not empty and not a special marker
15774
+ if (
15775
+ has_synthetic_column
15776
+ and column_text
15777
+ and column_text not in ["—", "ALL COLUMNS", ""]
15778
+ ):
15779
+ column_text = (
15780
+ f'<span style="text-decoration: underline; '
15781
+ f"text-decoration-color: #9A7CB4; text-decoration-thickness: 1px; "
15782
+ f'text-underline-offset: 3px;">'
15783
+ f"{column_text}</span>"
15784
+ )
15785
+
15786
+ columns_upd.append(column_text)
15239
15787
 
15240
15788
  # Add the `columns_upd` entry to the dictionary
15241
15789
  validation_info_dict["columns_upd"] = columns_upd
@@ -15291,6 +15839,15 @@ class Validate:
15291
15839
  ]:
15292
15840
  values_upd.append("&mdash;")
15293
15841
 
15842
+ elif assertion_type[i] in ["col_pct_null"]:
15843
+ # Extract p and tol from the values dict for nice formatting
15844
+ p_value = value["p"]
15845
+
15846
+ # Extract tol from the bound_finder partial function
15847
+ bound_finder = value.get("bound_finder")
15848
+ tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
15849
+ values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
15850
+
15294
15851
  elif assertion_type[i] in ["col_schema_match"]:
15295
15852
  values_upd.append("SCHEMA")
15296
15853
 
@@ -15766,13 +16323,15 @@ class Validate:
15766
16323
  gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
15767
16324
 
15768
16325
  if incl_footer:
15769
- # Add table time as HTML source note
15770
- gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
16326
+ # Add table time as HTML source note if enabled
16327
+ if incl_footer_timings:
16328
+ gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
15771
16329
 
15772
- # Create notes markdown from validation steps and add as separate source note
15773
- notes_markdown = _create_notes_html(self.validation_info)
15774
- if notes_markdown:
15775
- gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
16330
+ # Create notes markdown from validation steps and add as separate source note if enabled
16331
+ if incl_footer_notes:
16332
+ notes_markdown = _create_notes_html(self.validation_info)
16333
+ if notes_markdown:
16334
+ gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
15776
16335
 
15777
16336
  # If the interrogation has not been performed, then style the table columns dealing with
15778
16337
  # interrogation data as grayed out
@@ -16189,6 +16748,12 @@ class Validate:
16189
16748
 
16190
16749
  except Exception: # pragma: no cover
16191
16750
  validation.eval_error = True
16751
+ columns_resolved = []
16752
+ # Store columns list for note generation
16753
+ try:
16754
+ columns = list(table.columns) if "table" in locals() else []
16755
+ except Exception:
16756
+ columns = []
16192
16757
 
16193
16758
  # If no columns were resolved, then create a patched validation step with the
16194
16759
  # `eval_error` and `column` attributes set
@@ -16196,6 +16761,22 @@ class Validate:
16196
16761
  validation.eval_error = True
16197
16762
  validation.column = str(column_expr)
16198
16763
 
16764
+ # Add a helpful note explaining that no columns were resolved
16765
+ note_html = _create_no_columns_resolved_note_html(
16766
+ column_expr=str(column_expr),
16767
+ available_columns=columns,
16768
+ locale=self.locale,
16769
+ )
16770
+ note_text = _create_no_columns_resolved_note_text(
16771
+ column_expr=str(column_expr),
16772
+ available_columns=columns,
16773
+ )
16774
+ validation._add_note(
16775
+ key="no_columns_resolved",
16776
+ markdown=note_html,
16777
+ text=note_text,
16778
+ )
16779
+
16199
16780
  expanded_validation_info.append(validation)
16200
16781
  continue
16201
16782
 
@@ -16754,7 +17335,13 @@ def _process_action_str(
16754
17335
 
16755
17336
 
16756
17337
  def _create_autobrief_or_failure_text(
16757
- assertion_type: str, lang: str, column: str | None, values: str | None, for_failure: bool
17338
+ assertion_type: str,
17339
+ lang: str,
17340
+ column: str | None,
17341
+ values: str | None,
17342
+ for_failure: bool,
17343
+ locale: str | None = None,
17344
+ n_rows: int | None = None,
16758
17345
  ) -> str:
16759
17346
  if assertion_type in [
16760
17347
  "col_vals_gt",
@@ -16878,6 +17465,16 @@ def _create_autobrief_or_failure_text(
16878
17465
  for_failure=for_failure,
16879
17466
  )
16880
17467
 
17468
+ if assertion_type == "col_pct_null":
17469
+ return _create_text_col_pct_null(
17470
+ lang=lang,
17471
+ column=column,
17472
+ value=values,
17473
+ for_failure=for_failure,
17474
+ locale=locale if locale else lang,
17475
+ n_rows=n_rows,
17476
+ )
17477
+
16881
17478
  if assertion_type == "conjointly":
16882
17479
  return _create_text_conjointly(lang=lang, for_failure=for_failure)
16883
17480
 
@@ -17100,6 +17697,115 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
17100
17697
  return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
17101
17698
 
17102
17699
 
17700
+ def _create_text_col_pct_null(
17701
+ lang: str,
17702
+ column: str | None,
17703
+ value: dict,
17704
+ for_failure: bool = False,
17705
+ locale: str | None = None,
17706
+ n_rows: int | None = None,
17707
+ ) -> str:
17708
+ """Create text for col_pct_null validation with tolerance handling."""
17709
+ type_ = _expect_failure_type(for_failure=for_failure)
17710
+
17711
+ column_text = _prep_column_text(column=column)
17712
+
17713
+ # Use locale for number formatting, defaulting to lang if not provided
17714
+ fmt_locale = locale if locale else lang
17715
+
17716
+ # Extract p and tol from the values dict
17717
+ p_value = value.get("p", 0) * 100 # Convert to percentage
17718
+ p_value_original = value.get("p", 0) # Keep original value for deviation format
17719
+
17720
+ # Extract tol from the bound_finder partial function
17721
+ bound_finder = value.get("bound_finder")
17722
+ tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
17723
+
17724
+ # Handle different tolerance types
17725
+ has_tolerance = False
17726
+ is_asymmetric = False
17727
+
17728
+ if isinstance(tol_value, tuple):
17729
+ # Tuple tolerance: can be (lower, upper) in absolute or relative terms
17730
+ tol_lower, tol_upper = tol_value
17731
+
17732
+ # Check if we have any non-zero tolerance
17733
+ has_tolerance = tol_lower != 0 or tol_upper != 0
17734
+ is_asymmetric = tol_lower != tol_upper
17735
+
17736
+ # For relative tolerances (floats < 1), we can compute exact percentage bounds
17737
+ # For absolute tolerances (ints >= 1), calculate based on actual row count if available
17738
+ if tol_lower < 1:
17739
+ # Relative tolerance (float)
17740
+ lower_pct_delta = tol_lower * 100
17741
+ else:
17742
+ # Absolute tolerance (int); uses actual row count if available
17743
+ if n_rows is not None and n_rows > 0:
17744
+ lower_pct_delta = (tol_lower / n_rows) * 100
17745
+ else:
17746
+ lower_pct_delta = tol_lower # Fallback approximation
17747
+
17748
+ if tol_upper < 1:
17749
+ # Relative tolerance (float)
17750
+ upper_pct_delta = tol_upper * 100
17751
+ else:
17752
+ # Absolute tolerance (int); uses actual row count if available
17753
+ if n_rows is not None and n_rows > 0:
17754
+ upper_pct_delta = (tol_upper / n_rows) * 100
17755
+ else:
17756
+ upper_pct_delta = tol_upper # Fallback approximation
17757
+ else:
17758
+ # Single value tolerance: symmetric
17759
+ has_tolerance = tol_value != 0
17760
+
17761
+ if tol_value < 1:
17762
+ # Relative tolerance (float)
17763
+ tol_pct = tol_value * 100
17764
+ else:
17765
+ # Absolute tolerance (int) - use actual row count if available
17766
+ if n_rows is not None and n_rows > 0:
17767
+ tol_pct = (tol_value / n_rows) * 100
17768
+ else:
17769
+ tol_pct = tol_value # Fallback approximation
17770
+
17771
+ lower_pct_delta = tol_pct
17772
+ upper_pct_delta = tol_pct
17773
+
17774
+ # Format numbers with locale-aware formatting
17775
+ p_formatted = _format_number_safe(p_value, decimals=1, locale=fmt_locale)
17776
+ p_original_formatted = _format_number_safe(p_value_original, decimals=2, locale=fmt_locale)
17777
+
17778
+ # Choose the appropriate translation key based on tolerance
17779
+ if not has_tolerance:
17780
+ # No tolerance - use simple text
17781
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text"][lang].format(
17782
+ column_text=column_text,
17783
+ p=p_formatted,
17784
+ )
17785
+ elif is_asymmetric or isinstance(tol_value, tuple):
17786
+ # Use deviation format for tuple tolerances (including symmetric ones)
17787
+ # Format the deviation values with signs (using proper minus sign U+2212)
17788
+ lower_dev = f"−{_format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)}%"
17789
+ upper_dev = f"+{_format_number_safe(upper_pct_delta, decimals=1, locale=fmt_locale)}%"
17790
+
17791
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol_deviation"][lang].format(
17792
+ column_text=column_text,
17793
+ lower_dev=lower_dev,
17794
+ upper_dev=upper_dev,
17795
+ p=p_original_formatted,
17796
+ )
17797
+ else:
17798
+ # Single value tolerance - use the symmetric ± format
17799
+ tol_formatted = _format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)
17800
+ text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol"][lang].format(
17801
+ column_text=column_text,
17802
+ p=p_formatted,
17803
+ tol=tol_formatted,
17804
+ )
17805
+
17806
+ return text
17807
+
17808
+
17103
17809
  def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
17104
17810
  type_ = _expect_failure_type(for_failure=for_failure)
17105
17811
 
@@ -17498,6 +18204,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
17498
18204
 
17499
18205
  def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
17500
18206
  # For each icon, get the assertion icon SVG test from SVG_ICONS_FOR_ASSERTION_TYPES dictionary
18207
+ # TODO: No point in using `get` if we can't handle missing keys anyways
17501
18208
  icon_svg = [SVG_ICONS_FOR_ASSERTION_TYPES.get(icon) for icon in icon]
17502
18209
 
17503
18210
  # Replace the width and height in the SVG string
@@ -18433,6 +19140,603 @@ def _create_threshold_reset_note_text() -> str:
18433
19140
  return "Global thresholds explicitly not used for this step."
18434
19141
 
18435
19142
 
19143
+ def _create_no_columns_resolved_note_html(
19144
+ column_expr: str, available_columns: list[str], locale: str = "en"
19145
+ ) -> str:
19146
+ """
19147
+ Create an HTML note explaining that a column expression resolved to no columns.
19148
+
19149
+ Parameters
19150
+ ----------
19151
+ column_expr
19152
+ The column expression that failed to resolve columns (as a string).
19153
+ available_columns
19154
+ List of available column names in the table.
19155
+ locale
19156
+ The locale string (e.g., 'en', 'fr').
19157
+
19158
+ Returns
19159
+ -------
19160
+ str
19161
+ HTML-formatted note text.
19162
+ """
19163
+ # Get translated strings
19164
+ intro = NOTES_TEXT.get("column_not_found_intro", {}).get(
19165
+ locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression")
19166
+ )
19167
+ no_resolve = NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
19168
+ locale,
19169
+ NOTES_TEXT.get("column_not_found_no_resolve", {}).get(
19170
+ "en", "does not resolve to any columns"
19171
+ ),
19172
+ )
19173
+
19174
+ # Format the column expression with monospace font
19175
+ col_expr_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_expr}</code>"
19176
+
19177
+ # Build the HTML note
19178
+ html = f"{intro} {col_expr_html} {no_resolve}."
19179
+
19180
+ return html
19181
+
19182
+
19183
+ def _create_no_columns_resolved_note_text(column_expr: str, available_columns: list[str]) -> str:
19184
+ """
19185
+ Create a plain text note explaining that a column expression resolved to no columns.
19186
+
19187
+ Parameters
19188
+ ----------
19189
+ column_expr
19190
+ The column expression that failed to resolve columns (as a string).
19191
+ available_columns
19192
+ List of available column names in the table.
19193
+
19194
+ Returns
19195
+ -------
19196
+ str
19197
+ Plain text note.
19198
+ """
19199
+ return f"The column expression `{column_expr}` does not resolve to any columns."
19200
+
19201
+
19202
+ def _create_column_not_found_note_html(
19203
+ column_name: str, available_columns: list[str], locale: str = "en"
19204
+ ) -> str:
19205
+ """
19206
+ Create an HTML note explaining that a specific column was not found.
19207
+
19208
+ Parameters
19209
+ ----------
19210
+ column_name
19211
+ The column name that was not found.
19212
+ available_columns
19213
+ List of available column names in the table.
19214
+ locale
19215
+ The locale string (e.g., 'en', 'fr').
19216
+
19217
+ Returns
19218
+ -------
19219
+ str
19220
+ HTML-formatted note text.
19221
+ """
19222
+ # Get translated strings
19223
+ intro = NOTES_TEXT.get("target_column_provided", {}).get(
19224
+ locale, NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided")
19225
+ )
19226
+ not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19227
+ locale,
19228
+ NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19229
+ "en", "does not match any columns in the table"
19230
+ ),
19231
+ )
19232
+
19233
+ # Format the column name with monospace font
19234
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19235
+
19236
+ # Build the HTML note
19237
+ html = f"{intro} ({col_name_html}) {not_found}."
19238
+
19239
+ return html
19240
+
19241
+
19242
+ def _create_column_not_found_note_text(column_name: str, available_columns: list[str]) -> str:
19243
+ """
19244
+ Create a plain text note explaining that a specific column was not found.
19245
+
19246
+ Parameters
19247
+ ----------
19248
+ column_name
19249
+ The column name that was not found.
19250
+ available_columns
19251
+ List of available column names in the table.
19252
+
19253
+ Returns
19254
+ -------
19255
+ str
19256
+ Plain text note.
19257
+ """
19258
+ return f"The target column provided ({column_name}) does not match any columns in the table."
19259
+
19260
+
19261
+ def _create_comparison_column_not_found_note_html(
19262
+ column_name: str, position: str | None, available_columns: list[str], locale: str = "en"
19263
+ ) -> str:
19264
+ """
19265
+ Create an HTML note explaining that a comparison column was not found.
19266
+
19267
+ Parameters
19268
+ ----------
19269
+ column_name
19270
+ The comparison column name that was not found.
19271
+ position
19272
+ Optional position indicator ("left", "right") for between/outside validations.
19273
+ available_columns
19274
+ List of available column names in the table.
19275
+ locale
19276
+ The locale string (e.g., 'en', 'fr').
19277
+
19278
+ Returns
19279
+ -------
19280
+ str
19281
+ HTML-formatted note text.
19282
+ """
19283
+ # Get translated strings
19284
+ intro = NOTES_TEXT.get("comparison_column_provided", {}).get(
19285
+ locale,
19286
+ NOTES_TEXT.get("comparison_column_provided", {}).get(
19287
+ "en", "The comparison column provided"
19288
+ ),
19289
+ )
19290
+ intro_with_for = NOTES_TEXT.get("comparison_column_for", {}).get(
19291
+ locale,
19292
+ NOTES_TEXT.get("comparison_column_for", {}).get("en", "The comparison column provided for"),
19293
+ )
19294
+ not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19295
+ locale,
19296
+ NOTES_TEXT.get("does_not_match_any_columns", {}).get(
19297
+ "en", "does not match any columns in the table"
19298
+ ),
19299
+ )
19300
+
19301
+ # Format the column name with monospace font
19302
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19303
+
19304
+ # Add position if provided (for between/outside validations)
19305
+ if position:
19306
+ # Format position parameter with monospace font (e.g., "left=", "right=")
19307
+ position_param = (
19308
+ f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{position}=</code>"
19309
+ )
19310
+ # Use the "for" version of the intro text
19311
+ html = f"{intro_with_for} {position_param} ({col_name_html}) {not_found}."
19312
+ else:
19313
+ # Use the standard intro text without "for"
19314
+ html = f"{intro} ({col_name_html}) {not_found}."
19315
+
19316
+ return html
19317
+
19318
+
19319
+ def _create_comparison_column_not_found_note_text(
19320
+ column_name: str, position: str | None, available_columns: list[str]
19321
+ ) -> str:
19322
+ """
19323
+ Create a plain text note explaining that a comparison column was not found.
19324
+
19325
+ Parameters
19326
+ ----------
19327
+ column_name
19328
+ The comparison column name that was not found.
19329
+ position
19330
+ Optional position indicator ("left", "right") for between/outside validations.
19331
+ available_columns
19332
+ List of available column names in the table.
19333
+
19334
+ Returns
19335
+ -------
19336
+ str
19337
+ Plain text note.
19338
+ """
19339
+ if position:
19340
+ position_text = f" for {position}="
19341
+ else:
19342
+ position_text = ""
19343
+
19344
+ return (
19345
+ f"The comparison column provided{position_text} ({column_name}) "
19346
+ f"does not match any columns in the table."
19347
+ )
19348
+
19349
+
19350
+ def _create_preprocessing_note_html(
19351
+ original_rows: int,
19352
+ original_cols: int,
19353
+ processed_rows: int,
19354
+ processed_cols: int,
19355
+ locale: str = "en",
19356
+ ) -> str:
19357
+ """
19358
+ Create an HTML note showing table dimension changes from preprocessing.
19359
+
19360
+ Parameters
19361
+ ----------
19362
+ original_rows
19363
+ Number of rows in the original table.
19364
+ original_cols
19365
+ Number of columns in the original table.
19366
+ processed_rows
19367
+ Number of rows after preprocessing.
19368
+ processed_cols
19369
+ Number of columns after preprocessing.
19370
+ locale
19371
+ The locale string (e.g., 'en', 'fr').
19372
+
19373
+ Returns
19374
+ -------
19375
+ str
19376
+ HTML-formatted note text.
19377
+ """
19378
+ # Get translated strings
19379
+ precondition_text = NOTES_TEXT.get("precondition_applied", {}).get(
19380
+ locale, NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied")
19381
+ )
19382
+ table_dims_text = NOTES_TEXT.get("table_dimensions", {}).get(
19383
+ locale, NOTES_TEXT.get("table_dimensions", {}).get("en", "table dimensions")
19384
+ )
19385
+
19386
+ # Helper function to get singular or plural form
19387
+ def get_row_text(count: int) -> str:
19388
+ if count == 1:
19389
+ return NOTES_TEXT.get("row", {}).get(locale, NOTES_TEXT.get("row", {}).get("en", "row"))
19390
+ return NOTES_TEXT.get("rows", {}).get(locale, NOTES_TEXT.get("rows", {}).get("en", "rows"))
19391
+
19392
+ def get_col_text(count: int) -> str:
19393
+ if count == 1:
19394
+ return NOTES_TEXT.get("column", {}).get(
19395
+ locale, NOTES_TEXT.get("column", {}).get("en", "column")
19396
+ )
19397
+ return NOTES_TEXT.get("columns", {}).get(
19398
+ locale, NOTES_TEXT.get("columns", {}).get("en", "columns")
19399
+ )
19400
+
19401
+ # Determine which dimensions changed
19402
+ rows_changed = original_rows != processed_rows
19403
+ cols_changed = original_cols != processed_cols
19404
+
19405
+ # Format original dimensions
19406
+ original_rows_text = get_row_text(original_rows)
19407
+ original_cols_text = get_col_text(original_cols)
19408
+ original_dim = (
19409
+ f'<span style="font-family: monospace;">'
19410
+ f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}]"
19411
+ f"</span>"
19412
+ )
19413
+
19414
+ # Format processed dimensions with bold for changed values
19415
+ processed_rows_text = get_row_text(processed_rows)
19416
+ processed_cols_text = get_col_text(processed_cols)
19417
+
19418
+ if rows_changed:
19419
+ rows_display = f"<strong>{processed_rows:,}</strong> {processed_rows_text}"
19420
+ else:
19421
+ rows_display = f"{processed_rows:,} {processed_rows_text}"
19422
+
19423
+ if cols_changed:
19424
+ cols_display = f"<strong>{processed_cols}</strong> {processed_cols_text}"
19425
+ else:
19426
+ cols_display = f"{processed_cols} {processed_cols_text}"
19427
+
19428
+ processed_dim = f'<span style="font-family: monospace;">[{rows_display}, {cols_display}]</span>'
19429
+
19430
+ # Build the HTML note
19431
+ html = f"{precondition_text}: {table_dims_text} {original_dim} → {processed_dim}."
19432
+
19433
+ return html
19434
+
19435
+
19436
+ def _create_preprocessing_note_text(
19437
+ original_rows: int,
19438
+ original_cols: int,
19439
+ processed_rows: int,
19440
+ processed_cols: int,
19441
+ ) -> str:
19442
+ """
19443
+ Create a plain text note showing table dimension changes from preprocessing.
19444
+
19445
+ Parameters
19446
+ ----------
19447
+ original_rows
19448
+ Number of rows in the original table.
19449
+ original_cols
19450
+ Number of columns in the original table.
19451
+ processed_rows
19452
+ Number of rows after preprocessing.
19453
+ processed_cols
19454
+ Number of columns after preprocessing.
19455
+
19456
+ Returns
19457
+ -------
19458
+ str
19459
+ Plain text note.
19460
+ """
19461
+ # Get singular or plural forms
19462
+ original_rows_text = "row" if original_rows == 1 else "rows"
19463
+ original_cols_text = "column" if original_cols == 1 else "columns"
19464
+ processed_rows_text = "row" if processed_rows == 1 else "rows"
19465
+ processed_cols_text = "column" if processed_cols == 1 else "columns"
19466
+
19467
+ return (
19468
+ f"Precondition applied: table dimensions "
19469
+ f"[{original_rows:,} {original_rows_text}, {original_cols} {original_cols_text}] → "
19470
+ f"[{processed_rows:,} {processed_rows_text}, {processed_cols} {processed_cols_text}]."
19471
+ )
19472
+
19473
+
19474
+ def _create_preprocessing_no_change_note_html(locale: str = "en") -> str:
19475
+ """
19476
+ Create an HTML note indicating preprocessing was applied with no dimension change.
19477
+
19478
+ Parameters
19479
+ ----------
19480
+ locale
19481
+ The locale string (e.g., 'en', 'fr').
19482
+
19483
+ Returns
19484
+ -------
19485
+ str
19486
+ HTML-formatted note text.
19487
+ """
19488
+ # Get translated string
19489
+ note_text = NOTES_TEXT.get("precondition_applied_no_change", {}).get(
19490
+ locale,
19491
+ NOTES_TEXT.get("precondition_applied_no_change", {}).get(
19492
+ "en", "Precondition applied: no table dimension change"
19493
+ ),
19494
+ )
19495
+
19496
+ return f"{note_text}."
19497
+
19498
+
19499
+ def _create_preprocessing_no_change_note_text() -> str:
19500
+ """
19501
+ Create a plain text note indicating preprocessing was applied with no dimension change.
19502
+
19503
+ Returns
19504
+ -------
19505
+ str
19506
+ Plain text note.
19507
+ """
19508
+ return "Precondition applied: no table dimension change."
19509
+
19510
+
19511
+ def _create_synthetic_target_column_note_html(column_name: str, locale: str = "en") -> str:
19512
+ """
19513
+ Create an HTML note indicating that the target column was created via preprocessing.
19514
+
19515
+ Parameters
19516
+ ----------
19517
+ column_name
19518
+ The name of the synthetic target column.
19519
+ locale
19520
+ The locale string (e.g., 'en', 'fr').
19521
+
19522
+ Returns
19523
+ -------
19524
+ str
19525
+ HTML-formatted note text.
19526
+ """
19527
+ # Get translated strings
19528
+ synthetic_text = NOTES_TEXT.get("synthetic_target_column", {}).get(
19529
+ locale, NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column")
19530
+ )
19531
+ created_via_text = NOTES_TEXT.get("created_via_preprocessing", {}).get(
19532
+ locale,
19533
+ NOTES_TEXT.get("created_via_preprocessing", {}).get("en", "created via preprocessing"),
19534
+ )
19535
+
19536
+ # Format the column name with monospace font
19537
+ col_name_html = f"<code style='font-family: \"IBM Plex Mono\", monospace;'>{column_name}</code>"
19538
+
19539
+ # Build the HTML note
19540
+ html = f"{synthetic_text} {col_name_html} {created_via_text}."
19541
+
19542
+ return html
19543
+
19544
+
19545
+ def _create_synthetic_target_column_note_text(column_name: str) -> str:
19546
+ """
19547
+ Create a plain text note indicating that the target column was created via preprocessing.
19548
+
19549
+ Parameters
19550
+ ----------
19551
+ column_name
19552
+ The name of the synthetic target column.
19553
+
19554
+ Returns
19555
+ -------
19556
+ str
19557
+ Plain text note.
19558
+ """
19559
+ return f"Synthetic target column ({column_name}) created via preprocessing."
19560
+
19561
+
19562
+ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> str:
19563
+ """
19564
+ Create an HTML note with collapsible schema expectation and results.
19565
+
19566
+ This generates a disclosure-style note showing:
19567
+ 1. A summary of what failed (if anything)
19568
+ 2. The full step report table (collapsible)
19569
+
19570
+ Parameters
19571
+ ----------
19572
+ schema_info
19573
+ The schema validation information dictionary from interrogation.
19574
+ locale
19575
+ The locale string (e.g., 'en', 'fr').
19576
+
19577
+ Returns
19578
+ -------
19579
+ str
19580
+ HTML-formatted note with collapsible schema details.
19581
+ """
19582
+ passed = schema_info["passed"]
19583
+ expect_schema = schema_info["expect_schema"]
19584
+ target_schema = schema_info["target_schema"]
19585
+ params = schema_info["params"]
19586
+ columns_dict = schema_info["columns"]
19587
+ in_order = params["in_order"]
19588
+
19589
+ # Get translations for the locale
19590
+ passed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_passed"].get(
19591
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_passed"]["en"]
19592
+ )
19593
+ failed_text = VALIDATION_REPORT_TEXT["note_schema_comparison_failed"].get(
19594
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_failed"]["en"]
19595
+ )
19596
+ disclosure_text = VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"].get(
19597
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"]["en"]
19598
+ )
19599
+ settings_title_text = VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"].get(
19600
+ locale, VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"]
19601
+ )
19602
+
19603
+ # Build summary message
19604
+ if passed:
19605
+ summary = f'<span style="color:#4CA64C;">✓</span> {passed_text}.'
19606
+ else:
19607
+ # Analyze what failed
19608
+ failures = []
19609
+
19610
+ # Check column count mismatch
19611
+ n_expect = len(expect_schema)
19612
+ n_target = len(target_schema)
19613
+ if n_expect != n_target:
19614
+ count_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"].get(
19615
+ locale, VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"]
19616
+ )
19617
+ failures.append(count_mismatch_text.format(n_expect=n_expect, n_target=n_target))
19618
+
19619
+ # Check for unmatched columns
19620
+ unmatched_cols = [col for col, info in columns_dict.items() if not info["colname_matched"]]
19621
+ if unmatched_cols:
19622
+ unmatched_text = VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"].get(
19623
+ locale, VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"]["en"]
19624
+ )
19625
+ failures.append(unmatched_text.format(n=len(unmatched_cols)))
19626
+
19627
+ # Check for wrong order (if in_order=True)
19628
+ if params["in_order"]:
19629
+ wrong_order = [
19630
+ col
19631
+ for col, info in columns_dict.items()
19632
+ if info["colname_matched"] and not info["index_matched"]
19633
+ ]
19634
+ if wrong_order:
19635
+ wrong_order_text = VALIDATION_REPORT_TEXT["note_schema_wrong_order"].get(
19636
+ locale, VALIDATION_REPORT_TEXT["note_schema_wrong_order"]["en"]
19637
+ )
19638
+ failures.append(wrong_order_text.format(n=len(wrong_order)))
19639
+
19640
+ # Check for dtype mismatches
19641
+ dtype_mismatches = [
19642
+ col
19643
+ for col, info in columns_dict.items()
19644
+ if info["colname_matched"] and info["dtype_present"] and not info["dtype_matched"]
19645
+ ]
19646
+ if dtype_mismatches:
19647
+ dtype_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"].get(
19648
+ locale, VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"]["en"]
19649
+ )
19650
+ failures.append(dtype_mismatch_text.format(n=len(dtype_mismatches)))
19651
+
19652
+ if failures:
19653
+ summary = (
19654
+ f'<span style="color:#FF3300;">✗</span> {failed_text}: ' + ", ".join(failures) + "."
19655
+ )
19656
+ else:
19657
+ summary = f'<span style="color:#FF3300;">✗</span> {failed_text}.'
19658
+
19659
+ # Generate the step report table using the existing function
19660
+ # We'll call either _step_report_schema_in_order or _step_report_schema_any_order
19661
+ # depending on the in_order parameter
19662
+ if in_order:
19663
+ step_report_gt = _step_report_schema_in_order(
19664
+ step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19665
+ )
19666
+ else:
19667
+ step_report_gt = _step_report_schema_any_order(
19668
+ step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False
19669
+ )
19670
+
19671
+ # Generate the settings HTML using the existing function
19672
+ settings_html = _create_col_schema_match_params_html(
19673
+ lang=locale,
19674
+ complete=params["complete"],
19675
+ in_order=params["in_order"],
19676
+ case_sensitive_colnames=params["case_sensitive_colnames"],
19677
+ case_sensitive_dtypes=params["case_sensitive_dtypes"],
19678
+ full_match_dtypes=params["full_match_dtypes"],
19679
+ )
19680
+
19681
+ # Remove the inner div containing column_schema_match_str
19682
+ settings_html = re.sub(r'<div style="margin-right: 5px;">.*?</div>', "", settings_html, count=1)
19683
+
19684
+ # Change padding-top from 7px to 2px
19685
+ settings_html = settings_html.replace("padding-top: 7px;", "padding-top: 2px;")
19686
+
19687
+ # Create new source note HTML that includes both settings and schema
19688
+ source_note_html = f"""
19689
+ <div style='padding-bottom: 2px;'>{settings_title_text}</div>
19690
+ <div style='padding-bottom: 4px;'>{settings_html}</div>
19691
+ """
19692
+
19693
+ # Add the settings as an additional source note to the step report
19694
+ step_report_gt = step_report_gt.tab_source_note(source_note=html(source_note_html))
19695
+
19696
+ # Extract the HTML from the GT object
19697
+ step_report_html = step_report_gt._repr_html_()
19698
+
19699
+ # Create collapsible section with the step report
19700
+ note_html = f"""
19701
+ {summary}
19702
+
19703
+ <details style="margin-top: 2px; margin-bottom: 8px; font-size: 12px; text-indent: 12px;">
19704
+ <summary style="cursor: pointer; font-weight: bold; color: #555; margin-bottom: -5px;">{disclosure_text}</summary>
19705
+ <div style="margin-top: 6px; padding-left: 15px; padding-right: 15px;">
19706
+
19707
+ {step_report_html}
19708
+
19709
+ </div>
19710
+ </details>
19711
+ """
19712
+
19713
+ return note_html.strip()
19714
+
19715
+
19716
+ def _create_col_schema_match_note_text(schema_info: dict) -> str:
19717
+ """
19718
+ Create a plain text note for schema validation.
19719
+
19720
+ Parameters
19721
+ ----------
19722
+ schema_info
19723
+ The schema validation information dictionary from interrogation.
19724
+
19725
+ Returns
19726
+ -------
19727
+ str
19728
+ Plain text note.
19729
+ """
19730
+ passed = schema_info["passed"]
19731
+ expect_schema = schema_info["expect_schema"]
19732
+ target_schema = schema_info["target_schema"]
19733
+
19734
+ if passed:
19735
+ return f"Schema validation passed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
19736
+ else:
19737
+ return f"Schema validation failed. Expected {len(expect_schema)} column(s), found {len(target_schema)}."
19738
+
19739
+
18436
19740
  def _step_report_row_based(
18437
19741
  assertion_type: str,
18438
19742
  i: int,
@@ -18880,16 +20184,33 @@ def _step_report_schema_in_order(
18880
20184
  dtype_exp = []
18881
20185
  dtype_exp_correct = []
18882
20186
 
18883
- for i in range(len(exp_columns_dict)):
20187
+ for i in range(len(expect_schema)):
18884
20188
  #
18885
20189
  # `col_name_exp` values
18886
20190
  #
18887
20191
 
18888
- # The column name is the key in the dictionary, get the column name and
18889
- # append it to the `col_name_exp` list
18890
- col_name_exp.append(list(exp_columns_dict.keys())[i])
20192
+ # Get the column name from expect_schema (which can have duplicates)
20193
+ column_name_exp_i = expect_schema[i][0]
20194
+ col_name_exp.append(column_name_exp_i)
20195
+
20196
+ # Check if this column exists in exp_columns_dict (it might not if it's a duplicate)
20197
+ # For duplicates, we need to handle them specially
20198
+ if column_name_exp_i not in exp_columns_dict:
20199
+ # This is a duplicate or invalid column, mark it as incorrect
20200
+ col_exp_correct.append(CROSS_MARK_SPAN)
18891
20201
 
18892
- column_name_exp_i = col_name_exp[i]
20202
+ # For dtype, check if there's a dtype specified in the schema
20203
+ if len(expect_schema[i]) > 1:
20204
+ dtype_value = expect_schema[i][1]
20205
+ if isinstance(dtype_value, list):
20206
+ dtype_exp.append(" | ".join(dtype_value))
20207
+ else:
20208
+ dtype_exp.append(str(dtype_value))
20209
+ else:
20210
+ dtype_exp.append("&mdash;")
20211
+
20212
+ dtype_exp_correct.append("&mdash;")
20213
+ continue
18893
20214
 
18894
20215
  #
18895
20216
  # `col_exp_correct` values