pointblank 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
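At a high level, this release adds row-wise monotonicity checks (`col_vals_increasing()` and `col_vals_decreasing()`), a specification-based check (`col_vals_within_spec()`), and an internal notes/footnotes store on `_ValidationInfo`. As a quick orientation before the diff itself, here is a minimal sketch of the new public methods, assembled from the docstring examples shown in the diff below; the `tbl` data is illustrative, not taken from the package:

```python
import polars as pl
import pointblank as pb

# Illustrative table: `a` strictly increases; `b` pauses at one value
tbl = pl.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 2, 3, 4, 5]})

validation = (
    pb.Validate(data=tbl)
    # One test unit per row; strictly increasing values pass
    .col_vals_increasing(columns="a")
    # The repeated `2` in `b` passes only because allow_stationary=True
    .col_vals_increasing(columns="b", allow_stationary=True)
    .interrogate()
)

validation  # displays the validation report table
```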
pointblank/validate.py CHANGED
@@ -3699,6 +3699,10 @@ class _ValidationInfo:
  The time the validation step was processed. This is in the ISO 8601 format in UTC time.
  proc_duration_s
  The duration of processing for the validation step in seconds.
+ notes
+ An ordered dictionary of notes/footnotes associated with the validation step. Each entry
+ contains both 'markdown' and 'text' versions of the note content. The dictionary preserves
+ insertion order, ensuring notes appear in a consistent sequence in reports and logs.
  """

  # Validation plan
@@ -3736,10 +3740,183 @@ class _ValidationInfo:
  val_info: dict[str, any] | None = None
  time_processed: str | None = None
  proc_duration_s: float | None = None
+ notes: dict[str, dict[str, str]] | None = None

  def get_val_info(self) -> dict[str, any]:
  return self.val_info

+ def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
+ """
+ Add a note/footnote to the validation step.
+
+ This internal method adds a note entry to the validation step's notes dictionary.
+ Notes are displayed as footnotes in validation reports and included in log output.
+
+ Parameters
+ ----------
+ key
+ A unique identifier for the note. If a note with this key already exists, it will
+ be overwritten.
+ markdown
+ The note content formatted with Markdown. This version is used for display in
+ HTML reports and other rich text formats.
+ text
+ The note content as plain text. This version is used for log files and text-based
+ output. If not provided, the markdown version will be used (with markdown formatting
+ intact).
+
+ Examples
+ --------
+ ```python
+ # Add a note about evaluation failure
+ validation_info._add_note(
+ key="eval_error",
+ markdown="Column expression evaluation **failed**",
+ text="Column expression evaluation failed"
+ )
+
+ # Add a note about LLM response
+ validation_info._add_note(
+ key="llm_response",
+ markdown="LLM validation returned `200` passing rows",
+ text="LLM validation returned 200 passing rows"
+ )
+ ```
+ """
+ # Initialize notes dictionary if it doesn't exist
+ if self.notes is None:
+ self.notes = {}
+
+ # Use markdown as text if text is not provided
+ if text is None:
+ text = markdown
+
+ # Add the note entry
+ self.notes[key] = {"markdown": markdown, "text": text}
+
+ def _get_notes(self, format: str = "dict") -> dict[str, dict[str, str]] | list[str] | None:
+ """
+ Get notes associated with this validation step.
+
+ Parameters
+ ----------
+ format
+ The format to return notes in:
+ - `"dict"`: Returns the full notes dictionary (default)
+ - `"markdown"`: Returns a list of markdown-formatted note values
+ - `"text"`: Returns a list of plain text note values
+ - `"keys"`: Returns a list of note keys
+
+ Returns
+ -------
+ dict, list, or None
+ The notes in the requested format, or `None` if no notes exist.
+
+ Examples
+ --------
+ ```python
+ # Get all notes as dictionary
+ notes = validation_info._get_notes()
+ # Returns: {'key1': {'markdown': '...', 'text': '...'}, ...}
+
+ # Get just markdown versions
+ markdown_notes = validation_info._get_notes(format="markdown")
+ # Returns: ['First note with **emphasis**', 'Second note']
+
+ # Get just plain text versions
+ text_notes = validation_info._get_notes(format="text")
+ # Returns: ['First note with emphasis', 'Second note']
+
+ # Get just the keys
+ keys = validation_info._get_notes(format="keys")
+ # Returns: ['key1', 'key2']
+ ```
+ """
+ if self.notes is None:
+ return None
+
+ if format == "dict":
+ return self.notes
+ elif format == "markdown":
+ return [note["markdown"] for note in self.notes.values()]
+ elif format == "text":
+ return [note["text"] for note in self.notes.values()]
+ elif format == "keys":
+ return list(self.notes.keys())
+ else:
+ raise ValueError(
+ f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text', 'keys'"
+ )
+
+ def _get_note(self, key: str, format: str = "dict") -> dict[str, str] | str | None:
+ """
+ Get a specific note by its key.
+
+ Parameters
+ ----------
+ key
+ The unique identifier of the note to retrieve.
+ format
+ The format to return the note in:
+ - `"dict"`: Returns `{'markdown': '...', 'text': '...'}` (default)
+ - `"markdown"`: Returns just the markdown string
+ - `"text"`: Returns just the plain text string
+
+ Returns
+ -------
+ dict, str, or None
+ The note in the requested format, or `None` if the note doesn't exist.
+
+ Examples
+ --------
+ ```python
+ # Get a specific note as dictionary
+ note = validation_info._get_note("threshold_info")
+ # Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
+
+ # Get just the markdown version
+ markdown = validation_info._get_note("threshold_info", format="markdown")
+ # Returns: 'Using **default** thresholds'
+
+ # Get just the text version
+ text = validation_info._get_note("threshold_info", format="text")
+ # Returns: 'Using default thresholds'
+ ```
+ """
+ if self.notes is None or key not in self.notes:
+ return None
+
+ note = self.notes[key]
+
+ if format == "dict":
+ return note
+ elif format == "markdown":
+ return note["markdown"]
+ elif format == "text":
+ return note["text"]
+ else:
+ raise ValueError(
+ f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text'"
+ )
+
+ def _has_notes(self) -> bool:
+ """
+ Check if this validation step has any notes.
+
+ Returns
+ -------
+ bool
+ `True` if the validation step has notes, `False` otherwise.
+
+ Examples
+ --------
+ ```python
+ if validation_info._has_notes():
+ print("This step has notes")
+ ```
+ """
+ return self.notes is not None and len(self.notes) > 0
+

  def connect_to_table(connection_string: str) -> Any:
  """
@@ -7718,9 +7895,12 @@ class Validate:

  return self

- def col_vals_null(
+ def col_vals_increasing(
  self,
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+ allow_stationary: bool = False,
+ decreasing_tol: float | None = None,
+ na_pass: bool = False,
  pre: Callable | None = None,
  segments: SegmentSpec | None = None,
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7729,11 +7909,14 @@ class Validate:
  active: bool = True,
  ) -> Validate:
  """
- Validate whether values in a column are Null.
+ Are column data increasing by row?

- The `col_vals_null()` validation method checks whether column values in a table are Null.
- This validation will operate over the number of test units that is equal to the number
- of rows in the table.
+ The `col_vals_increasing()` validation method checks whether column values in a table are
+ increasing when moving down a table. There are options for allowing missing values in the
+ target column, allowing stationary phases (where consecutive values don't change), and even
+ one for allowing decreasing movements up to a certain threshold. This validation will
+ operate over the number of test units that is equal to the number of rows in the table
+ (determined after any `pre=` mutation has been applied).

  Parameters
  ----------
@@ -7742,6 +7925,20 @@ class Validate:
  [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
  multiple columns are supplied or resolved, there will be a separate validation step
  generated for each column.
+ allow_stationary
+ An option to allow pauses in increasing values. For example, if the values for the test
+ units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time)
+ would be marked as failing when `allow_stationary` is `False`. Using
+ `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]`
+ being marked as passing.
+ decreasing_tol
+ An optional threshold value that allows for movement of numerical values in the negative
+ direction. By default this is `None`, but using a numerical value will set the absolute
+ threshold of negative travel allowed across numerical test units. Note that setting a
+ value here also has the effect of setting `allow_stationary` to `True`.
+ na_pass
+ Should any encountered None, NA, or Null values be considered as passing test units? By
+ default, this is `False`. Set to `True` to pass test units with missing values.
  pre
  An optional preprocessing function or lambda to apply to the data table during
  interrogation. This function should take a table as input and return a modified table.
@@ -7778,89 +7975,6 @@ class Validate:
  Validate
  The `Validate` object with the added validation step.

- Preprocessing
- -------------
- The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
- table during interrogation. This function should take a table as input and return a modified
- table. This is useful for performing any necessary transformations or filtering on the data
- before the validation step is applied.
-
- The preprocessing function can be any callable that takes a table as input and returns a
- modified table. For example, you could use a lambda function to filter the table based on
- certain criteria or to apply a transformation to the data. Note that you can refer to
- a column via `columns=` that is expected to be present in the transformed table, but may not
- exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
- only exists during the validation step and is not stored in the `Validate` object or used in
- subsequent validation steps.
-
- Segmentation
- ------------
- The `segments=` argument allows for the segmentation of a validation step into multiple
- segments. This is useful for applying the same validation step to different subsets of the
- data. The segmentation can be done based on a single column or specific fields within a
- column.
-
- Providing a single column name will result in a separate validation step for each unique
- value in that column. For example, if you have a column called `"region"` with values
- `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
- region.
-
- Alternatively, you can provide a tuple that specifies a column name and its corresponding
- values to segment on. For example, if you have a column called `"date"` and you want to
- segment on only specific dates, you can provide a tuple like
- `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
- (i.e., no validation steps will be created for them).
-
- A list with a combination of column names and tuples can be provided as well. This allows
- for more complex segmentation scenarios. The following inputs are both valid:
-
- ```
- # Segments from all unique values in the `region` column
- # and specific dates in the `date` column
- segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
-
- # Segments from all unique values in the `region` and `date` columns
- segments=["region", "date"]
- ```
-
- The segmentation is performed during interrogation, and the resulting validation steps will
- be numbered sequentially. Each segment will have its own validation step, and the results
- will be reported separately. This allows for a more granular analysis of the data and helps
- identify issues within specific segments.
-
- Importantly, the segmentation process will be performed after any preprocessing of the data
- table. Because of this, one can conceivably use the `pre=` argument to generate a column
- that can be used for segmentation. For example, you could create a new column called
- `"segment"` through use of `pre=` and then use that column for segmentation.
-
- Thresholds
- ----------
- The `thresholds=` parameter is used to set the failure-condition levels for the validation
- step. If they are set here at the step level, these thresholds will override any thresholds
- set at the global level in `Validate(thresholds=...)`.
-
- There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
- can either be set as a proportion failing of all test units (a value between `0` to `1`),
- or, the absolute number of failing test units (as integer that's `1` or greater).
-
- Thresholds can be defined using one of these input schemes:
-
- 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
- thresholds)
- 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
- the 'error' level, and position `2` is the 'critical' level
- 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
- 'critical'
- 4. a single integer/float value denoting absolute number or fraction of failing test units
- for the 'warning' level only
-
- If the number of failing test units exceeds set thresholds, the validation step will be
- marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
- set, you're free to set any combination of them.
-
- Aside from reporting failure conditions, thresholds can be used to determine the actions to
- take for each level of failure (using the `actions=` parameter).
-
  Examples
  --------
  ```{python}
@@ -7869,8 +7983,9 @@ class Validate:
  import pointblank as pb
  pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
  ```
- For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
- `b`). The table is shown below:
+
+ For the examples here, we'll use a simple Polars DataFrame with three numeric columns
+ (`a`, `b`, and `c`). The table is shown below:

  ```{python}
  import pointblank as pb
@@ -7878,54 +7993,55 @@ class Validate:

  tbl = pl.DataFrame(
  {
- "a": [None, None, None, None],
- "b": [None, 2, None, 9],
+ "a": [1, 2, 3, 4, 5, 6],
+ "b": [1, 2, 2, 3, 4, 5],
+ "c": [1, 2, 1, 3, 4, 5],
  }
- ).with_columns(pl.col("a").cast(pl.Int64))
+ )

  pb.preview(tbl)
  ```

- Let's validate that values in column `a` are all Null values. We'll determine if this
- validation had any failing test units (there are four test units, one for each row).
+ Let's validate that values in column `a` are increasing. We'll determine if this validation
+ had any failing test units (there are six test units, one for each row).

  ```{python}
  validation = (
  pb.Validate(data=tbl)
- .col_vals_null(columns="a")
+ .col_vals_increasing(columns="a")
  .interrogate()
  )

  validation
  ```

- Printing the `validation` object shows the validation table in an HTML viewing environment.
- The validation table shows the single entry that corresponds to the validation step created
- by using `col_vals_null()`. All test units passed, and there are no failing test units.
-
- Now, let's use that same set of values for a validation on column `b`.
+ The validation passed, as all values in column `a` are increasing. Now let's check column
+ `b`, which has a stationary value:

  ```{python}
  validation = (
  pb.Validate(data=tbl)
- .col_vals_null(columns="b")
+ .col_vals_increasing(columns="b")
  .interrogate()
  )

  validation
  ```

- The validation table reports two failing test units. The specific failing cases are for the
- two non-Null values in column `b`.
- """
- assertion_type = _get_fn_name()
+ This validation fails at the third row because the value `2` is repeated. If we want to
+ allow stationary values, we can use `allow_stationary=True`:

- _check_column(column=columns)
- _check_pre(pre=pre)
- # TODO: add check for segments
- # _check_segments(segments=segments)
- _check_thresholds(thresholds=thresholds)
- _check_boolean_input(param=active, param_name="active")
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_increasing(columns="b", allow_stationary=True)
+ .interrogate()
+ )
+
+ validation
+ ```
+ """
+ assertion_type = "col_vals_increasing"

  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
  thresholds = (
@@ -7949,21 +8065,30 @@ class Validate:
  val_info = _ValidationInfo(
  assertion_type=assertion_type,
  column=column,
+ values="",
+ na_pass=na_pass,
  pre=pre,
  segments=segments,
  thresholds=thresholds,
  actions=actions,
  brief=brief,
  active=active,
+ val_info={
+ "allow_stationary": allow_stationary,
+ "decreasing_tol": decreasing_tol if decreasing_tol else 0.0,
+ },
  )

  self._add_validation(validation_info=val_info)

  return self

- def col_vals_not_null(
+ def col_vals_decreasing(
  self,
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+ allow_stationary: bool = False,
+ increasing_tol: float | None = None,
+ na_pass: bool = False,
  pre: Callable | None = None,
  segments: SegmentSpec | None = None,
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7972,11 +8097,14 @@ class Validate:
  active: bool = True,
  ) -> Validate:
  """
- Validate whether values in a column are not Null.
+ Are column data decreasing by row?

- The `col_vals_not_null()` validation method checks whether column values in a table are not
- Null. This validation will operate over the number of test units that is equal to the number
- of rows in the table.
+ The `col_vals_decreasing()` validation method checks whether column values in a table are
+ decreasing when moving down a table. There are options for allowing missing values in the
+ target column, allowing stationary phases (where consecutive values don't change), and even
+ one for allowing increasing movements up to a certain threshold. This validation will
+ operate over the number of test units that is equal to the number of rows in the table
+ (determined after any `pre=` mutation has been applied).

  Parameters
  ----------
@@ -7985,6 +8113,20 @@ class Validate:
  [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
  multiple columns are supplied or resolved, there will be a separate validation step
  generated for each column.
+ allow_stationary
+ An option to allow pauses in decreasing values. For example, if the values for the test
+ units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time)
+ would be marked as failing when `allow_stationary` is `False`. Using
+ `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]`
+ being marked as passing.
+ increasing_tol
+ An optional threshold value that allows for movement of numerical values in the positive
+ direction. By default this is `None`, but using a numerical value will set the absolute
+ threshold of positive travel allowed across numerical test units. Note that setting a
+ value here also has the effect of setting `allow_stationary` to `True`.
+ na_pass
+ Should any encountered None, NA, or Null values be considered as passing test units? By
+ default, this is `False`. Set to `True` to pass test units with missing values.
  pre
  An optional preprocessing function or lambda to apply to the data table during
  interrogation. This function should take a table as input and return a modified table.
@@ -8021,38 +8163,449 @@ class Validate:
  Validate
  The `Validate` object with the added validation step.

- Preprocessing
- -------------
- The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
- table during interrogation. This function should take a table as input and return a modified
- table. This is useful for performing any necessary transformations or filtering on the data
- before the validation step is applied.
+ Examples
+ --------
+ ```{python}
+ #| echo: false
+ #| output: false
+ import pointblank as pb
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+ ```

- The preprocessing function can be any callable that takes a table as input and returns a
- modified table. For example, you could use a lambda function to filter the table based on
- certain criteria or to apply a transformation to the data. Note that you can refer to
- a column via `columns=` that is expected to be present in the transformed table, but may not
- exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
- only exists during the validation step and is not stored in the `Validate` object or used in
- subsequent validation steps.
+ For the examples here, we'll use a simple Polars DataFrame with three numeric columns
+ (`a`, `b`, and `c`). The table is shown below:

- Segmentation
- ------------
- The `segments=` argument allows for the segmentation of a validation step into multiple
- segments. This is useful for applying the same validation step to different subsets of the
- data. The segmentation can be done based on a single column or specific fields within a
- column.
+ ```{python}
+ import pointblank as pb
+ import polars as pl

- Providing a single column name will result in a separate validation step for each unique
- value in that column. For example, if you have a column called `"region"` with values
- `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
- region.
+ tbl = pl.DataFrame(
+ {
+ "a": [6, 5, 4, 3, 2, 1],
+ "b": [5, 4, 4, 3, 2, 1],
+ "c": [5, 4, 5, 3, 2, 1],
+ }
+ )

- Alternatively, you can provide a tuple that specifies a column name and its corresponding
- values to segment on. For example, if you have a column called `"date"` and you want to
- segment on only specific dates, you can provide a tuple like
- `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
- (i.e., no validation steps will be created for them).
+ pb.preview(tbl)
+ ```
+
+ Let's validate that values in column `a` are decreasing. We'll determine if this validation
+ had any failing test units (there are six test units, one for each row).
+
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_decreasing(columns="a")
+ .interrogate()
+ )
+
+ validation
+ ```
+
+ The validation passed, as all values in column `a` are decreasing. Now let's check column
+ `b`, which has a stationary value:
+
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_decreasing(columns="b")
+ .interrogate()
+ )
+
+ validation
+ ```
+
+ This validation fails at the third row because the value `4` is repeated. If we want to
+ allow stationary values, we can use `allow_stationary=True`:
+
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_decreasing(columns="b", allow_stationary=True)
+ .interrogate()
+ )
+
+ validation
+ ```
+ """
+ assertion_type = "col_vals_decreasing"
+
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+ thresholds = (
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+ )
+
+ # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+ # resolve the columns
+ if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+ columns = col(columns)
+
+ # If `columns` is Column value or a string, place it in a list for iteration
+ if isinstance(columns, (Column, str)):
+ columns = [columns]
+
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+ # Iterate over the columns and create a validation step for each
+ for column in columns:
+ val_info = _ValidationInfo(
+ assertion_type=assertion_type,
+ column=column,
+ values="",
+ na_pass=na_pass,
+ pre=pre,
+ segments=segments,
+ thresholds=thresholds,
+ actions=actions,
+ brief=brief,
+ active=active,
+ val_info={
+ "allow_stationary": allow_stationary,
+ "increasing_tol": increasing_tol if increasing_tol else 0.0,
+ },
+ )
+
+ self._add_validation(validation_info=val_info)
+
+ return self
+
+ def col_vals_null(
+ self,
+ columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+ pre: Callable | None = None,
+ segments: SegmentSpec | None = None,
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
+ actions: Actions | None = None,
+ brief: str | bool | None = None,
+ active: bool = True,
+ ) -> Validate:
+ """
+ Validate whether values in a column are Null.
+
+ The `col_vals_null()` validation method checks whether column values in a table are Null.
+ This validation will operate over the number of test units that is equal to the number
+ of rows in the table.
+
+ Parameters
+ ----------
+ columns
+ A single column or a list of columns to validate. Can also use
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+ multiple columns are supplied or resolved, there will be a separate validation step
+ generated for each column.
+ pre
+ An optional preprocessing function or lambda to apply to the data table during
+ interrogation. This function should take a table as input and return a modified table.
+ Have a look at the *Preprocessing* section for more information on how to use this
+ argument.
+ segments
+ An optional directive on segmentation, which serves to split a validation step into
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
+ column name and its corresponding values to segment on, or a combination of both
+ (provided as a list). Read the *Segmentation* section for usage information.
+ thresholds
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
+ The thresholds are set at the step level and will override any global thresholds set in
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+ section for information on how to set threshold levels.
+ actions
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+ define the actions.
+ brief
+ An optional brief description of the validation step that will be displayed in the
+ reporting table. You can use the templating elements like `"{step}"` to insert
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+ the entire brief will be automatically generated. If `None` (the default) then there
+ won't be a brief.
+ active
+ A boolean value indicating whether the validation step should be active. Using `False`
+ will make the validation step inactive (still reporting its presence and keeping indexes
+ for the steps unchanged).
+
+ Returns
+ -------
+ Validate
+ The `Validate` object with the added validation step.
+
+ Preprocessing
+ -------------
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+ table during interrogation. This function should take a table as input and return a modified
+ table. This is useful for performing any necessary transformations or filtering on the data
+ before the validation step is applied.
+
+ The preprocessing function can be any callable that takes a table as input and returns a
+ modified table. For example, you could use a lambda function to filter the table based on
+ certain criteria or to apply a transformation to the data. Note that you can refer to
+ a column via `columns=` that is expected to be present in the transformed table, but may not
+ exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
+ only exists during the validation step and is not stored in the `Validate` object or used in
+ subsequent validation steps.
+
+ Segmentation
+ ------------
+ The `segments=` argument allows for the segmentation of a validation step into multiple
+ segments. This is useful for applying the same validation step to different subsets of the
+ data. The segmentation can be done based on a single column or specific fields within a
+ column.
+
+ Providing a single column name will result in a separate validation step for each unique
+ value in that column. For example, if you have a column called `"region"` with values
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+ region.
+
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
+ values to segment on. For example, if you have a column called `"date"` and you want to
+ segment on only specific dates, you can provide a tuple like
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+ (i.e., no validation steps will be created for them).
+
+ A list with a combination of column names and tuples can be provided as well. This allows
+ for more complex segmentation scenarios. The following inputs are both valid:
+
+ ```
+ # Segments from all unique values in the `region` column
+ # and specific dates in the `date` column
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+ # Segments from all unique values in the `region` and `date` columns
+ segments=["region", "date"]
+ ```
+
+ The segmentation is performed during interrogation, and the resulting validation steps will
+ be numbered sequentially. Each segment will have its own validation step, and the results
+ will be reported separately. This allows for a more granular analysis of the data and helps
+ identify issues within specific segments.
+
+ Importantly, the segmentation process will be performed after any preprocessing of the data
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
+ that can be used for segmentation. For example, you could create a new column called
+ `"segment"` through use of `pre=` and then use that column for segmentation.
+
+ Thresholds
+ ----------
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
+ step. If they are set here at the step level, these thresholds will override any thresholds
+ set at the global level in `Validate(thresholds=...)`.
+
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+ can either be set as a proportion failing of all test units (a value between `0` and `1`),
+ or, the absolute number of failing test units (as an integer that's `1` or greater).
+
+ Thresholds can be defined using one of these input schemes:
+
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+ thresholds)
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+ the 'error' level, and position `2` is the 'critical' level
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+ 'critical'
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
+ for the 'warning' level only
+
+ If the number of failing test units exceeds set thresholds, the validation step will be
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
+ set; you're free to set any combination of them.
+
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
+ take for each level of failure (using the `actions=` parameter).
+
+ Examples
+ --------
+ ```{python}
+ #| echo: false
+ #| output: false
+ import pointblank as pb
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+ ```
+ For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
+ `b`). The table is shown below:
+
+ ```{python}
+ import pointblank as pb
+ import polars as pl
+
+ tbl = pl.DataFrame(
+ {
+ "a": [None, None, None, None],
+ "b": [None, 2, None, 9],
+ }
+ ).with_columns(pl.col("a").cast(pl.Int64))
+
+ pb.preview(tbl)
+ ```
+
+ Let's validate that values in column `a` are all Null values. We'll determine if this
+ validation had any failing test units (there are four test units, one for each row).
+
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_null(columns="a")
+ .interrogate()
+ )
+
+ validation
+ ```
+
+ Printing the `validation` object shows the validation table in an HTML viewing environment.
+ The validation table shows the single entry that corresponds to the validation step created
+ by using `col_vals_null()`. All test units passed, and there are no failing test units.
+
+ Now, let's use that same set of values for a validation on column `b`.
+
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_null(columns="b")
+ .interrogate()
+ )
+
+ validation
+ ```
+
+ The validation table reports two failing test units. The specific failing cases are for the
+ two non-Null values in column `b`.
+ """
+ assertion_type = _get_fn_name()
+
+ _check_column(column=columns)
+ _check_pre(pre=pre)
+ # TODO: add check for segments
+ # _check_segments(segments=segments)
+ _check_thresholds(thresholds=thresholds)
+ _check_boolean_input(param=active, param_name="active")
+
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+ thresholds = (
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+ )
+
+ # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+ # resolve the columns
+ if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+ columns = col(columns)
+
+ # If `columns` is Column value or a string, place it in a list for iteration
+ if isinstance(columns, (Column, str)):
+ columns = [columns]
+
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+ # Iterate over the columns and create a validation step for each
+ for column in columns:
+ val_info = _ValidationInfo(
+ assertion_type=assertion_type,
+ column=column,
+ pre=pre,
+ segments=segments,
+ thresholds=thresholds,
+ actions=actions,
+ brief=brief,
+ active=active,
+ )
+
+ self._add_validation(validation_info=val_info)
+
+ return self
+
+ def col_vals_not_null(
+ self,
+ columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+ pre: Callable | None = None,
+ segments: SegmentSpec | None = None,
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
+ actions: Actions | None = None,
+ brief: str | bool | None = None,
+ active: bool = True,
+ ) -> Validate:
+ """
+ Validate whether values in a column are not Null.
+
+ The `col_vals_not_null()` validation method checks whether column values in a table are not
+ Null. This validation will operate over the number of test units that is equal to the number
+ of rows in the table.
+
+ Parameters
+ ----------
+ columns
+ A single column or a list of columns to validate. Can also use
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+ multiple columns are supplied or resolved, there will be a separate validation step
+ generated for each column.
+ pre
+ An optional preprocessing function or lambda to apply to the data table during
+ interrogation. This function should take a table as input and return a modified table.
+ Have a look at the *Preprocessing* section for more information on how to use this
+ argument.
+ segments
+ An optional directive on segmentation, which serves to split a validation step into
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
+ column name and its corresponding values to segment on, or a combination of both
+ (provided as a list). Read the *Segmentation* section for usage information.
+ thresholds
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
+ The thresholds are set at the step level and will override any global thresholds set in
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+ section for information on how to set threshold levels.
+ actions
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+ define the actions.
+ brief
+ An optional brief description of the validation step that will be displayed in the
+ reporting table. You can use the templating elements like `"{step}"` to insert
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+ the entire brief will be automatically generated. If `None` (the default) then there
+ won't be a brief.
+ active
+ A boolean value indicating whether the validation step should be active. Using `False`
+ will make the validation step inactive (still reporting its presence and keeping indexes
+ for the steps unchanged).
+
+ Returns
+ -------
+ Validate
+ The `Validate` object with the added validation step.
+
+ Preprocessing
+ -------------
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+ table during interrogation. This function should take a table as input and return a modified
+ table. This is useful for performing any necessary transformations or filtering on the data
+ before the validation step is applied.
+
+ The preprocessing function can be any callable that takes a table as input and returns a
+ modified table. For example, you could use a lambda function to filter the table based on
+ certain criteria or to apply a transformation to the data. Note that you can refer to
+ a column via `columns=` that is expected to be present in the transformed table, but may not
+ exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
+ only exists during the validation step and is not stored in the `Validate` object or used in
+ subsequent validation steps.
+
+ Segmentation
+ ------------
+ The `segments=` argument allows for the segmentation of a validation step into multiple
+ segments. This is useful for applying the same validation step to different subsets of the
+ data. The segmentation can be done based on a single column or specific fields within a
+ column.
+
+ Providing a single column name will result in a separate validation step for each unique
+ value in that column. For example, if you have a column called `"region"` with values
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+ region.
+
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
+ values to segment on. For example, if you have a column called `"date"` and you want to
+ segment on only specific dates, you can provide a tuple like
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+ (i.e., no validation steps will be created for them).

  A list with a combination of column names and tuples can be provided as well. This allows
  for more complex segmentation scenarios. The following inputs are both valid:
@@ -8232,14 +8785,278 @@ class Validate:
  [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
  multiple columns are supplied or resolved, there will be a separate validation step
  generated for each column.
- pattern
- A regular expression pattern to compare against.
+ pattern
+ A regular expression pattern to compare against.
+ na_pass
+ Should any encountered None, NA, or Null values be considered as passing test units? By
+ default, this is `False`. Set to `True` to pass test units with missing values.
+ inverse
+ Should the validation step be inverted? If `True`, then the expectation is that column
+ values should *not* match the specified `pattern=` regex.
+ pre
+ An optional preprocessing function or lambda to apply to the data table during
+ interrogation. This function should take a table as input and return a modified table.
+ Have a look at the *Preprocessing* section for more information on how to use this
+ argument.
+ segments
+ An optional directive on segmentation, which serves to split a validation step into
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
+ column name and its corresponding values to segment on, or a combination of both
+ (provided as a list). Read the *Segmentation* section for usage information.
+ thresholds
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
+ The thresholds are set at the step level and will override any global thresholds set in
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+ section for information on how to set threshold levels.
+ actions
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+ define the actions.
+ brief
+ An optional brief description of the validation step that will be displayed in the
+ reporting table. You can use the templating elements like `"{step}"` to insert
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+ the entire brief will be automatically generated. If `None` (the default) then there
+ won't be a brief.
+ active
+ A boolean value indicating whether the validation step should be active. Using `False`
+ will make the validation step inactive (still reporting its presence and keeping indexes
+ for the steps unchanged).
+
+ Returns
+ -------
+ Validate
+ The `Validate` object with the added validation step.
+
+ Preprocessing
+ -------------
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+ table during interrogation. This function should take a table as input and return a modified
+ table. This is useful for performing any necessary transformations or filtering on the data
+ before the validation step is applied.
+
+ The preprocessing function can be any callable that takes a table as input and returns a
+ modified table. For example, you could use a lambda function to filter the table based on
+ certain criteria or to apply a transformation to the data. Note that you can refer to
+ a column via `columns=` that is expected to be present in the transformed table, but may not
+ exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
+ only exists during the validation step and is not stored in the `Validate` object or used in
+ subsequent validation steps.
+
+ Segmentation
+ ------------
+ The `segments=` argument allows for the segmentation of a validation step into multiple
+ segments. This is useful for applying the same validation step to different subsets of the
+ data. The segmentation can be done based on a single column or specific fields within a
+ column.
+
+ Providing a single column name will result in a separate validation step for each unique
+ value in that column. For example, if you have a column called `"region"` with values
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+ region.
+
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
+ values to segment on. For example, if you have a column called `"date"` and you want to
+ segment on only specific dates, you can provide a tuple like
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+ (i.e., no validation steps will be created for them).
+
+ A list with a combination of column names and tuples can be provided as well. This allows
+ for more complex segmentation scenarios. The following inputs are both valid:
+
+ ```
+ # Segments from all unique values in the `region` column
+ # and specific dates in the `date` column
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+ # Segments from all unique values in the `region` and `date` columns
+ segments=["region", "date"]
+ ```
+
+ The segmentation is performed during interrogation, and the resulting validation steps will
+ be numbered sequentially. Each segment will have its own validation step, and the results
+ will be reported separately. This allows for a more granular analysis of the data and helps
+ identify issues within specific segments.
+
+ Importantly, the segmentation process will be performed after any preprocessing of the data
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
+ that can be used for segmentation. For example, you could create a new column called
+ `"segment"` through use of `pre=` and then use that column for segmentation.
+
+ Thresholds
+ ----------
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
+ step. If they are set here at the step level, these thresholds will override any thresholds
+ set at the global level in `Validate(thresholds=...)`.
+
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+ can either be set as a proportion failing of all test units (a value between `0` and `1`),
+ or, the absolute number of failing test units (as an integer that's `1` or greater).
+
+ Thresholds can be defined using one of these input schemes:
+
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+ thresholds)
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+ the 'error' level, and position `2` is the 'critical' level
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+ 'critical'
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
+ for the 'warning' level only
+
+ If the number of failing test units exceeds set thresholds, the validation step will be
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
+ set; you're free to set any combination of them.
+
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
+ take for each level of failure (using the `actions=` parameter).
+
+ Examples
+ --------
+ ```{python}
+ #| echo: false
+ #| output: false
+ import pointblank as pb
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+ ```
+ For the examples here, we'll use a simple Polars DataFrame with two string columns (`a` and
+ `b`). The table is shown below:
+
+ ```{python}
+ import pointblank as pb
+ import polars as pl
+
+ tbl = pl.DataFrame(
+ {
+ "a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"],
+ "b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"],
+ }
+ )
+
+ pb.preview(tbl)
+ ```
+
+ Let's validate that all of the values in column `a` match a particular regex pattern. We'll
+ determine if this validation had any failing test units (there are four test units, one for
+ each row).
+
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
+ .interrogate()
+ )
+
+ validation
+ ```
+
+ Printing the `validation` object shows the validation table in an HTML viewing environment.
+ The validation table shows the single entry that corresponds to the validation step created
+ by using `col_vals_regex()`. All test units passed, and there are no failing test units.
+
+ Now, let's use the same regex for a validation on column `b`.
+
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}")
+ .interrogate()
+ )
+
+ validation
+ ```
+
+ The validation table reports two failing test units. The specific failing cases are for the
+ string values of rows 1 and 2 in column `b`.
+ """
+
+ assertion_type = _get_fn_name()
+
+ _check_column(column=columns)
+ _check_pre(pre=pre)
+ # TODO: add check for segments
+ # _check_segments(segments=segments)
+ _check_thresholds(thresholds=thresholds)
+ _check_boolean_input(param=na_pass, param_name="na_pass")
+ _check_boolean_input(param=inverse, param_name="inverse")
+ _check_boolean_input(param=active, param_name="active")
+
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+ thresholds = (
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+ )
+
+ # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+ # resolve the columns
+ if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+ columns = col(columns)
+
+ # If `columns` is Column value or a string, place it in a list for iteration
+ if isinstance(columns, (Column, str)):
+ columns = [columns]
+
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+ # Package up the `pattern=` and boolean params into a dictionary for later interrogation
+ values = {"pattern": pattern, "inverse": inverse}
+
+ # Iterate over the columns and create a validation step for each
+ for column in columns:
+ val_info = _ValidationInfo(
+ assertion_type=assertion_type,
+ column=column,
+ values=values,
+ na_pass=na_pass,
+ pre=pre,
+ segments=segments,
+ thresholds=thresholds,
+ actions=actions,
+ brief=brief,
+ active=active,
+ )
+
+ self._add_validation(validation_info=val_info)
+
+ return self
+
+ def col_vals_within_spec(
+ self,
+ columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+ spec: str,
+ na_pass: bool = False,
+ pre: Callable | None = None,
+ segments: SegmentSpec | None = None,
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
+ actions: Actions | None = None,
+ brief: str | bool | None = None,
+ active: bool = True,
+ ) -> Validate:
+ """
+ Validate whether column values fit within a specification.
+
+ The `col_vals_within_spec()` validation method checks whether column values in a table
+ correspond to a specification (`spec=`) type (details of which are available in the
+ *Specifications* section). Specifications include common data types like email addresses,
+ URLs, postal codes, vehicle identification numbers (VINs), International Bank Account
+ Numbers (IBANs), and more. This validation will operate over the number of test units that
+ is equal to the number of rows in the table.
+
+ Parameters
+ ----------
+ columns
+ A single column or a list of columns to validate. Can also use
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+ multiple columns are supplied or resolved, there will be a separate validation step
+ generated for each column.
+ spec
+ A specification string for defining the specification type. Examples are `"email"`,
+ `"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available
+ options.
  na_pass
  Should any encountered None, NA, or Null values be considered as passing test units? By
  default, this is `False`. Set to `True` to pass test units with missing values.
- inverse
- Should the validation step be inverted? If `True`, then the expectation is that column
- values should *not* match the specified `pattern=` regex.
  pre
  An optional preprocessing function or lambda to apply to the data table during
  interrogation. This function should take a table as input and return a modified table.
@@ -8276,6 +9093,40 @@ class Validate:
8276
9093
  Validate
8277
9094
  The `Validate` object with the added validation step.
8278
9095
 
9096
+ Specifications
9097
+ --------------
9098
+ A specification type must be used with the `spec=` argument. This is a string-based keyword
9099
+ that corresponds to the type of data in the specified columns. The following keywords can
9100
+ be used:
9101
+
9102
+ - `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier
9103
+ for books. This keyword validates both 10-digit and 13-digit ISBNs.
9104
+
9105
+ - `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive
9106
+ industry to identify individual motor vehicles.
9107
+
9108
+ - `"postal_code[<country_code>]"`: A postal code (also known as postcodes, PIN, or ZIP
9109
+ codes) is a series of letters, digits, or both included in a postal address. Because the
9110
+ coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or
9111
+ 3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or
9112
+ `"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes.
9113
+
9114
+ - `"credit_card"`: A credit card number can be validated across a variety of issuers. The
9115
+ validation uses the Luhn algorithm.
9116
+
9117
+ - `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of
9118
+ identifying bank accounts across countries. Because the length and coding vary by
9119
+ country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`).
9120
+
9121
+ - `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are
9122
+ unique identifiers for financial and non-financial institutions.
9123
+
9124
+ - `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email
9125
+ addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with
9126
+ their respective keywords.
9127
+
9128
+ Only a single `spec=` value should be provided per function call.
9129
+
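+ As a minimal sketch of how a specification keyword is used (here `"postal_code[US]"` from
+ the list above; the sample values are purely illustrative):
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl_zip = pl.DataFrame({"zip": ["10001", "94103", "not-a-zip"]})
+
+ # One test unit per row; the malformed value yields one failing test unit
+ validation = (
+     pb.Validate(data=tbl_zip)
+     .col_vals_within_spec(columns="zip", spec="postal_code[US]")
+     .interrogate()
+ )
+ ```
+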
8279
9130
  Preprocessing
8280
9131
  -------------
8281
9132
  The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
@@ -8367,8 +9218,9 @@ class Validate:
8367
9218
  import pointblank as pb
8368
9219
  pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
8369
9220
  ```
8370
- For the examples here, we'll use a simple Polars DataFrame with two string columns (`a` and
8371
- `b`). The table is shown below:
9221
+
9222
+ For the examples here, we'll use a simple Polars DataFrame with an email column. The table
9223
+ is shown below:
8372
9224
 
8373
9225
  ```{python}
8374
9226
  import pointblank as pb
@@ -8376,46 +9228,33 @@ class Validate:
8376
9228
 
8377
9229
  tbl = pl.DataFrame(
8378
9230
  {
8379
- "a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"],
8380
- "b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"],
9231
+ "email": [
9232
+ "user@example.com",
9233
+ "admin@test.org",
9234
+ "invalid-email",
9235
+ "contact@company.co.uk",
9236
+ ],
8381
9237
  }
8382
9238
  )
8383
9239
 
8384
9240
  pb.preview(tbl)
8385
9241
  ```
8386
9242
 
8387
- Let's validate that all of the values in column `a` match a particular regex pattern. We'll
8388
- determine if this validation had any failing test units (there are four test units, one for
8389
- each row).
8390
-
8391
- ```{python}
8392
- validation = (
8393
- pb.Validate(data=tbl)
8394
- .col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
8395
- .interrogate()
8396
- )
8397
-
8398
- validation
8399
- ```
8400
-
8401
- Printing the `validation` object shows the validation table in an HTML viewing environment.
8402
- The validation table shows the single entry that corresponds to the validation step created
8403
- by using `col_vals_regex()`. All test units passed, and there are no failing test units.
8404
-
8405
- Now, let's use the same regex for a validation on column `b`.
9243
+ Let's validate that all of the values in the `email` column are valid email addresses.
9244
+ We'll determine if this validation had any failing test units (there are four test units,
9245
+ one for each row).
8406
9246
 
8407
9247
  ```{python}
8408
9248
  validation = (
8409
9249
  pb.Validate(data=tbl)
8410
- .col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}")
9250
+ .col_vals_within_spec(columns="email", spec="email")
8411
9251
  .interrogate()
8412
9252
  )
8413
9253
 
8414
9254
  validation
8415
9255
  ```
8416
9256
 
8417
- The validation table reports two failing test units. The specific failing cases are for the
8418
- string values of rows 1 and 2 in column `b`.
9257
+ The validation table shows that one test unit failed (the invalid email address in row 3).
8419
9258
  """
8420
9259
 
8421
9260
  assertion_type = _get_fn_name()
@@ -8426,7 +9265,6 @@ class Validate:
8426
9265
  # _check_segments(segments=segments)
8427
9266
  _check_thresholds(thresholds=thresholds)
8428
9267
  _check_boolean_input(param=na_pass, param_name="na_pass")
8429
- _check_boolean_input(param=inverse, param_name="inverse")
8430
9268
  _check_boolean_input(param=active, param_name="active")
8431
9269
 
8432
9270
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
@@ -8446,8 +9284,8 @@ class Validate:
8446
9284
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
8447
9285
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
8448
9286
 
8449
- # Package up the `pattern=` and boolean params into a dictionary for later interrogation
8450
- values = {"pattern": pattern, "inverse": inverse}
9287
+ # Package up the `spec=` param into a dictionary for later interrogation
9288
+ values = {"spec": spec}
8451
9289
 
8452
9290
  # Iterate over the columns and create a validation step for each
8453
9291
  for column in columns:
@@ -9396,10 +10234,10 @@ class Validate:
9396
10234
  so try to include only the columns necessary for the validation.
9397
10235
  model
9398
10236
  The model to be used. This should be in the form of `provider:model` (e.g.,
9399
- `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`,
9400
- `"openai"`, `"ollama"`, and `"bedrock"`. The model name should be the specific model to
9401
- be used from the provider. Model names are subject to change so consult the provider's
9402
- documentation for the most up-to-date model names.
10237
+ `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
10238
+ `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
10239
+ the provider. Model names are subject to change, so consult the provider's documentation
10240
+ for the most up-to-date model names.
9403
10241
  batch_size
9404
10242
  Number of rows to process in each batch. Larger batches are more efficient but may hit
9405
10243
  API limits. Default is `1000`.
@@ -9551,13 +10389,6 @@ class Validate:
9551
10389
  - "Describe the quality of each row" (asks for description, not validation)
9552
10390
  - "How would you improve this data?" (asks for suggestions, not pass/fail)
9553
10391
 
9554
- Provider Setup
9555
- --------------
9556
- **OpenAI**: Set `OPENAI_API_KEY` environment variable or create `.env` file.
9557
- **Anthropic**: Set `ANTHROPIC_API_KEY` environment variable or create `.env` file.
9558
- **Ollama**: Ensure Ollama is running locally (default: http://localhost:11434).
9559
- **Bedrock**: Configure AWS credentials and region.
9560
-
9561
10392
  Performance Considerations
9562
10393
  --------------------------
9563
10394
  AI validation is significantly slower than traditional validation methods due to API calls
@@ -10089,63 +10920,242 @@ class Validate:
10089
10920
  pb.config(report_incl_header=False, report_incl_footer=False)
10090
10921
  ```
10091
10922
 
10092
- For the examples here, we'll use the built in dataset `"small_table"`. The table can be
10093
- obtained by calling `load_dataset("small_table")`.
10923
+ For the examples here, we'll use the built-in dataset `"small_table"`. The table can be
10924
+ obtained by calling `load_dataset("small_table")`.
10925
+
10926
+ ```{python}
10927
+ import pointblank as pb
10928
+
10929
+ small_table = pb.load_dataset("small_table")
10930
+
10931
+ pb.preview(small_table)
10932
+ ```
10933
+
10934
+ Let's validate that the number of rows in the table matches a fixed value. In this case, we
10935
+ will use the value `13` as the expected row count.
10936
+
10937
+ ```{python}
10938
+ validation = (
10939
+ pb.Validate(data=small_table)
10940
+ .row_count_match(count=13)
10941
+ .interrogate()
10942
+ )
10943
+
10944
+ validation
10945
+ ```
10946
+
10947
+ The validation table shows that the expectation value of `13` matches the actual count of
10948
+ rows in the target table. So, the single test unit passed.
10949
+
10950
+
10951
+ Let's modify our example to show the different ways we can allow some tolerance in our
10952
+ validation by using the `tol=` argument.
10953
+
10954
+ ```{python}
10955
+ smaller_small_table = small_table.sample(n=12)  # within the lower bound
10956
+ validation = (
10957
+ pb.Validate(data=smaller_small_table)
10958
+ .row_count_match(count=13, tol=(2, 0))  # minus 2, plus 0 (i.e., 11-13 rows pass)
10959
+ .interrogate()
10960
+ )
10961
+
10962
+ validation
10963
+
10964
+ validation = (
10965
+ pb.Validate(data=smaller_small_table)
10966
+ .row_count_match(count=13, tol=0.05)  # 5% tolerance around 13
10967
+ .interrogate()
10968
+ )
10969
+
10970
+ even_smaller_table = small_table.sample(n=2)
10971
+ validation = (
10972
+ pb.Validate(data=even_smaller_table)
10973
+ .row_count_match(count=13, tol=5)  # plus or minus 5; this test will fail
10974
+ .interrogate()
10975
+ )
10976
+
10977
+ validation
10978
+ ```
10979
+
10980
+ """
10981
+
10982
+ assertion_type = _get_fn_name()
10983
+
10984
+ _check_pre(pre=pre)
10985
+ _check_thresholds(thresholds=thresholds)
10986
+ _check_boolean_input(param=active, param_name="active")
10987
+ _check_boolean_input(param=inverse, param_name="inverse")
10988
+
10989
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10990
+ thresholds = (
10991
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10992
+ )
10993
+
10994
+ # If `count` is a DataFrame or table then use the row count of the DataFrame as
10995
+ # the expected count
10996
+ if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
10997
+ count = get_row_count(count)
10998
+
10999
+ # Check the integrity of tolerance
11000
+ bounds: AbsoluteBounds = _derive_bounds(ref=int(count), tol=tol)
11001
+
11002
+ # Package up the `count=` and boolean params into a dictionary for later interrogation
11003
+ values = {"count": count, "inverse": inverse, "abs_tol_bounds": bounds}
11004
+
11005
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
11006
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
11007
+
11008
+ val_info = _ValidationInfo(
11009
+ assertion_type=assertion_type,
11010
+ values=values,
11011
+ pre=pre,
11012
+ thresholds=thresholds,
11013
+ actions=actions,
11014
+ brief=brief,
11015
+ active=active,
11016
+ )
11017
+
11018
+ self._add_validation(validation_info=val_info)
11019
+
11020
+ return self
11021
+
11022
+ def col_count_match(
11023
+ self,
11024
+ count: int | FrameT | Any,
11025
+ inverse: bool = False,
11026
+ pre: Callable | None = None,
11027
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
11028
+ actions: Actions | None = None,
11029
+ brief: str | bool | None = None,
11030
+ active: bool = True,
11031
+ ) -> Validate:
11032
+ """
11033
+ Validate whether the column count of the table matches a specified count.
11034
+
11035
+ The `col_count_match()` method checks whether the column count of the target table matches a
11036
+ specified count. This validation will operate over a single test unit, which is whether the
11037
+ column count matches the specified count.
11038
+
11039
+ We also have the option to invert the validation step by setting `inverse=True`. This will
11040
+ make the expectation that the column count of the target table *does not* match the
11041
+ specified count.
11042
+
11043
+ Parameters
11044
+ ----------
11045
+ count
11046
+ The expected column count of the table. This can be an integer value, a Polars or Pandas
11047
+ DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
11048
+ count of that object will be used as the expected count.
11049
+ inverse
11050
+ Should the validation step be inverted? If `True`, then the expectation is that the
11051
+ column count of the target table should not match the specified `count=` value.
11052
+ pre
11053
+ An optional preprocessing function or lambda to apply to the data table during
11054
+ interrogation. This function should take a table as input and return a modified table.
11055
+ Have a look at the *Preprocessing* section for more information on how to use this
11056
+ argument.
11057
+ thresholds
11058
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
11059
+ The thresholds are set at the step level and will override any global thresholds set in
11060
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
11061
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
11062
+ section for information on how to set threshold levels.
11063
+ actions
11064
+ Optional actions to take when the validation step meets or exceeds any set threshold
11065
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
11066
+ define the actions.
11067
+ brief
11068
+ An optional brief description of the validation step that will be displayed in the
11069
+ reporting table. You can use the templating elements like `"{step}"` to insert
11070
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`,
11071
+ the entire brief will be automatically generated. If `None` (the default), then there
11072
+ won't be a brief.
11073
+ active
11074
+ A boolean value indicating whether the validation step should be active. Using `False`
11075
+ will make the validation step inactive (still reporting its presence and keeping indexes
11076
+ for the steps unchanged).
11077
+
11078
+ Returns
11079
+ -------
11080
+ Validate
11081
+ The `Validate` object with the added validation step.
11082
+
11083
+ Preprocessing
11084
+ -------------
11085
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
11086
+ table during interrogation. This function should take a table as input and return a modified
11087
+ table. This is useful for performing any necessary transformations or filtering on the data
11088
+ before the validation step is applied.
11089
+
11090
+ The preprocessing function can be any callable that takes a table as input and returns a
11091
+ modified table. For example, you could use a lambda function to filter the table based on
11092
+ certain criteria or to apply a transformation to the data. Regarding the lifetime of the
11093
+ transformed table, it only exists during the validation step and is not stored in the
11094
+ `Validate` object or used in subsequent validation steps.
11095
+
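+ As a quick sketch (the lambda and the column names here are hypothetical), a preprocessing
+ function can reshape the table before the column count is checked:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+
+ validation = (
+     pb.Validate(data=tbl)
+     # The preprocessed table has a single column, so `count=1` passes
+     .col_count_match(count=1, pre=lambda df: df.select("a"))
+     .interrogate()
+ )
+ ```
+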
11096
+ Thresholds
11097
+ ----------
11098
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
11099
+ step. If they are set here at the step level, these thresholds will override any thresholds
11100
+ set at the global level in `Validate(thresholds=...)`.
11101
+
11102
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
11103
+ can either be set as a proportion of failing test units (a value between `0` and `1`),
11104
+ or as the absolute number of failing test units (an integer that's `1` or greater).
11105
+
11106
+ Thresholds can be defined using one of these input schemes:
11107
+
11108
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
11109
+ thresholds)
11110
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
11111
+ the 'error' level, and position `2` is the 'critical' level
11112
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
11113
+ 'critical'
11114
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
11115
+ for the 'warning' level only
11116
+
11117
+ If the number of failing test units exceeds set thresholds, the validation step will be
11118
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
11119
+ set; you're free to set any combination of them.
11120
+
11121
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
11122
+ take for each level of failure (using the `actions=` parameter).
11123
+
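+ For instance, input schemes 1 and 2 above express the same failure levels (a brief
+ sketch; the specific values are arbitrary):
+
+ ```python
+ import pointblank as pb
+
+ # Scheme 2: a tuple of (warning, error, critical) levels
+ t1 = (0.1, 0.25, 0.5)
+
+ # Scheme 1: the equivalent `Thresholds` object
+ t2 = pb.Thresholds(warning=0.1, error=0.25, critical=0.5)
+ ```
+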
11124
+ Examples
11125
+ --------
11126
+ ```{python}
11127
+ #| echo: false
11128
+ #| output: false
11129
+ import pointblank as pb
11130
+ pb.config(report_incl_header=False, report_incl_footer=False)
11131
+ ```
11132
+
11133
+ For the examples here, we'll use the built-in dataset `"game_revenue"`. The table can be
11134
+ obtained by calling `load_dataset("game_revenue")`.
10094
11135
 
10095
11136
  ```{python}
10096
11137
  import pointblank as pb
10097
11138
 
10098
- small_table = pb.load_dataset("small_table")
10099
-
10100
- pb.preview(small_table)
10101
- ```
10102
-
10103
- Let's validate that the number of rows in the table matches a fixed value. In this case, we
10104
- will use the value `13` as the expected row count.
10105
-
10106
- ```{python}
10107
- validation = (
10108
- pb.Validate(data=small_table)
10109
- .row_count_match(count=13)
10110
- .interrogate()
10111
- )
11139
+ game_revenue = pb.load_dataset("game_revenue")
10112
11140
 
10113
- validation
11141
+ pb.preview(game_revenue)
10114
11142
  ```
10115
11143
 
10116
- The validation table shows that the expectation value of `13` matches the actual count of
10117
- rows in the target table. So, the single test unit passed.
10118
-
10119
-
10120
- Let's modify our example to show the different ways we can allow some tolerance to our validation
10121
- by using the `tol` argument.
11144
+ Let's validate that the number of columns in the table matches a fixed value. In this case,
11145
+ we will use the value `11` as the expected column count.
10122
11146
 
10123
11147
  ```{python}
10124
- smaller_small_table = small_table.sample(n = 12) # within the lower bound
10125
- validation = (
10126
- pb.Validate(data=smaller_small_table)
10127
- .row_count_match(count=13,tol=(2, 0)) # minus 2 but plus 0, ie. 11-13
10128
- .interrogate()
10129
- )
10130
-
10131
- validation
10132
-
10133
- validation = (
10134
- pb.Validate(data=smaller_small_table)
10135
- .row_count_match(count=13,tol=.05) # .05% tolerance of 13
10136
- .interrogate()
10137
- )
10138
-
10139
- even_smaller_table = small_table.sample(n = 2)
10140
11148
  validation = (
10141
- pb.Validate(data=even_smaller_table)
10142
- .row_count_match(count=13,tol=5) # plus or minus 5; this test will fail
11149
+ pb.Validate(data=game_revenue)
11150
+ .col_count_match(count=11)
10143
11151
  .interrogate()
10144
11152
  )
10145
11153
 
10146
11154
  validation
10147
11155
  ```
10148
11156
 
11157
+ The validation table shows that the expectation value of `11` matches the actual count of
11158
+ columns in the target table. So, the single test unit passed.
10149
11159
  """
10150
11160
 
10151
11161
  assertion_type = _get_fn_name()
@@ -10160,16 +11170,13 @@ class Validate:
10160
11170
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10161
11171
  )
10162
11172
 
10163
- # If `count` is a DataFrame or table then use the row count of the DataFrame as
11173
+ # If `count` is a DataFrame or table then use the column count of the DataFrame as
10164
11174
  # the expected count
10165
11175
  if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
10166
- count = get_row_count(count)
10167
-
10168
- # Check the integrity of tolerance
10169
- bounds: AbsoluteBounds = _derive_bounds(ref=int(count), tol=tol)
11176
+ count = get_column_count(count)
10170
11177
 
10171
11178
  # Package up the `count=` and boolean params into a dictionary for later interrogation
10172
- values = {"count": count, "inverse": inverse, "abs_tol_bounds": bounds}
11179
+ values = {"count": count, "inverse": inverse}
10173
11180
 
10174
11181
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
10175
11182
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
@@ -10188,10 +11195,9 @@ class Validate:
10188
11195
 
10189
11196
  return self
10190
11197
 
10191
- def col_count_match(
11198
+ def tbl_match(
10192
11199
  self,
10193
- count: int | FrameT | Any,
10194
- inverse: bool = False,
11200
+ tbl_compare: FrameT | Any,
10195
11201
  pre: Callable | None = None,
10196
11202
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
10197
11203
  actions: Actions | None = None,
@@ -10199,25 +11205,29 @@ class Validate:
10199
11205
  active: bool = True,
10200
11206
  ) -> Validate:
10201
11207
  """
10202
- Validate whether the column count of the table matches a specified count.
11208
+ Validate whether the target table matches a comparison table.
10203
11209
 
10204
- The `col_count_match()` method checks whether the column count of the target table matches a
10205
- specified count. This validation will operate over a single test unit, which is whether the
10206
- column count matches the specified count.
11210
+ The `tbl_match()` method checks whether the target table's composition matches that of a
11211
+ comparison table. The validation performs a comprehensive comparison using progressively
11212
+ stricter checks (from least to most stringent):
10207
11213
 
10208
- We also have the option to invert the validation step by setting `inverse=True`. This will
10209
- make the expectation that column row count of the target table *does not* match the
10210
- specified count.
11214
+ 1. **Column count match**: both tables must have the same number of columns
11215
+ 2. **Row count match**: both tables must have the same number of rows
11216
+ 3. **Schema match (loose)**: column names and dtypes must match (case-insensitive, any order)
11217
+ 4. **Schema match (order)**: columns must be in the correct order (case-insensitive names)
11218
+ 5. **Schema match (exact)**: column names must match exactly (case-sensitive, correct order)
11219
+ 6. **Data match**: values in corresponding cells must be identical
11220
+
11221
+ This progressive approach helps identify exactly where tables differ. The validation will
11222
+ fail at the first check that doesn't pass, making it easier to diagnose mismatches. This
11223
+ validation operates over a single test unit (pass/fail for complete table match).
10211
11224
 
10212
11225
  Parameters
10213
11226
  ----------
10214
- count
10215
- The expected column count of the table. This can be an integer value, a Polars or Pandas
10216
- DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
10217
- count of that object will be used as the expected count.
10218
- inverse
10219
- Should the validation step be inverted? If `True`, then the expectation is that the
10220
- column count of the target table should not match the specified `count=` value.
11227
+ tbl_compare
11228
+ The comparison table to validate against. This can be a DataFrame object (Polars or
11229
+ Pandas), an Ibis table object, or a callable that returns a table. If a callable is
11230
+ provided, it will be executed during interrogation to obtain the comparison table.
10221
11231
  pre
10222
11232
  An optional preprocessing function or lambda to apply to the data table during
10223
11233
  interrogation. This function should take a table as input and return a modified table.
@@ -10258,9 +11268,10 @@ class Validate:
10258
11268
 
10259
11269
  The preprocessing function can be any callable that takes a table as input and returns a
10260
11270
  modified table. For example, you could use a lambda function to filter the table based on
10261
- certain criteria or to apply a transformation to the data. Regarding the lifetime of the
10262
- transformed table, it only exists during the validation step and is not stored in the
10263
- `Validate` object or used in subsequent validation steps.
11271
+ certain criteria or to apply a transformation to the data. Note that the same preprocessing
11272
+ is **not** applied to the comparison table; only the target table is preprocessed. Regarding
11273
+ the lifetime of the transformed table, it only exists during the validation step and is not
11274
+ stored in the `Validate` object or used in subsequent validation steps.
10264
11275
 
10265
11276
  Thresholds
10266
11277
  ----------
@@ -10290,6 +11301,66 @@ class Validate:
10290
11301
  Aside from reporting failure conditions, thresholds can be used to determine the actions to
10291
11302
  take for each level of failure (using the `actions=` parameter).
10292
11303
 
11304
+ Cross-Backend Validation
11305
+ ------------------------
11306
+ The `tbl_match()` method supports **automatic backend coercion** when comparing tables from
11307
+ different backends (e.g., comparing a Polars DataFrame against a Pandas DataFrame, or
11308
+ comparing database tables from DuckDB/SQLite against in-memory DataFrames). When tables with
11309
+ different backends are detected, the comparison table is automatically converted to match the
11310
+ data table's backend before validation proceeds.
11311
+
11312
+ **Certified Backend Combinations:**
11313
+
11314
+ All combinations of the following backends have been tested and certified to work (in both
11315
+ directions):
11316
+
11317
+ - Pandas DataFrame
11318
+ - Polars DataFrame
11319
+ - DuckDB (native)
11320
+ - DuckDB (as Ibis table)
11321
+ - SQLite (via Ibis)
11322
+
11323
+ Note that database backends (DuckDB, SQLite, PostgreSQL, MySQL, Snowflake, BigQuery) are
11324
+ automatically materialized during validation:
11325
+
11326
+ - if comparing **against Polars**: materialized to Polars
11327
+ - if comparing **against Pandas**: materialized to Pandas
11328
+ - if **both tables are database backends**: both materialized to Polars
11329
+
11330
+ This ensures optimal performance and type consistency.
11331
+
11332
+ **Data Types That Work Best in Cross-Backend Validation:**
11333
+
11334
+ - numeric types: int, float columns (including proper NaN handling)
11335
+ - string types: text columns with consistent encodings
11336
+ - boolean types: True/False values
11337
+ - null values: `None` and `NaN` are treated as equivalent across backends
11338
+ - list columns: nested list structures (with basic types)
11339
+
11340
+ **Known Limitations:**
11341
+
11342
+ While many data types work well in cross-backend validation, there are some known
11343
+ limitations to be aware of:
11344
+
11345
+ - date/datetime types: When converting between Polars and Pandas, date objects may be
11346
+ represented differently. For example, `datetime.date` objects in Pandas may become
11347
+ `pd.Timestamp` objects when converted from Polars, leading to false mismatches. To work
11348
+ around this, ensure both tables use the same datetime representation before comparison.
11349
+ - custom types: User-defined types or complex nested structures may not convert cleanly
11350
+ between backends and could cause unexpected comparison failures.
11351
+ - categorical types: Categorical/factor columns may have different internal
11352
+ representations across backends.
11353
+ - timezone-aware datetimes: Timezone handling differs between backends and may cause
11354
+ comparison issues.
11355
+
11356
+ Here are some ideas to overcome such limitations:
11357
+
11358
+ - for date/datetime columns, consider using `pre=` preprocessing to normalize representations
11359
+ before comparison.
11360
+ - when working with custom types, manually convert tables to the same backend before using
11361
+ `tbl_match()`.
11362
+ - use the same datetime precision (e.g., milliseconds vs microseconds) in both tables.
11363
+
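+ As a minimal sketch of a cross-backend comparison (assuming both Polars and Pandas are
+ installed), the Pandas comparison table below is coerced to Polars before the checks run:
+
+ ```python
+ import pandas as pd
+ import polars as pl
+ import pointblank as pb
+
+ pl_tbl = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+ pd_tbl = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+
+ # The backends differ, so the comparison table is converted automatically
+ validation = (
+     pb.Validate(data=pl_tbl)
+     .tbl_match(tbl_compare=pd_tbl)
+     .interrogate()
+ )
+ ```
+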
10293
11364
  Examples
10294
11365
  --------
10295
11366
  ```{python}
@@ -10299,32 +11370,67 @@ class Validate:
10299
11370
  pb.config(report_incl_header=False, report_incl_footer=False)
10300
11371
  ```
10301
11372
 
10302
- For the examples here, we'll use the built in dataset `"game_revenue"`. The table can be
10303
- obtained by calling `load_dataset("game_revenue")`.
11373
+ For the examples here, we'll create two simple tables to demonstrate the `tbl_match()`
11374
+ validation.
10304
11375
 
10305
11376
  ```{python}
10306
11377
  import pointblank as pb
11378
+ import polars as pl
10307
11379
 
10308
- game_revenue = pb.load_dataset("game_revenue")
11380
+ # Create the first table
11381
+ tbl_1 = pl.DataFrame({
11382
+ "a": [1, 2, 3, 4],
11383
+ "b": ["w", "x", "y", "z"],
11384
+ "c": [4.0, 5.0, 6.0, 7.0]
11385
+ })
10309
11386
 
10310
- pb.preview(game_revenue)
11387
+ # Create an identical table
11388
+ tbl_2 = pl.DataFrame({
11389
+ "a": [1, 2, 3, 4],
11390
+ "b": ["w", "x", "y", "z"],
11391
+ "c": [4.0, 5.0, 6.0, 7.0]
11392
+ })
11393
+
11394
+ pb.preview(tbl_1)
10311
11395
  ```
10312
11396
 
10313
- Let's validate that the number of columns in the table matches a fixed value. In this case,
10314
- we will use the value `11` as the expected column count.
11397
+ Let's validate that `tbl_1` matches `tbl_2`. Since these tables are identical, the
11398
+ validation should pass.
10315
11399
 
10316
11400
  ```{python}
10317
11401
  validation = (
10318
- pb.Validate(data=game_revenue)
10319
- .col_count_match(count=11)
11402
+ pb.Validate(data=tbl_1)
11403
+ .tbl_match(tbl_compare=tbl_2)
10320
11404
  .interrogate()
10321
11405
  )
10322
11406
 
10323
11407
  validation
10324
11408
  ```
10325
11409
 
10326
- The validation table shows that the expectation value of `11` matches the actual count of
10327
- columns in the target table. So, the single test unit passed.
11410
+ The validation table shows that the single test unit passed, indicating that the two tables
11411
+ match completely.
11412
+
11413
+ Now, let's create a table with a slight difference and see what happens.
11414
+
11415
+ ```{python}
11416
+ # Create a table with one different value
11417
+ tbl_3 = pl.DataFrame({
11418
+ "a": [1, 2, 3, 4],
11419
+ "b": ["w", "x", "y", "z"],
11420
+ "c": [4.0, 5.5, 6.0, 7.0] # Changed 5.0 to 5.5
11421
+ })
11422
+
11423
+ validation = (
11424
+ pb.Validate(data=tbl_1)
11425
+ .tbl_match(tbl_compare=tbl_3)
11426
+ .interrogate()
11427
+ )
11428
+
11429
+ validation
11430
+ ```
11431
+
11432
+ The validation table shows that the single test unit failed because the tables don't match
11433
+ (one value is different in column `c`).
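+
+ As one more sketch (leaning on the progressive checks described above), a column name that
+ differs only by case leaves the data identical but fails the case-sensitive schema check:
+
+ ```python
+ tbl_4 = tbl_2.rename({"a": "A"})
+
+ validation = (
+     pb.Validate(data=tbl_1)
+     .tbl_match(tbl_compare=tbl_4)
+     .interrogate()
+ )
+ ```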
10328
11434
  """
10329
11435
 
10330
11436
  assertion_type = _get_fn_name()
@@ -10332,20 +11438,14 @@ class Validate:
10332
11438
  _check_pre(pre=pre)
10333
11439
  _check_thresholds(thresholds=thresholds)
10334
11440
  _check_boolean_input(param=active, param_name="active")
10335
- _check_boolean_input(param=inverse, param_name="inverse")
10336
11441
 
10337
11442
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10338
11443
  thresholds = (
10339
11444
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10340
11445
  )
10341
11446
 
10342
- # If `count` is a DataFrame or table then use the column count of the DataFrame as
10343
- # the expected count
10344
- if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
10345
- count = get_column_count(count)
10346
-
10347
- # Package up the `count=` and boolean params into a dictionary for later interrogation
10348
- values = {"count": count, "inverse": inverse}
11447
+ # Package up the `tbl_compare` into a dictionary for later interrogation
11448
+ values = {"tbl_compare": tbl_compare}
10349
11449
 
10350
11450
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
10351
11451
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
@@ -11275,11 +12375,14 @@ class Validate:
11275
12375
  "col_vals_le",
11276
12376
  "col_vals_null",
11277
12377
  "col_vals_not_null",
12378
+ "col_vals_increasing",
12379
+ "col_vals_decreasing",
11278
12380
  "col_vals_between",
11279
12381
  "col_vals_outside",
11280
12382
  "col_vals_in_set",
11281
12383
  "col_vals_not_in_set",
11282
12384
  "col_vals_regex",
12385
+ "col_vals_within_spec",
11283
12386
  ]:
11284
12387
  # Process table for column validation
11285
12388
  tbl = _column_test_prep(
@@ -11315,6 +12418,36 @@ class Validate:
11315
12418
  elif assertion_method == "not_null":
11316
12419
  results_tbl = interrogate_not_null(tbl=tbl, column=column)
11317
12420
 
12421
+ elif assertion_type == "col_vals_increasing":
12422
+ from pointblank._interrogation import interrogate_increasing
12423
+
12424
+ # Extract direction options from val_info
12425
+ allow_stationary = validation.val_info.get("allow_stationary", False)
12426
+ decreasing_tol = validation.val_info.get("decreasing_tol", 0.0)
12427
+
12428
+ results_tbl = interrogate_increasing(
12429
+ tbl=tbl,
12430
+ column=column,
12431
+ allow_stationary=allow_stationary,
12432
+ decreasing_tol=decreasing_tol,
12433
+ na_pass=na_pass,
12434
+ )
12435
+
12436
+ elif assertion_type == "col_vals_decreasing":
12437
+ from pointblank._interrogation import interrogate_decreasing
12438
+
12439
+ # Extract direction options from val_info
12440
+ allow_stationary = validation.val_info.get("allow_stationary", False)
12441
+ increasing_tol = validation.val_info.get("increasing_tol", 0.0)
12442
+
12443
+ results_tbl = interrogate_decreasing(
12444
+ tbl=tbl,
12445
+ column=column,
12446
+ allow_stationary=allow_stationary,
12447
+ increasing_tol=increasing_tol,
12448
+ na_pass=na_pass,
12449
+ )
12450
+
11318
12451
  elif assertion_type == "col_vals_between":
11319
12452
  results_tbl = interrogate_between(
11320
12453
  tbl=tbl,
@@ -11348,6 +12481,13 @@ class Validate:
11348
12481
  tbl=tbl, column=column, values=value, na_pass=na_pass
11349
12482
  )
11350
12483
 
12484
+ elif assertion_type == "col_vals_within_spec":
12485
+ from pointblank._interrogation import interrogate_within_spec
12486
+
12487
+ results_tbl = interrogate_within_spec(
12488
+ tbl=tbl, column=column, values=value, na_pass=na_pass
12489
+ )
12490
+
11351
12491
  elif assertion_type == "col_vals_expr":
11352
12492
  results_tbl = col_vals_expr(
11353
12493
  data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
@@ -11441,6 +12581,25 @@ class Validate:
11441
12581
 
11442
12582
  results_tbl = None
11443
12583
 
12584
+ elif assertion_type == "tbl_match":
12585
+ from pointblank._interrogation import tbl_match
12586
+
12587
+ # Get the comparison table (could be callable or actual table)
12588
+ tbl_compare = value["tbl_compare"]
12589
+
12590
+ # If tbl_compare is callable, execute it to get the table
12591
+ if callable(tbl_compare):
12592
+ tbl_compare = tbl_compare()
12593
+
12594
+ result_bool = tbl_match(data_tbl=data_tbl_step, tbl_compare=tbl_compare)
12595
+
12596
+ validation.all_passed = result_bool
12597
+ validation.n = 1
12598
+ validation.n_passed = int(result_bool)
12599
+ validation.n_failed = 1 - result_bool
12600
+
12601
+ results_tbl = None
12602
+
11444
12603
  elif assertion_type == "conjointly":
11445
12604
  results_tbl = conjointly_validation(
11446
12605
  data_tbl=data_tbl_step,
@@ -13501,6 +14660,151 @@ class Validate:
13501
14660
 
13502
14661
  return sundered_tbl
13503
14662
 
14663
+ def get_notes(
14664
+ self, i: int, format: str = "dict"
14665
+ ) -> dict[str, dict[str, str]] | list[str] | None:
14666
+ """
14667
+ Get notes from a validation step by its step number.
14668
+
14669
+ This is a convenience method that retrieves notes from a specific validation step using
14670
+ the step number (1-indexed). It provides easier access to step notes without having to
14671
+ navigate through the `validation_info` list.
14672
+
14673
+ Parameters
14674
+ ----------
14675
+ i
14676
+ The step number (1-indexed) to retrieve notes from. This corresponds to the step
14677
+ numbers shown in validation reports.
14678
+ format
14679
+ The format to return notes in:
14680
+ - `"dict"`: Returns the full notes dictionary (default)
14681
+ - `"markdown"`: Returns a list of markdown-formatted note values
14682
+ - `"text"`: Returns a list of plain text note values
14683
+ - `"keys"`: Returns a list of note keys
14684
+
14685
+ Returns
14686
+ -------
14687
+ dict, list, or None
14688
+ The notes in the requested format, or `None` if the step doesn't exist or has no notes.
14689
+
14690
+ Examples
14691
+ --------
14692
+ ```python
14693
+ import pointblank as pb
14694
+ import polars as pl
14695
+
14696
+ # Create validation with notes
14697
+ validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
14698
+ validation.col_vals_gt(columns="x", value=0)
14699
+
14700
+ # Add a note to step 1
14701
+ validation.validation_info[0]._add_note(
14702
+ key="info",
14703
+ markdown="This is a **test** note",
14704
+ text="This is a test note"
14705
+ )
14706
+
14707
+ # Interrogate
14708
+ validation.interrogate()
14709
+
14710
+ # Get notes from step 1 using the step number
14711
+ notes = validation.get_notes(1)
14712
+ # Returns: {'info': {'markdown': 'This is a **test** note', 'text': '...'}}
14713
+
14714
+ # Get just the markdown versions
14715
+ markdown_notes = validation.get_notes(1, format="markdown")
14716
+ # Returns: ['This is a **test** note']
14717
+
14718
+ # Get just the keys
14719
+ keys = validation.get_notes(1, format="keys")
14720
+ # Returns: ['info']
14721
+ ```
14722
+ """
14723
+ # Validate step number
14724
+ if not isinstance(i, int) or i < 1:
14725
+ raise ValueError(f"Step number must be a positive integer, got: {i}")
14726
+
14727
+ # Find the validation step with the matching step number
14728
+ # Note: validation_info may contain multiple steps after segmentation,
14729
+ # so we need to find the one with the matching `i` value
14730
+ for validation in self.validation_info:
14731
+ if validation.i == i:
14732
+ return validation._get_notes(format=format)
14733
+
14734
+ # Step not found
14735
+ return None
14736
+
14737
+ def get_note(self, i: int, key: str, format: str = "dict") -> dict[str, str] | str | None:
14738
+ """
14739
+ Get a specific note from a validation step by its step number and note key.
14740
+
14741
+ This method retrieves a specific note from a validation step using the step number
14742
+ (1-indexed) and the note key. It provides easier access to individual notes without having
14743
+ to navigate through the `validation_info` list or retrieve all notes.
14744
+
14745
+ Parameters
14746
+ ----------
14747
+ i
14748
+ The step number (1-indexed) to retrieve the note from. This corresponds to the step
14749
+ numbers shown in validation reports.
14750
+ key
14751
+ The key of the note to retrieve.
14752
+ format
14753
+ The format to return the note in:
14754
+ - `"dict"`: Returns the note as a dictionary with 'markdown' and 'text' keys (default)
14755
+ - `"markdown"`: Returns just the markdown-formatted note value
14756
+ - `"text"`: Returns just the plain text note value
14757
+
14758
+ Returns
14759
+ -------
14760
+ dict, str, or None
14761
+ The note in the requested format, or `None` if the step or note doesn't exist.
14762
+
14763
+ Examples
14764
+ --------
14765
+ ```python
14766
+ import pointblank as pb
14767
+ import polars as pl
14768
+
14769
+ # Create validation with notes
14770
+ validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
14771
+ validation.col_vals_gt(columns="x", value=0)
14772
+
14773
+ # Add a note to step 1
14774
+ validation.validation_info[0]._add_note(
14775
+ key="threshold_info",
14776
+ markdown="Using **default** thresholds",
14777
+ text="Using default thresholds"
14778
+ )
14779
+
14780
+ # Interrogate
14781
+ validation.interrogate()
14782
+
14783
+ # Get a specific note from step 1 using step number and key
14784
+ note = validation.get_note(1, "threshold_info")
14785
+ # Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
14786
+
14787
+ # Get just the markdown version
14788
+ markdown = validation.get_note(1, "threshold_info", format="markdown")
14789
+ # Returns: 'Using **default** thresholds'
14790
+
14791
+ # Get just the text version
14792
+ text = validation.get_note(1, "threshold_info", format="text")
14793
+ # Returns: 'Using default thresholds'
14794
+ ```
14795
+ """
14796
+ # Validate step number
14797
+ if not isinstance(i, int) or i < 1:
14798
+ raise ValueError(f"Step number must be a positive integer, got: {i}")
14799
+
14800
+ # Find the validation step with the matching step number
14801
+ for validation in self.validation_info:
14802
+ if validation.i == i:
14803
+ return validation._get_note(key=key, format=format)
14804
+
14805
+ # Step not found
14806
+ return None
14807
+
13504
14808
  def get_tabular_report(
13505
14809
  self, title: str | None = ":default:", incl_header: bool = None, incl_footer: bool = None
13506
14810
  ) -> GT:
@@ -13907,6 +15211,9 @@ class Validate:
13907
15211
  elif assertion_type[i] in ["col_vals_expr", "conjointly"]:
13908
15212
  values_upd.append("COLUMN EXPR")
13909
15213
 
15214
+ elif assertion_type[i] in ["col_vals_increasing", "col_vals_decreasing"]:
15215
+ values_upd.append("")
15216
+
13910
15217
  elif assertion_type[i] in ["row_count_match", "col_count_match"]:
13911
15218
  count = values[i]["count"]
13912
15219
  inverse = values[i]["inverse"]
@@ -13916,6 +15223,9 @@ class Validate:
13916
15223
 
13917
15224
  values_upd.append(str(count))
13918
15225
 
15226
+ elif assertion_type[i] in ["tbl_match"]:
15227
+ values_upd.append("EXTERNAL TABLE")
15228
+
13919
15229
  elif assertion_type[i] in ["specially"]:
13920
15230
  values_upd.append("EXPR")
13921
15231
 
@@ -13924,6 +15234,11 @@ class Validate:
13924
15234
 
13925
15235
  values_upd.append(str(pattern))
13926
15236
 
15237
+ elif assertion_type[i] in ["col_vals_within_spec"]:
15238
+ spec = value["spec"]
15239
+
15240
+ values_upd.append(str(spec))
15241
+
13927
15242
  elif assertion_type[i] in ["prompt"]: # pragma: no cover
13928
15243
  # For AI validation, show only the prompt, not the full config
13929
15244
  if isinstance(value, dict) and "prompt" in value: # pragma: no cover
@@ -14180,6 +15495,7 @@ class Validate:
14180
15495
  validation_info_dict.pop("label")
14181
15496
  validation_info_dict.pop("active")
14182
15497
  validation_info_dict.pop("all_passed")
15498
+ validation_info_dict.pop("notes")
14183
15499
 
14184
15500
  # If no interrogation performed, populate the `i` entry with a sequence of integers
14185
15501
  # from `1` to the number of validation steps
@@ -14364,8 +15680,14 @@ class Validate:
14364
15680
  gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
14365
15681
 
14366
15682
  if incl_footer:
15683
+ # Add table time as HTML source note
14367
15684
  gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
14368
15685
 
15686
+ # Create notes markdown from validation steps and add as separate source note
15687
+ notes_markdown = _create_notes_html(self.validation_info)
15688
+ if notes_markdown:
15689
+ gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
15690
+
14369
15691
  # If the interrogation has not been performed, then style the table columns dealing with
14370
15692
  # interrogation data as grayed out
14371
15693
  if not interrogation_performed:
@@ -16064,6 +17386,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
16064
17386
  "critical",
16065
17387
  "extract",
16066
17388
  "proc_duration_s",
17389
+ "notes",
16067
17390
  ]
16068
17391
 
16069
17392
  # Filter the validation information to include only the selected fields
@@ -16407,6 +17730,14 @@ def _transform_assertion_str(
16407
17730
  # Use Markdown-to-HTML conversion to format the `brief_str` text
16408
17731
  brief_str = [commonmark.commonmark(x) for x in brief_str]
16409
17732
 
17733
+ # Add inline styles to <p> tags for proper rendering in all environments
17734
+ # In some sandboxed HTML environments (e.g., Streamlit), <p> tags don't inherit
17735
+ # font-size from parent divs, so we add inline styles directly to the <p> tags
17736
+ brief_str = [
17737
+ re.sub(r"<p>", r'<p style="font-size: inherit; margin: 0;">', x) if x.strip() else x
17738
+ for x in brief_str
17739
+ ]
17740
+
16410
17741
  # Obtain the number of characters contained in the assertion
16411
17742
  # string; this is important for sizing components appropriately
16412
17743
  assertion_type_nchar = [len(x) for x in assertion_str]
@@ -16535,6 +17866,86 @@ def _create_table_time_html(
16535
17866
  )
16536
17867
 
16537
17868
 
17869
+ def _create_notes_html(validation_info: list) -> str:
17870
+ """
17871
+ Create markdown text for validation notes/footnotes.
17872
+
17873
+ This function collects notes from all validation steps and formats them as footnotes
17874
+ for display in the report footer. Each note is prefixed with the step number in
17875
+ uppercase small caps bold formatting, and the note content is rendered as markdown.
17876
+
17877
+ Parameters
17878
+ ----------
17879
+ validation_info
17880
+ List of _ValidationInfo objects from which to extract notes.
17881
+
17882
+ Returns
17883
+ -------
17884
+ str
17885
+ Markdown string containing formatted footnotes, or empty string if no notes exist.
17886
+ """
17887
+ # Collect all notes from validation steps
17888
+ all_notes = []
17889
+ for step in validation_info:
17890
+ if step.notes:
17891
+ for key, content in step.notes.items():
17892
+ # Store note with step number for context
17893
+ all_notes.append(
17894
+ {
17895
+ "step": step.i,
17896
+ "key": key,
17897
+ "markdown": content["markdown"],
17898
+ "text": content["text"],
17899
+ }
17900
+ )
17901
+
17902
+ # If no notes, return empty string
17903
+ if not all_notes:
17904
+ return ""
17905
+
17906
+ # Build markdown for notes section
17907
+ # Start with a styled horizontal rule and bold "Notes" header
17908
+ notes_parts = [
17909
+ (
17910
+ "<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
17911
+ "border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
17912
+ ),
17913
+ "<strong>Notes</strong>",
17914
+ "",
17915
+ ]
17916
+
17917
+ previous_step = None
17918
+ for note in all_notes:
17919
+ # Determine if this is the first note for this step
17920
+ is_first_for_step = note["step"] != previous_step
17921
+ previous_step = note["step"]
17922
+
17923
+ # Format step label with HTML for uppercase small caps bold
17924
+ # Use lighter color for subsequent notes of the same step
17925
+ step_color = "#333333" if is_first_for_step else "#999999"
17926
+ step_label = (
17927
+ f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
17928
+ f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
17929
+ )
17930
+
17931
+ # Format note key in monospaced font with smaller size
17932
+ note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"
17933
+
17934
+ # Combine step label, note key, and markdown content
17935
+ note_text = f"{step_label} {note_key} {note['markdown']}"
17936
+ notes_parts.append(note_text)
17937
+ notes_parts.append("") # Add blank line between notes
17938
+
17939
+ # Remove trailing blank line
17940
+ if notes_parts[-1] == "":
17941
+ notes_parts.pop()
17942
+
17943
+ # Join with newlines to create markdown text
17944
+ notes_markdown = "\n".join(notes_parts)
17945
+
17946
+ return notes_markdown
17947
+
17948
+
16538
17949
  def _create_label_html(label: str | None, start_time: str) -> str:
16539
17950
  if label is None:
16540
17951
  # Remove the decimal and everything beyond that