pointblank 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -56,7 +56,9 @@ from pointblank._interrogation import (
     ConjointlyValidation,
     NumberOfTestUnits,
     RowCountMatch,
+    RowsComplete,
     RowsDistinct,
+    SpeciallyValidation,
 )
 from pointblank._typing import SegmentSpec
 from pointblank._utils import (
@@ -6546,6 +6548,243 @@ class Validate:

         return self

+    def rows_complete(
+        self,
+        columns_subset: str | list[str] | None = None,
+        pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether row data are complete by having no missing values.
+
+        The `rows_complete()` method checks whether rows in the table are complete. Completeness
+        of a row means that there are no missing values within the row. This validation operates
+        over a number of test units equal to the number of rows in the table (determined after
+        any `pre=` mutation has been applied). A subset of columns can be specified for the
+        completeness check. If no subset is provided, all columns in the table will be used.
+
+        Parameters
+        ----------
+        columns_subset
+            A single column or a list of columns to use as a subset for the completeness check. If
+            `None` (the default), then all columns in the table will be used.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedances of the levels.
+            The thresholds are set at the step level and will override any global thresholds set
+            in `Validate(thresholds=...)`. The default is `None`, which means that no thresholds
+            will be set locally and global thresholds (if any) will take effect. Look at the
+            *Thresholds* section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use templating elements like `"{step}"` to insert the step
+            number, or `"{auto}"` to include an automatically generated brief. If `True`, the
+            entire brief will be automatically generated. If `None` (the default) there won't be
+            a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping
+            indexes for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the
+        data table during interrogation. This function should take a table as input and return a
+        modified table. This is useful for performing any necessary transformations or filtering
+        on the data before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Note that you can refer to
+        columns via `columns_subset=` that are expected to be present in the transformed table,
+        but may not exist in the table before preprocessing. Regarding the lifetime of the
+        transformed table, it only exists during the validation step and is not stored in the
+        `Validate` object or used in subsequent validation steps.
+
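+        As a minimal sketch (the table and lambda here are illustrative, not part of the API), a
+        filtering step supplied through `pre=` might look like this:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl_pre = pl.DataFrame(
+            {
+                "group": ["x", "x", "y", "y"],
+                "value": [1, None, 3, 4],
+            }
+        )
+
+        (
+            pb.Validate(data=tbl_pre)
+            # Only rows where `group == "y"` are checked for completeness
+            .rows_complete(pre=lambda df: df.filter(pl.col("group") == "y"))
+            .interrogate()
+        )
+        ```
+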
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of
+        the data. The segmentation can be done based on a single column or specific fields within
+        a column.
+
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to
+        each region.
+
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be
+        disregarded (i.e., no validation steps will be created for them).
+
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are all valid:
+
+        - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique
+          values in the `"region"` column and specific dates in the `"date"` column
+        - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
+          columns
+
+        The segmentation is performed during interrogation, and the resulting validation steps
+        will be numbered sequentially. Each segment will have its own validation step, and the
+        results will be reported separately. This allows for a more granular analysis of the data
+        and helps identify issues within specific segments.
+
+        Importantly, the segmentation process will be performed after any preprocessing of the
+        data table. Because of this, one can conceivably use the `pre=` argument to generate a
+        column that can be used for segmentation. For example, you could create a new column
+        called `"segment"` through use of `pre=` and then use that column for segmentation.
+
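+        As a minimal sketch (with an illustrative `"region"` column), segmenting a completeness
+        check by a single column might look like this:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl_seg = pl.DataFrame(
+            {
+                "region": ["North", "North", "South", "South"],
+                "value": [1, None, 3, 4],
+            }
+        )
+
+        (
+            pb.Validate(data=tbl_seg)
+            .rows_complete(segments="region")  # one step per unique region
+            .interrogate()
+        )
+        ```
+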
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the
+        validation step. If they are set here at the step level, these thresholds will override
+        any thresholds set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold
+        values can either be set as a proportion of failing test units (a value between `0` and
+        `1`) or as the absolute number of failing test units (an integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+           thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1`
+           is the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+           'critical'
+        4. a single integer/float value denoting the absolute number or fraction of failing test
+           units for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. Not all threshold levels need to be set;
+        you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions
+        to take for each level of failure (using the `actions=` parameter).
+
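+        As a minimal sketch (with an illustrative table and threshold values), the tuple scheme
+        might be used like this:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl_thr = pl.DataFrame({"a": [1, None, 3, 4], "b": [1, 2, None, 4]})
+
+        (
+            pb.Validate(data=tbl_thr)
+            .rows_complete(thresholds=(1, 2))  # 'warning' at 1 failing row, 'error' at 2
+            .interrogate()
+        )
+        ```
+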
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        For the examples here, we'll use a simple Polars DataFrame with three string columns
+        (`col_1`, `col_2`, and `col_3`). The table is shown below:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl = pl.DataFrame(
+            {
+                "col_1": ["a", None, "c", "d"],
+                "col_2": ["a", "a", "c", None],
+                "col_3": ["a", "a", "d", None],
+            }
+        )
+
+        pb.preview(tbl)
+        ```
+
+        Let's validate that the rows in the table are complete with `rows_complete()`. We'll
+        determine if this validation had any failing test units (there are four test units, one
+        for each row). A failing test unit means that a given row is not complete (i.e., has at
+        least one missing value).
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .rows_complete()
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        From this validation table we see that there are two failing test units. This is because
+        two rows in the table have at least one missing value (the second row and the last row).
+
+        We can also use a subset of columns to determine completeness. Let's specify the subset
+        using columns `col_2` and `col_3` for the next validation.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .rows_complete(columns_subset=["col_2", "col_3"])
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table reports a single failing test unit: the last row contains missing
+        values in both the `col_2` and `col_3` columns.
+        """
+
+        assertion_type = _get_fn_name()
+
+        _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        if columns_subset is not None and isinstance(columns_subset, str):
+            columns_subset = [columns_subset]
+
+        # TODO: incorporate Column object
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            column=columns_subset,
+            pre=pre,
+            segments=segments,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+
     def col_schema_match(
         self,
         schema: Schema,
@@ -7395,7 +7634,7 @@ class Validate:

         val_info = _ValidationInfo(
             assertion_type=assertion_type,
-            column=None,  # This is a rowwise validation, not specific to any column
+            column=None,  # This validation is not specific to any column(s)
             values=values,
             pre=pre,
             thresholds=thresholds,
@@ -7408,6 +7647,351 @@ class Validate:

         return self

+    def specially(
+        self,
+        expr: Callable,
+        pre: Callable | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Perform a specialized validation with customized logic.
+
+        The `specially()` validation method allows for the creation of specialized validation
+        expressions that can be used to validate specific conditions or logic in the data. This
+        method provides maximum flexibility by accepting a custom callable that encapsulates
+        your validation logic.
+
+        The callable function can have one of two signatures:
+
+        - a function accepting a single parameter (the data table): `def validate(data): ...`
+        - a function with no parameters: `def validate(): ...`
+
+        The second form is particularly useful for environment validations that don't need to
+        inspect the data table.
+
+        The callable function must ultimately return one of:
+
+        1. a single boolean value or boolean list
+        2. a table where the final column contains boolean values (column name is unimportant)
+
+        The validation will operate over a number of test units equal to the number of rows in
+        the data table (if returning a table with boolean values). If returning a scalar boolean
+        value, the validation will operate over a single test unit. For a return of a list of
+        boolean values, the length of the list constitutes the number of test units.
+
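+        As a minimal sketch (the function and column names here are illustrative), the accepted
+        return forms look like this:
+
+        ```{python}
+        import polars as pl
+
+        def check_scalar(data):
+            return data.height > 0  # single boolean: one test unit
+
+        def check_list(data):
+            return [v is not None for v in data["a"].to_list()]  # list: one unit per element
+
+        def check_table(data):
+            return data.select(pl.col("a").is_not_null())  # boolean column: one unit per row
+        ```
+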
+        Parameters
+        ----------
+        expr
+            A callable function that defines the specialized validation logic. This function
+            should: (1) accept the target data table as its single argument (though it may ignore
+            it), or (2) take no parameters at all (for environment validations). The function
+            must ultimately return boolean values representing validation results. Design your
+            function to incorporate any custom parameters directly within the function itself
+            using closure variables or default parameters.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedances of the levels.
+            The thresholds are set at the step level and will override any global thresholds set
+            in `Validate(thresholds=...)`. The default is `None`, which means that no thresholds
+            will be set locally and global thresholds (if any) will take effect. Look at the
+            *Thresholds* section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use templating elements like `"{step}"` to insert the step
+            number, or `"{auto}"` to include an automatically generated brief. If `True`, the
+            entire brief will be automatically generated. If `None` (the default) there won't be
+            a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping
+            indexes for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the
+        data table during interrogation. This function should take a table as input and return a
+        modified table. This is useful for performing any necessary transformations or filtering
+        on the data before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Regarding the lifetime of the
+        transformed table, it only exists during the validation step and is not stored in the
+        `Validate` object or used in subsequent validation steps.
+
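+        As a minimal sketch (the table and lambdas here are illustrative), combining `pre=` with
+        a custom expression might look like this:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl_pre = pl.DataFrame({"a": [5, -1, 7], "b": [1, 2, 3]})
+
+        (
+            pb.Validate(data=tbl_pre)
+            .specially(
+                expr=lambda data: data.select(pl.col("a") > 0),
+                pre=lambda df: df.filter(pl.col("b") < 3),  # validate only rows where b < 3
+            )
+            .interrogate()
+        )
+        ```
+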
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the
+        validation step. If they are set here at the step level, these thresholds will override
+        any thresholds set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold
+        values can either be set as a proportion of failing test units (a value between `0` and
+        `1`) or as the absolute number of failing test units (an integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+           thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1`
+           is the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+           'critical'
+        4. a single integer/float value denoting the absolute number or fraction of failing test
+           units for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. Not all threshold levels need to be set;
+        you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions
+        to take for each level of failure (using the `actions=` parameter).
+
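+        As a minimal sketch (the table and threshold values here are illustrative), the
+        dictionary scheme might be used like this:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl_thr = pl.DataFrame({"a": [1, -2, 3, -4]})
+
+        (
+            pb.Validate(data=tbl_thr)
+            .specially(
+                expr=lambda data: data.select(pl.col("a") > 0),
+                thresholds={"warning": 1, "error": 3},  # 2 failing units exceed 'warning' only
+            )
+            .interrogate()
+        )
+        ```
+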
7766
+ Examples
7767
+ --------
7768
+ ```{python}
7769
+ #| echo: false
7770
+ #| output: false
7771
+ import pointblank as pb
7772
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
7773
+ ```
7774
+ The `specially()` method offers maximum flexibility for validation, allowing you to create
7775
+ custom validation logic that fits your specific needs. The following examples demonstrate
7776
+ different patterns and use cases for this powerful validation approach.
7777
+
7778
+ ### Simple validation with direct table access
7779
+
7780
+ This example shows the most straightforward use case where we create a function that
7781
+ directly checks if the sum of two columns is positive.
7782
+
7783
+ ```{python}
7784
+ import pointblank as pb
7785
+ import polars as pl
7786
+
7787
+ simple_tbl = pl.DataFrame({
7788
+ "a": [5, 7, 1, 3, 9, 4],
7789
+ "b": [6, 3, 0, 5, 8, 2]
7790
+ })
7791
+
7792
+ # Simple function that validates directly on the table
7793
+ def validate_sum_positive(data):
7794
+ return data.select(pl.col("a") + pl.col("b") > 0)
7795
+
7796
+ (
7797
+ pb.Validate(data=simple_tbl)
7798
+ .specially(expr=validate_sum_positive)
7799
+ .interrogate()
7800
+ )
7801
+ ```
7802
+
7803
+ The function returns a Polars DataFrame with a single boolean column indicating whether
7804
+ the sum of columns `a` and `b` is positive for each row. Each row in the resulting DataFrame
7805
+ is a distinct test unit. This pattern works well for simple validations where you don't need
7806
+ configurable parameters.
7807
+
7808
+ ### Advanced validation with closure variables for parameters
7809
+
7810
+ When you need to make your validation configurable, you can use the function factory pattern
7811
+ (also known as closures) to create parameterized validations:
7812
+
7813
+ ```{python}
7814
+ # Create a parameterized validation function using closures
7815
+ def make_column_ratio_validator(col1, col2, min_ratio):
7816
+ def validate_column_ratio(data):
7817
+ return data.select((pl.col(col1) / pl.col(col2)) > min_ratio)
7818
+ return validate_column_ratio
7819
+
7820
+ (
7821
+ pb.Validate(data=simple_tbl)
7822
+ .specially(
7823
+ expr=make_column_ratio_validator(col1="a", col2="b", min_ratio=0.5)
7824
+ )
7825
+ .interrogate()
7826
+ )
7827
+ ```
7828
+
7829
+ This approach allows you to create reusable validation functions that can be configured with
7830
+ different parameters without modifying the function itself.
7831
+
7832
+ ### Validation function returning a list of booleans
7833
+
7834
+ This example demonstrates how to create a validation function that returns a list of boolean
7835
+ values, where each element represents a separate test unit:
7836
+
7837
+ ```{python}
7838
+ import pointblank as pb
7839
+ import polars as pl
7840
+ import random
7841
+
7842
+ # Create sample data
7843
+ transaction_tbl = pl.DataFrame({
7844
+ "transaction_id": [f"TX{i:04d}" for i in range(1, 11)],
7845
+ "amount": [120.50, 85.25, 50.00, 240.75, 35.20, 150.00, 85.25, 65.00, 210.75, 90.50],
7846
+ "category": ["food", "shopping", "entertainment", "travel", "utilities",
7847
+ "food", "shopping", "entertainment", "travel", "utilities"]
7848
+ })
7849
+
7850
+ # Define a validation function that returns a list of booleans
7851
+ def validate_transaction_rules(data):
7852
+ # Create a list to store individual test results
7853
+ test_results = []
7854
+
7855
+ # Check each row individually against multiple business rules
7856
+ for row in data.iter_rows(named=True):
7857
+ # Rule: transaction IDs must start with "TX" and be 6 chars long
7858
+ valid_id = row["transaction_id"].startswith("TX") and len(row["transaction_id"]) == 6
7859
+
7860
+ # Rule: Amounts must be appropriate for their category
7861
+ valid_amount = True
7862
+ if row["category"] == "food" and (row["amount"] < 10 or row["amount"] > 200):
7863
+ valid_amount = False
7864
+ elif row["category"] == "utilities" and (row["amount"] < 20 or row["amount"] > 300):
7865
+ valid_amount = False
7866
+ elif row["category"] == "entertainment" and row["amount"] > 100:
7867
+ valid_amount = False
7868
+
7869
+ # A transaction passes if it satisfies both rules
7870
+ test_results.append(valid_id and valid_amount)
7871
+
7872
+ return test_results
7873
+
7874
+ (
7875
+ pb.Validate(data=transaction_tbl)
7876
+ .specially(
7877
+ expr=validate_transaction_rules,
7878
+ brief="Validate transaction IDs and amounts by category."
7879
+ )
7880
+ .interrogate()
7881
+ )
7882
+ ```
7883
+
7884
+ This example shows how to create a validation function that applies multiple business rules
7885
+ to each row and returns a list of boolean results. Each boolean in the list represents a
7886
+ separate test unit, and a test unit passes only if all rules are satisfied for a given row.
7887
+
7888
+ The function iterates through each row in the data table, checking:
7889
+
7890
+ 1. if transaction IDs follow the required format
7891
+ 2. if transaction amounts are appropriate for their respective categories
7892
+
7893
+ This approach is powerful when you need to apply complex, conditional logic that can't be
7894
+ easily expressed using the built-in validation functions.
7895
+
7896
+ ### Table-level validation returning a single boolean
7897
+
7898
+ Sometimes you need to validate properties of the entire table rather than row-by-row. In
7899
+ these cases, your function can return a single boolean value:
7900
+
7901
+ ```{python}
7902
+ def validate_table_properties(data):
7903
+ # Check if table has at least one row with column 'a' > 10
7904
+ has_large_values = data.filter(pl.col("a") > 10).height > 0
7905
+
7906
+ # Check if mean of column 'b' is positive
7907
+ has_positive_mean = data.select(pl.mean("b")).item() > 0
7908
+
7909
+ # Return a single boolean for the entire table
7910
+ return has_large_values and has_positive_mean
7911
+
7912
+ (
7913
+ pb.Validate(data=simple_tbl)
7914
+ .specially(expr=validate_table_properties)
7915
+ .interrogate()
7916
+ )
7917
+ ```
7918
+
7919
+ This example demonstrates how to perform multiple checks on the table as a whole and combine
7920
+ them into a single validation result.
7921
+
7922
+ ### Environment validation that doesn't use the data table
7923
+
7924
+ The `specially()` validation method can even be used to validate aspects of your environment
7925
+ that are completely independent of the data:
7926
+
7927
+ ```{python}
7928
+ def validate_pointblank_version():
7929
+ try:
7930
+ import importlib.metadata
7931
+ version = importlib.metadata.version("pointblank")
7932
+ version_parts = version.split(".")
7933
+
7934
+ # Get major and minor components regardless of how many parts there are
7935
+ major = int(version_parts[0])
7936
+ minor = int(version_parts[1])
7937
+
7938
+ # Check both major and minor components for version `0.9+`
7939
+ return (major > 0) or (major == 0 and minor >= 9)
7940
+
7941
+ except Exception as e:
7942
+ # More specific error handling could be added here
7943
+ print(f"Version check failed: {e}")
7944
+ return False
7945
+
7946
+ (
7947
+ pb.Validate(data=simple_tbl)
7948
+ .specially(
7949
+ expr=validate_pointblank_version,
7950
+ brief="Check Pointblank version `>=0.9.0`."
7951
+ )
7952
+ .interrogate()
7953
+ )
7954
+ ```
7955
+
7956
+ This pattern shows how to validate external dependencies or environment conditions as part
7957
+ of your validation workflow. Notice that the function doesn't take any parameters at all,
7958
+ which makes it cleaner when the validation doesn't need to access the data table.
7959
+
7960
+ By combining these patterns, you can create sophisticated validation workflows that address
7961
+ virtually any data quality requirement in your organization.
7962
+ """
+
+        assertion_type = _get_fn_name()
+
+        # TODO: add a check for the expression to be a callable
+        # _check_expr_specially(expr=expr)
+        _check_pre(pre=pre)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            column=None,  # This validation is not specific to any column(s)
+            values=expr,
+            pre=pre,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+
     def interrogate(
         self,
         collect_extracts: bool = True,
@@ -7724,6 +8308,14 @@ class Validate:
                     tbl_type=tbl_type,
                 ).get_test_results()

+            if assertion_category == "ROWS_COMPLETE":
+                results_tbl = RowsComplete(
+                    data_tbl=data_tbl_step,
+                    columns_subset=column,
+                    threshold=threshold,
+                    tbl_type=tbl_type,
+                ).get_test_results()
+
             if assertion_category == "COL_EXISTS_HAS_TYPE":
                 result_bool = ColExistsHasType(
                     data_tbl=data_tbl_step,
@@ -7814,12 +8406,39 @@ class Validate:
                     tbl_type=tbl_type,
                 ).get_test_results()

-            if assertion_category not in [
-                "COL_EXISTS_HAS_TYPE",
-                "COL_SCHEMA_MATCH",
-                "ROW_COUNT_MATCH",
-                "COL_COUNT_MATCH",
-            ]:
+            if assertion_category == "SPECIALLY":
+                results_tbl_list = SpeciallyValidation(
+                    data_tbl=data_tbl_step,
+                    expression=value,
+                    threshold=threshold,
+                    tbl_type=tbl_type,
+                ).get_test_results()
+
+                #
+                # The result from this could either be a table in the conventional form or
+                # a list of boolean values; handle both cases
+                #
+
+                if isinstance(results_tbl_list, list):
+                    # If the result is a list of boolean values, then set the validation
+                    # results directly from the list
+                    validation.all_passed = all(results_tbl_list)
+                    validation.n = len(results_tbl_list)
+                    validation.n_passed = results_tbl_list.count(True)
+                    validation.n_failed = results_tbl_list.count(False)
+
+                    results_tbl = None
+
+                else:
+                    # If the result is not a list, then we assume it's a table in the
+                    # conventional form (where a column named `pb_is_good_` exists, with
+                    # boolean values)
+
+                    results_tbl = results_tbl_list
+
+            # If the results table is not `None`, then we assume there is a table with a column
+            # called `pb_is_good_` that contains boolean values; we can then use this table to
+            # determine the number of test units that passed and failed
+            if results_tbl is not None:
                 # Extract the `pb_is_good_` column from the table as a results list
                 if tbl_type in IBIS_BACKENDS:
                     # Select the DataFrame library to use for getting the results list
@@ -7994,7 +8613,8 @@ class Validate:
             # TODO: Add support for extraction of rows for Ibis backends
             if (
                 collect_extracts
-                and assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_distinct"]
+                and assertion_type
+                in ROW_BASED_VALIDATION_TYPES + ["rows_distinct", "rows_complete"]
                 and tbl_type not in IBIS_BACKENDS
             ):
                 # Add row numbers to the results table
@@ -9076,19 +9696,134 @@ class Validate:
         """
         Get a report of the validation results as a JSON-formatted string.

+        The `get_json_report()` method provides a machine-readable report of validation results
+        in JSON format. This is particularly useful for programmatic processing, storing
+        validation results, or integrating with other systems. The report includes detailed
+        information about each validation step, such as assertion type, columns validated,
+        threshold values, test results, and more.
+
+        By default, all available validation information fields are included in the report.
+        However, you can customize the fields to include or exclude using the `use_fields=` and
+        `exclude_fields=` parameters.
+
         Parameters
         ----------
         use_fields
-            A list of fields to include in the report. If `None`, all fields are included.
+            An optional list of specific fields to include in the report. If provided, only these
+            fields will be included in the JSON output. If `None` (the default), all standard
+            validation report fields are included. Have a look at the *Available Report Fields*
+            section below for a list of fields that can be included in the report.
         exclude_fields
-            A list of fields to exclude from the report. If `None`, no fields are excluded.
+            An optional list of fields to exclude from the report. If provided, these fields will
+            be omitted from the JSON output. If `None` (the default), no fields are excluded.
+            This parameter cannot be used together with `use_fields=`. The *Available Report
+            Fields* section provides a listing of fields that can be excluded from the report.

         Returns
         -------
         str
-            A JSON-formatted string representing the validation report.
-        """
+            A JSON-formatted string representing the validation report, with each validation step
+            as an object in the report array.
+
+        Available Report Fields
+        -----------------------
+        The JSON report can include any of the standard validation report fields, including:
+
+        - `i`: the step number (1-indexed)
+        - `i_o`: the original step index from the validation plan (pre-expansion)
+        - `assertion_type`: the type of validation assertion (e.g., `"col_vals_gt"`, etc.)
+        - `column`: the column being validated (or columns used in certain validations)
+        - `values`: the comparison values or parameters used in the validation
+        - `inclusive`: whether the comparison is inclusive (for range-based validations)
+        - `na_pass`: whether `NA`/`Null` values are considered passing (for certain validations)
+        - `pre`: preprocessing function applied before validation
+        - `segments`: data segments to which the validation was applied
+        - `thresholds`: threshold level statement that was used for the validation step
+        - `label`: custom label for the validation step
+        - `brief`: a brief description of the validation step
+        - `active`: whether the validation step is active
+        - `all_passed`: whether all test units passed in the step
+        - `n`: total number of test units
+        - `n_passed`, `n_failed`: number of test units that passed and failed
+        - `f_passed`, `f_failed`: fraction of test units that passed and failed
+        - `warning`, `error`, `critical`: whether the namesake threshold level was exceeded (is
+          `null` if the threshold was not set)
+        - `time_processed`: when the validation step was processed (ISO 8601 format)
+        - `proc_duration_s`: the processing duration in seconds
+
+        Examples
+        --------
+        Let's create a validation plan with a few validation steps and generate a JSON report of
+        the results:

+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        # Create a sample DataFrame
+        tbl = pl.DataFrame({
+            "a": [5, 7, 8, 9],
+            "b": [3, 4, 2, 1]
+        })
+
+        # Create and execute a validation plan
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_gt(columns="a", value=6)
+            .col_vals_lt(columns="b", value=4)
+            .interrogate()
+        )
+
+        # Get the full JSON report
+        json_report = validation.get_json_report()
+
+        print(json_report)
+        ```
+
+        You can also customize which fields to include:
+
+        ```{python}
+        json_report = validation.get_json_report(
+            use_fields=["i", "assertion_type", "column", "n_passed", "n_failed"]
+        )
+
+        print(json_report)
+        ```
+
+        Or which fields to exclude:
+
+        ```{python}
+        json_report = validation.get_json_report(
+            exclude_fields=[
+                "i_o", "thresholds", "pre", "segments", "values",
+                "na_pass", "inclusive", "label", "brief", "active",
+                "time_processed", "proc_duration_s"
+            ]
+        )
+
+        print(json_report)
+        ```
+
+        The JSON output can be further processed or analyzed programmatically:
+
+        ```{python}
+        import json
+
+        # Parse the JSON report
+        report_data = json.loads(validation.get_json_report())
+
+        # Extract and analyze validation results
+        failing_steps = [step for step in report_data if step["n_failed"] > 0]
+        print(f"Number of failing validation steps: {len(failing_steps)}")
+        ```
+
+        See Also
+        --------
+        - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`): Get a formatted
+          HTML report as a GT table
+        - [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`): Get rows that failed
+          validation
+        """
         if use_fields is not None and exclude_fields is not None:
             raise ValueError("Cannot specify both `use_fields=` and `exclude_fields=`.")

@@ -9597,7 +10332,7 @@ class Validate:
                 "col_vals_expr",
             ]:
                 columns_upd.append("&mdash;")
-            elif assertion_type[i] in ["rows_distinct"]:
+            elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
                 if not column:
                     # If there is no column subset, then all columns are used
                     columns_upd.append("ALL COLUMNS")
@@ -9605,7 +10340,7 @@ class Validate:
                     # With a column subset list, format with commas between the column names
                     columns_upd.append(", ".join(column))

-            elif assertion_type[i] in ["conjointly"]:
+            elif assertion_type[i] in ["conjointly", "specially"]:
                 columns_upd.append("")
             else:
                 columns_upd.append(str(column))
@@ -9660,13 +10395,14 @@ class Validate:
                 "col_vals_not_null",
                 "col_exists",
                 "rows_distinct",
+                "rows_complete",
             ]:
                 values_upd.append("&mdash;")

             elif assertion_type[i] in ["col_schema_match"]:
                 values_upd.append("SCHEMA")

-            elif assertion_type[i] in ["col_vals_expr"]:
+            elif assertion_type[i] in ["col_vals_expr", "conjointly"]:
                 values_upd.append("COLUMN EXPR")

             elif assertion_type[i] in ["row_count_match", "col_count_match"]:
@@ -9678,8 +10414,8 @@ class Validate:

                 values_upd.append(str(count))

-            elif assertion_type[i] in ["conjointly"]:
-                values_upd.append("COLUMN EXPR")
+            elif assertion_type[i] in ["specially"]:
+                values_upd.append("EXPR")

             # If the assertion type is not recognized, add the value as a string
             else:
@@ -10213,6 +10949,7 @@ class Validate:
         - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
         - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
         - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
+        - [`rows_complete()`](`pointblank.Validate.rows_complete`)
         - [`conjointly()`](`pointblank.Validate.conjointly`)

         The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
@@ -10372,7 +11109,7 @@ class Validate:
         # if get_row_count(extract) == 0:
         #     return "No rows were extracted."

-        if assertion_type in ROW_BASED_VALIDATION_TYPES:
+        if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete"]:
             # Get the extracted data for the step
             extract = self.get_data_extracts(i=i, frame=True)

@@ -11082,6 +11819,13 @@ def _create_autobrief_or_failure_text(
             for_failure=for_failure,
         )

+    if assertion_type == "rows_complete":
+        return _create_text_rows_complete(
+            lang=lang,
+            columns_subset=column,
+            for_failure=for_failure,
+        )
+
     if assertion_type == "row_count_match":
         return _create_text_row_count_match(
             lang=lang,
@@ -11099,6 +11843,9 @@ def _create_autobrief_or_failure_text(
     if assertion_type == "conjointly":
         return _create_text_conjointly(lang=lang, for_failure=for_failure)

+    if assertion_type == "specially":
+        return _create_text_specially(lang=lang, for_failure=for_failure)
+
     return None  # pragma: no cover


@@ -11257,6 +12004,24 @@ def _create_text_rows_distinct(
     return text


+def _create_text_rows_complete(
+    lang: str, columns_subset: list[str] | None, for_failure: bool = False
+) -> str:
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    if columns_subset is None:
+        text = EXPECT_FAIL_TEXT[f"all_row_complete_{type_}_text"][lang]
+
+    else:
+        column_text = _prep_values_text(values=columns_subset, lang=lang, limit=3)
+
+        text = EXPECT_FAIL_TEXT[f"across_row_complete_{type_}_text"][lang].format(
+            column_text=column_text
+        )
+
+    return text
+
+
 def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str:
     type_ = _expect_failure_type(for_failure=for_failure)

@@ -11279,6 +12044,12 @@ def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
     return EXPECT_FAIL_TEXT[f"conjointly_{type_}_text"][lang]


+def _create_text_specially(lang: str, for_failure: bool = False) -> str:
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    return EXPECT_FAIL_TEXT[f"specially_{type_}_text"][lang]
+
+
 def _prep_column_text(column: str | list[str]) -> str:
     if isinstance(column, list):
         return "`" + str(column[0]) + "`"
@@ -12057,6 +12828,11 @@ def _step_report_row_based(
         text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
     elif assertion_type == "col_vals_not_null":
         text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
+    elif assertion_type == "rows_complete":
+        if column is None:
+            text = STEP_REPORT_TEXT["rows_complete_all"][lang]
+        else:
+            text = STEP_REPORT_TEXT["rows_complete_subset"][lang]

     # Wrap assertion text in a <code> tag
     text = (