pointblank 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -52,6 +52,7 @@ from pointblank._interrogation import (
52
52
  ColValsCompareTwo,
53
53
  ColValsExpr,
54
54
  ColValsRegex,
55
+ ConjointlyValidation,
55
56
  NumberOfTestUnits,
56
57
  RowCountMatch,
57
58
  RowsDistinct,
@@ -6462,6 +6463,250 @@ class Validate:
6462
6463
 
6463
6464
  return self
6464
6465
 
6466
+ def conjointly(
6467
+ self,
6468
+ *exprs: Callable,
6469
+ pre: Callable | None = None,
6470
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
6471
+ actions: Actions | None = None,
6472
+ brief: str | bool | None = None,
6473
+ active: bool = True,
6474
+ ) -> Validate:
6475
+ """
6476
+ Perform multiple row-wise validations for joint validity.
6477
+
6478
+ The `conjointly()` validation method checks whether each row in the table passes multiple
6479
+ validation conditions simultaneously. This enables compound validation logic where a test
6480
+ unit (typically a row) must satisfy all specified conditions to pass the validation.
6481
+
6482
+ This method accepts multiple validation expressions as callables, which should return
6483
+ boolean expressions when applied to the data. You can use lambdas that incorporate
6484
+ Polars/Pandas/Ibis expressions (based on the target table type) or create more complex
6485
+ validation functions. The validation will operate over the number of test units that is
6486
+ equal to the number of rows in the table (determined after any `pre=` mutation has been
6487
+ applied).
6488
+
6489
+ Parameters
6490
+ ----------
6491
+ *exprs
6492
+ Multiple validation expressions provided as callable functions. Each callable should
6493
+ accept a table as its single argument and return a boolean expression or Series/Column
6494
+ that evaluates to boolean values for each row.
6495
+ pre
6496
+ An optional preprocessing function or lambda to apply to the data table during
6497
+ interrogation. This function should take a table as input and return a modified table.
6498
+ Have a look at the *Preprocessing* section for more information on how to use this
6499
+ argument.
6500
+ thresholds
6501
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
6502
+ The thresholds are set at the step level and will override any global thresholds set in
6503
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
6504
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
6505
+ section for information on how to set threshold levels.
6506
+ actions
6507
+ Optional actions to take when the validation step meets or exceeds any set threshold
6508
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6509
+ define the actions.
6510
+ brief
6511
+ An optional brief description of the validation step that will be displayed in the
6512
+ reporting table. You can use the templating elements like `"{step}"` to insert
6513
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6514
+ the entire brief will be automatically generated. If `None` (the default) then there
6515
+ won't be a brief.
6516
+ active
6517
+ A boolean value indicating whether the validation step should be active. Using `False`
6518
+ will make the validation step inactive (still reporting its presence and keeping indexes
6519
+ for the steps unchanged).
6520
+
6521
+ Returns
6522
+ -------
6523
+ Validate
6524
+ The `Validate` object with the added validation step.
6525
+
6526
+ Preprocessing
6527
+ -------------
6528
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
6529
+ table during interrogation. This function should take a table as input and return a modified
6530
+ table. This is useful for performing any necessary transformations or filtering on the data
6531
+ before the validation step is applied.
6532
+
6533
+ The preprocessing function can be any callable that takes a table as input and returns a
6534
+ modified table. For example, you could use a lambda function to filter the table based on
6535
+ certain criteria or to apply a transformation to the data. Regarding the lifetime of the
6536
+ transformed table, it only exists during the validation step and is not stored in the
6537
+ `Validate` object or used in subsequent validation steps.
6538
+
6539
+ Thresholds
6540
+ ----------
6541
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
6542
+ step. If they are set here at the step level, these thresholds will override any thresholds
6543
+ set at the global level in `Validate(thresholds=...)`.
6544
+
6545
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
6546
+ can either be set as a proportion of failing test units (a value between `0` and `1`),
6547
+ or the absolute number of failing test units (as an integer that's `1` or greater).
6548
+
6549
+ Thresholds can be defined using one of these input schemes:
6550
+
6551
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
6552
+ thresholds)
6553
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
6554
+ the 'error' level, and position `2` is the 'critical' level
6555
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
6556
+ 'critical'
6557
+ 4. a single integer/float value denoting the absolute number or fraction of failing test units
6558
+ for the 'warning' level only
6559
+
6560
+ If the number of failing test units exceeds set thresholds, the validation step will be
6561
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
6562
+ set; you're free to set any combination of them.
6563
+
6564
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
6565
+ take for each level of failure (using the `actions=` parameter).
6566
+
6567
+ Examples
6568
+ --------
6569
+ ```{python}
6570
+ #| echo: false
6571
+ #| output: false
6572
+ import pointblank as pb
6573
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
6574
+ ```
6575
+ For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`,
6576
+ `b`, and `c`). The table is shown below:
6577
+
6578
+ ```{python}
6579
+ import pointblank as pb
6580
+ import polars as pl
6581
+
6582
+ tbl = pl.DataFrame(
6583
+ {
6584
+ "a": [5, 7, 1, 3, 9, 4],
6585
+ "b": [6, 3, 0, 5, 8, 2],
6586
+ "c": [10, 4, 8, 9, 10, 5],
6587
+ }
6588
+ )
6589
+
6590
+ pb.preview(tbl)
6591
+ ```
6592
+
6593
+ Let's validate that the values in each row satisfy multiple conditions simultaneously:
6594
+
6595
+ 1. Column `a` should be greater than 2
6596
+ 2. Column `b` should be less than 7
6597
+ 3. The sum of `a` and `b` should be less than the value in column `c`
6598
+
6599
+ We'll use `conjointly()` to check all these conditions together:
6600
+
6601
+ ```{python}
6602
+ validation = (
6603
+ pb.Validate(data=tbl)
6604
+ .conjointly(
6605
+ lambda df: pl.col("a") > 2,
6606
+ lambda df: pl.col("b") < 7,
6607
+ lambda df: pl.col("a") + pl.col("b") < pl.col("c")
6608
+ )
6609
+ .interrogate()
6610
+ )
6611
+
6612
+ validation
6613
+ ```
6614
+
6615
+ The validation table shows that not all rows satisfy all three conditions together. For a
6616
+ row to pass the conjoint validation, all three conditions must be true for that row.
6617
+
6618
+ We can also use preprocessing to filter the data before applying the conjoint validation:
6619
+
6620
+ ```{python}
6621
+ validation = (
6622
+ pb.Validate(data=tbl)
6623
+ .conjointly(
6624
+ lambda df: pl.col("a") > 2,
6625
+ lambda df: pl.col("b") < 7,
6626
+ lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
6627
+ pre=lambda df: df.filter(pl.col("c") > 5)
6628
+ )
6629
+ .interrogate()
6630
+ )
6631
+
6632
+ validation
6633
+ ```
6634
+
6635
+ This allows for more complex validation scenarios where the data is first prepared and then
6636
+ validated against multiple conditions simultaneously.
6637
+
6638
+ Or, you can use the backend-agnostic column expression helper
6639
+ [`expr_col()`](`pointblank.expr_col`) to write expressions that work across different table
6640
+ backends:
6641
+
6642
+ ```{python}
6643
+ tbl = pl.DataFrame(
6644
+ {
6645
+ "a": [5, 7, 1, 3, 9, 4],
6646
+ "b": [6, 3, 0, 5, 8, 2],
6647
+ "c": [10, 4, 8, 9, 10, 5],
6648
+ }
6649
+ )
6650
+
6651
+ # Using backend-agnostic syntax with expr_col()
6652
+ validation = (
6653
+ pb.Validate(data=tbl)
6654
+ .conjointly(
6655
+ lambda df: pb.expr_col("a") > 2,
6656
+ lambda df: pb.expr_col("b") < 7,
6657
+ lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c")
6658
+ )
6659
+ .interrogate()
6660
+ )
6661
+
6662
+ validation
6663
+ ```
6664
+
6665
+ Using [`expr_col()`](`pointblank.expr_col`) allows your validation code to work consistently
6666
+ across Pandas, Polars, and Ibis table backends without changes, making your validation
6667
+ pipelines more portable.
6668
+
6669
+ See Also
6670
+ --------
6671
+ Look at the documentation of the [`expr_col()`](`pointblank.expr_col`) function for more
6672
+ information on how to use it with different table backends.
6673
+ """
6674
+
6675
+ assertion_type = _get_fn_name()
6676
+
6677
+ if len(exprs) == 0:
6678
+ raise ValueError("At least one validation expression must be provided")
6679
+
6680
+ _check_pre(pre=pre)
6681
+ _check_thresholds(thresholds=thresholds)
6682
+ _check_boolean_input(param=active, param_name="active")
6683
+
6684
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
6685
+ thresholds = (
6686
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
6687
+ )
6688
+
6689
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
6690
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
6691
+
6692
+ # Package the validation expressions for later evaluation
6693
+ values = {"expressions": exprs}
6694
+
6695
+ val_info = _ValidationInfo(
6696
+ assertion_type=assertion_type,
6697
+ column=None, # This is a rowwise validation, not specific to any column
6698
+ values=values,
6699
+ pre=pre,
6700
+ thresholds=thresholds,
6701
+ actions=actions,
6702
+ brief=brief,
6703
+ active=active,
6704
+ )
6705
+
6706
+ self._add_validation(validation_info=val_info)
6707
+
6708
+ return self
6709
+
6465
6710
  def interrogate(
6466
6711
  self,
6467
6712
  collect_extracts: bool = True,
@@ -6841,6 +7086,14 @@ class Validate:
6841
7086
 
6842
7087
  results_tbl = None
6843
7088
 
7089
+ if assertion_category == "CONJOINTLY":
7090
+ results_tbl = ConjointlyValidation(
7091
+ data_tbl=data_tbl_step,
7092
+ expressions=value["expressions"],
7093
+ threshold=threshold,
7094
+ tbl_type=tbl_type,
7095
+ ).get_test_results()
7096
+
6844
7097
  if assertion_category not in [
6845
7098
  "COL_EXISTS_HAS_TYPE",
6846
7099
  "COL_SCHEMA_MATCH",
@@ -6849,9 +7102,18 @@ class Validate:
6849
7102
  ]:
6850
7103
  # Extract the `pb_is_good_` column from the table as a results list
6851
7104
  if tbl_type in IBIS_BACKENDS:
6852
- results_list = (
6853
- results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
6854
- )
7105
+ # Select the DataFrame library to use for getting the results list
7106
+ df_lib = _select_df_lib(preference="polars")
7107
+ df_lib_name = df_lib.__name__
7108
+
7109
+ if df_lib_name == "pandas":
7110
+ results_list = (
7111
+ results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
7112
+ )
7113
+ else:
7114
+ results_list = (
7115
+ results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
7116
+ )
6855
7117
 
6856
7118
  else:
6857
7119
  results_list = nw.from_native(results_tbl)["pb_is_good_"].to_list()
@@ -7012,7 +7274,7 @@ class Validate:
7012
7274
  # TODO: Add support for extraction of rows for Ibis backends
7013
7275
  if (
7014
7276
  collect_extracts
7015
- and assertion_type in ROW_BASED_VALIDATION_TYPES
7277
+ and assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_distinct"]
7016
7278
  and tbl_type not in IBIS_BACKENDS
7017
7279
  ):
7018
7280
  # Add row numbers to the results table
@@ -7038,6 +7300,32 @@ class Validate:
7038
7300
  if len(validation_extract_nw) > extract_limit:
7039
7301
  validation_extract_nw = validation_extract_nw.head(extract_limit)
7040
7302
 
7303
+ # If a 'rows_distinct' validation step, then the extract should have the
7304
+ # duplicate rows arranged together
7305
+ if assertion_type == "rows_distinct":
7306
+ # Get the list of column names in the extract, excluding the `_row_num_` column
7307
+ column_names = validation_extract_nw.columns
7308
+ column_names.remove("_row_num_")
7309
+
7310
+ # Only include the columns that were defined in `rows_distinct(columns_subset=)`
7311
+ # (stored here in `column`), if supplied
7312
+ if column is not None:
7313
+ column_names = column
7314
+ column_names_subset = ["_row_num_"] + column
7315
+ validation_extract_nw = validation_extract_nw.select(column_names_subset)
7316
+
7317
+ validation_extract_nw = (
7318
+ validation_extract_nw.with_columns(
7319
+ group_min_row=nw.min("_row_num_").over(*column_names)
7320
+ )
7321
+ # Sort by each group's first row number, then by the grouping columns, and
7322
+ # finally by row number within groups; this keeps duplicate rows together while
7323
+ # preserving the original order of first appearance
7324
+ .sort(by=["group_min_row"] + column_names + ["_row_num_"])
7325
+ .drop("group_min_row")
7326
+ )
7327
+
7328
+ # Ensure that the extract is set to its native format
7041
7329
  validation.extract = nw.to_native(validation_extract_nw)
7042
7330
 
7043
7331
  # Get the end time for this step
@@ -7976,6 +8264,7 @@ class Validate:
7976
8264
  - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
7977
8265
  - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
7978
8266
  - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
8267
+ - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
7979
8268
 
7980
8269
  An extracted row means that a test unit failed for that row in the validation step. The
7981
8270
  extracted rows are a subset of the original table and are useful for further analysis or for
@@ -8357,6 +8646,7 @@ class Validate:
8357
8646
  # Do we have a DataFrame library to work with?
8358
8647
  _check_any_df_lib(method_used="get_tabular_report")
8359
8648
 
8649
+ # Select the DataFrame library
8360
8650
  df_lib = _select_df_lib(preference="polars")
8361
8651
 
8362
8652
  # Get information on the input data table
@@ -8586,6 +8876,9 @@ class Validate:
8586
8876
  else:
8587
8877
  # With a column subset list, format with commas between the column names
8588
8878
  columns_upd.append(", ".join(column))
8879
+
8880
+ elif assertion_type[i] in ["conjointly"]:
8881
+ columns_upd.append("")
8589
8882
  else:
8590
8883
  columns_upd.append(str(column))
8591
8884
 
@@ -8657,6 +8950,9 @@ class Validate:
8657
8950
 
8658
8951
  values_upd.append(str(count))
8659
8952
 
8953
+ elif assertion_type[i] in ["conjointly"]:
8954
+ values_upd.append("COLUMN EXPR")
8955
+
8660
8956
  # If the assertion type is not recognized, add the value as a string
8661
8957
  else:
8662
8958
  values_upd.append(str(value))
@@ -9330,6 +9626,24 @@ class Validate:
9330
9626
  lang=lang,
9331
9627
  )
9332
9628
 
9629
+ elif assertion_type == "rows_distinct":
9630
+ extract = self.get_data_extracts(i=i, frame=True)
9631
+
9632
+ step_report = _step_report_rows_distinct(
9633
+ i=i,
9634
+ column=column,
9635
+ column_position=column_position,
9636
+ columns_subset=columns_subset,
9637
+ n=n,
9638
+ n_failed=n_failed,
9639
+ all_passed=all_passed,
9640
+ extract=extract,
9641
+ tbl_preview=tbl_preview,
9642
+ header=header,
9643
+ limit=limit,
9644
+ lang=lang,
9645
+ )
9646
+
9333
9647
  elif assertion_type == "col_schema_match":
9334
9648
  # Get the parameters for column-schema matching
9335
9649
  values_dict = validation_step["values"]
@@ -9925,6 +10239,9 @@ def _create_autobrief_or_failure_text(
9925
10239
  for_failure=for_failure,
9926
10240
  )
9927
10241
 
10242
+ if assertion_type == "conjointly":
10243
+ return _create_text_conjointly(lang=lang, for_failure=for_failure)
10244
+
9928
10245
  return None # pragma: no cover
9929
10246
 
9930
10247
 
@@ -10099,6 +10416,12 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
10099
10416
  return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
10100
10417
 
10101
10418
 
10419
+ def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
10420
+ type_ = _expect_failure_type(for_failure=for_failure)
10421
+
10422
+ return EXPECT_FAIL_TEXT[f"conjointly_{type_}_text"][lang]
10423
+
10424
+
10102
10425
  def _prep_column_text(column: str | list[str]) -> str:
10103
10426
  if isinstance(column, list):
10104
10427
  return "`" + str(column[0]) + "`"
@@ -10672,7 +10995,7 @@ def _step_report_row_based(
10672
10995
  header: str,
10673
10996
  limit: int | None,
10674
10997
  lang: str,
10675
- ):
10998
+ ) -> GT:
10676
10999
  # Get the length of the extracted data for the step
10677
11000
  extract_length = get_row_count(extract)
10678
11001
 
@@ -10889,6 +11212,140 @@ def _step_report_row_based(
10889
11212
  return step_report
10890
11213
 
10891
11214
 
11215
+ def _step_report_rows_distinct(
11216
+ i: int,
11217
+ column: list[str],
11218
+ column_position: list[int],
11219
+ columns_subset: list[str] | None,
11220
+ n: int,
11221
+ n_failed: int,
11222
+ all_passed: bool,
11223
+ extract: any,
11224
+ tbl_preview: GT,
11225
+ header: str,
11226
+ limit: int | None,
11227
+ lang: str,
11228
+ ) -> GT:
11229
+ # Get the length of the extracted data for the step
11230
+ extract_length = get_row_count(extract)
11231
+
11232
+ # Determine whether the `lang` value represents a right-to-left language
11233
+ is_rtl_lang = lang in RTL_LANGUAGES
11234
+ direction_rtl = " direction: rtl;" if is_rtl_lang else ""
11235
+
11236
+ if column is None:
11237
+ text = STEP_REPORT_TEXT["rows_distinct_all"][lang].format(column=column)
11238
+ else:
11239
+ columns_list = ", ".join(column)
11240
+ text = STEP_REPORT_TEXT["rows_distinct_subset"][lang].format(columns_subset=columns_list)
11241
+
11242
+ if all_passed:
11243
+ step_report = tbl_preview
11244
+
11245
+ if header is None:
11246
+ return step_report
11247
+
11248
+ title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
11249
+
11250
+ success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(
11251
+ n=n,
11252
+ column_position=column_position,
11253
+ )
11254
+ preview_stmt = STEP_REPORT_TEXT["preview_statement"][lang]
11255
+
11256
+ details = (
11257
+ f"<div style='font-size: 13.6px; {direction_rtl}'>"
11258
+ "<div style='padding-top: 7px;'>"
11259
+ f"{text}"
11260
+ "</div>"
11261
+ "<div style='padding-top: 7px;'>"
11262
+ f"{success_stmt}"
11263
+ "</div>"
11264
+ f"{preview_stmt}"
11265
+ "</div>"
11266
+ )
11267
+
11268
+ # Generate the default template text for the header when `":default:"` is used
11269
+ if header == ":default:":
11270
+ header = "{title}{details}"
11271
+
11272
+ # Use commonmark to convert the header text to HTML
11273
+ header = commonmark.commonmark(header)
11274
+
11275
+ # Place any templated text in the header
11276
+ header = header.format(title=title, details=details)
11277
+
11278
+ # Create the header with `header` string
11279
+ step_report = step_report.tab_header(title=md(header))
11280
+
11281
+ else:
11282
+ if limit is None:
11283
+ limit = extract_length
11284
+
11285
+ # Create a preview of the extracted data
11286
+ step_report = _generate_display_table(
11287
+ data=extract,
11288
+ columns_subset=columns_subset,
11289
+ n_head=limit,
11290
+ n_tail=0,
11291
+ limit=limit,
11292
+ min_tbl_width=600,
11293
+ incl_header=False,
11294
+ mark_missing_values=False,
11295
+ )
11296
+
11297
+ title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i)
11298
+ failure_rate_metrics = f"<strong>{n_failed}</strong> / <strong>{n}</strong>"
11299
+
11300
+ failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_rows_distinct"][lang].format(
11301
+ failure_rate=failure_rate_metrics,
11302
+ column_position=column_position,
11303
+ )
11304
+
11305
+ if limit < extract_length: # pragma: no cover
11306
+ extract_length_resolved = limit
11307
+ extract_text = STEP_REPORT_TEXT["extract_text_first_rows_distinct"][lang].format(
11308
+ extract_length_resolved=extract_length_resolved
11309
+ )
11310
+
11311
+ else:
11312
+ extract_length_resolved = extract_length
11313
+ extract_text = STEP_REPORT_TEXT["extract_text_all_rows_distinct"][lang].format(
11314
+ extract_length_resolved=extract_length_resolved
11315
+ )
11316
+
11317
+ details = (
11318
+ f"<div style='font-size: 13.6px; {direction_rtl}'>"
11319
+ "<div style='padding-top: 7px;'>"
11320
+ f"{text}"
11321
+ "</div>"
11322
+ "<div style='padding-top: 7px;'>"
11323
+ f"{failure_rate_stmt}"
11324
+ "</div>"
11325
+ f"{extract_text}"
11326
+ "</div>"
11327
+ )
11328
+
11329
+ # If `header` is None then don't add a header and just return the step report
11330
+ if header is None:
11331
+ return step_report
11332
+
11333
+ # Generate the default template text for the header when `":default:"` is used
11334
+ if header == ":default:":
11335
+ header = "{title}{details}"
11336
+
11337
+ # Use commonmark to convert the header text to HTML
11338
+ header = commonmark.commonmark(header)
11339
+
11340
+ # Place any templated text in the header
11341
+ header = header.format(title=title, details=details)
11342
+
11343
+ # Create the header with `header` string
11344
+ step_report = step_report.tab_header(title=md(header))
11345
+
11346
+ return step_report
11347
+
11348
+
10892
11349
  def _step_report_schema_in_order(
10893
11350
  step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
10894
11351
  ) -> GT | any: