pointblank 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import functools
4
+ from collections.abc import Callable
4
5
  from dataclasses import dataclass
5
6
  from typing import Any
6
7
 
@@ -16,6 +17,7 @@ from pointblank._spec_utils import (
16
17
  check_postal_code,
17
18
  check_vin,
18
19
  )
20
+ from pointblank._typing import AbsoluteBounds
19
21
  from pointblank._utils import (
20
22
  _column_test_prep,
21
23
  _convert_to_narwhals,
@@ -745,6 +747,28 @@ def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> b
745
747
  return row_count >= min_val and row_count <= max_val
746
748
 
747
749
 
750
+ def col_pct_null(
751
+ data_tbl: FrameT, column: str, p: float, bound_finder: Callable[[int], AbsoluteBounds]
752
+ ) -> bool:
753
+ """Check if the percentage of null values is within p given the absolute bounds."""
754
+ # Convert to narwhals for consistent API across backends
755
+ nw_tbl = nw.from_native(data_tbl)
756
+
757
+ # Handle LazyFrames by collecting them first
758
+ if hasattr(nw_tbl, "collect"):
759
+ nw_tbl = nw_tbl.collect()
760
+
761
+ # Get total rows using narwhals
762
+ total_rows: int = nw_tbl.select(nw.len()).item()
763
+ abs_target: float = round(total_rows * p)
764
+ lower_bound, upper_bound = bound_finder(abs_target)
765
+
766
+ # Count null values
767
+ n_null: int = nw_tbl.select(nw.col(column).is_null().sum()).item()
768
+
769
+ return n_null >= (abs_target - lower_bound) and n_null <= (abs_target + upper_bound)
770
+
771
+
748
772
  def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
749
773
  """
750
774
  Check if DataFrame column count matches expected count.
@@ -104,6 +104,7 @@ def _get_api_text() -> str:
104
104
  "Validate.rows_distinct",
105
105
  "Validate.rows_complete",
106
106
  "Validate.col_exists",
107
+ "Validate.col_pct_null",
107
108
  "Validate.col_schema_match",
108
109
  "Validate.row_count_match",
109
110
  "Validate.col_count_match",
@@ -5402,6 +5402,247 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
5402
5402
  failing validation step (the check for column `c`, which doesn't exist).
5403
5403
 
5404
5404
 
5405
+ col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', p: 'float', tol: 'Tolerance' = 0, thresholds: 'int | float | None | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5406
+
5407
+ Validate whether a column has a specific percentage of Null values.
5408
+
5409
+ The `col_pct_null()` validation method checks whether the percentage of Null values in a
5410
+ column matches a specified percentage `p=` (within an optional tolerance `tol=`). This
5411
+ validation operates at the column level, generating a single validation step per column that
5412
+ passes or fails based on whether the actual percentage of Null values falls within the
5413
+ acceptable range defined by `p ± tol`.
5414
+
5415
+ Parameters
5416
+ ----------
5417
+ columns
5418
+ A single column or a list of columns to validate. Can also use
5419
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
5420
+ multiple columns are supplied or resolved, there will be a separate validation step
5421
+ generated for each column.
5422
+ p
5423
+ The expected percentage of Null values in the column, expressed as a decimal between
5424
+ `0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null.
5425
+ tol
5426
+ The tolerance allowed when comparing the actual percentage of Null values to the
5427
+ expected percentage `p=`. The validation passes if the actual percentage falls within
5428
+ the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See
5429
+ the *Tolerance* section for details on all supported formats (absolute, relative,
5430
+ symmetric, and asymmetric bounds).
5431
+ thresholds
5432
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
5433
+ The thresholds are set at the step level and will override any global thresholds set in
5434
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
5435
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
5436
+ section for information on how to set threshold levels.
5437
+ actions
5438
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
5439
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
5440
+ define the actions.
5441
+ brief
5442
+ An optional brief description of the validation step that will be displayed in the
5443
+ reporting table. You can use the templating elements like `"{step}"` to insert
5444
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
5445
+ the entire brief will be automatically generated. If `None` (the default) then there
5446
+ won't be a brief.
5447
+ active
5448
+ A boolean value indicating whether the validation step should be active. Using `False`
5449
+ will make the validation step inactive (still reporting its presence and keeping indexes
5450
+ for the steps unchanged).
5451
+
5452
+ Returns
5453
+ -------
5454
+ Validate
5455
+ The `Validate` object with the added validation step.
5456
+
5457
+ Tolerance
5458
+ ---------
5459
+ The `tol=` parameter accepts several different formats to specify the acceptable deviation
5460
+ from the expected percentage `p=`. The tolerance can be expressed as:
5461
+
5462
+ 1. *single integer* (absolute tolerance): the exact number of test units that can deviate.
5463
+ For example, `tol=2` means the actual count can differ from the expected count by up to 2
5464
+ units in either direction.
5465
+
5466
+ 2. *single float between 0 and 1* (relative tolerance): a proportion of the expected
5467
+ count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is
5468
+ 45 to 55 (50 ± 10% of 50 = 50 ± 5).
5469
+
5470
+ 3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper
5471
+ bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be
5472
+ 1 unit below or 3 units above the expected count.
5473
+
5474
+ 4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower
5475
+ and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the
5476
+ lower bound is 5% below and the upper bound is 15% above the expected count.
5477
+
5478
+ When using a single value (integer or float), the tolerance is applied symmetrically in both
5479
+ directions. When using a tuple, you can specify asymmetric tolerances where the lower and
5480
+ upper bounds differ.
5481
+
5482
+ Thresholds
5483
+ ----------
5484
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5485
+ step. If they are set here at the step level, these thresholds will override any thresholds
5486
+ set at the global level in `Validate(thresholds=...)`.
5487
+
5488
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
5489
+ can either be set as the proportion of all test units that fail (a value between `0` and `1`),
5490
+ or as the absolute number of failing test units (an integer that's `1` or greater).
5491
+
5492
+ Thresholds can be defined using one of these input schemes:
5493
+
5494
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5495
+ thresholds)
5496
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5497
+ the 'error' level, and position `2` is the 'critical' level
5498
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
5499
+ 'critical'
5500
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5501
+ for the 'warning' level only
5502
+
5503
+ If the number of failing test units exceeds set thresholds, the validation step will be
5504
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
5505
+ set; you're free to set any combination of them.
5506
+
5507
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
5508
+ take for each level of failure (using the `actions=` parameter).
5509
+
5510
+ Examples
5511
+ --------
5512
+ For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`,
5513
+ and `c`) that have different percentages of Null values. The table is shown below:
5514
+
5515
+ ```python
5516
+ import pointblank as pb
5517
+ import polars as pl
5518
+
5519
+ tbl = pl.DataFrame(
5520
+ {
5521
+ "a": [1, 2, 3, 4, 5, 6, 7, 8],
5522
+ "b": [1, None, 3, None, 5, None, 7, None],
5523
+ "c": [None, None, None, None, None, None, 1, 2],
5524
+ }
5525
+ )
5526
+
5527
+ pb.preview(tbl)
5528
+ ```
5529
+
5530
+ Let's validate that column `a` has 0% Null values (i.e., no Null values at all).
5531
+
5532
+ ```python
5533
+ validation = (
5534
+ pb.Validate(data=tbl)
5535
+ .col_pct_null(columns="a", p=0.0)
5536
+ .interrogate()
5537
+ )
5538
+
5539
+ validation
5540
+ ```
5541
+
5542
+ Printing the `validation` object shows the validation table in an HTML viewing environment.
5543
+ The validation table shows the single entry that corresponds to the validation step created
5544
+ by using `col_pct_null()`. The validation passed since column `a` has no Null values.
5545
+
5546
+ Now, let's check that column `b` has exactly 50% Null values.
5547
+
5548
+ ```python
5549
+ validation = (
5550
+ pb.Validate(data=tbl)
5551
+ .col_pct_null(columns="b", p=0.5)
5552
+ .interrogate()
5553
+ )
5554
+
5555
+ validation
5556
+ ```
5557
+
5558
+ This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%).
5559
+
5560
+ Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so
5561
+ we'll check if it's approximately 70% Null with a tolerance of 10%.
5562
+
5563
+ ```python
5564
+ validation = (
5565
+ pb.Validate(data=tbl)
5566
+ .col_pct_null(columns="c", p=0.70, tol=0.10)
5567
+ .interrogate()
5568
+ )
5569
+
5570
+ validation
5571
+ ```
5572
+
5573
+ This validation passes because the actual percentage (75%) falls within the acceptable
5574
+ range of 60% to 80% (70% ± 10%).
5575
+
5576
+ The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the
5577
+ different ways to specify tolerance using column `b`, which has exactly 50% Null values
5578
+ (4 out of 8 values).
5579
+
5580
+ *Using an absolute tolerance (integer)*: Specify the exact number of rows that can
5581
+ deviate. With `tol=1`, we allow the count to differ by 1 row in either direction.
5582
+
5583
+ ```python
5584
+ validation = (
5585
+ pb.Validate(data=tbl)
5586
+ .col_pct_null(columns="b", p=0.375, tol=1) # Expect 3 nulls, allow ±1 (range: 2-4)
5587
+ .interrogate()
5588
+ )
5589
+
5590
+ validation
5591
+ ```
5592
+
5593
+ This passes because column `b` has 4 Null values, which falls within the acceptable range
5594
+ of 2 to 4 (3 ± 1).
5595
+
5596
+ *Using a relative tolerance (float)*: Specify the tolerance as a proportion of the
5597
+ expected count. With `tol=0.25`, we allow a 25% deviation from the expected count.
5598
+
5599
+ ```python
5600
+ validation = (
5601
+ pb.Validate(data=tbl)
5602
+ .col_pct_null(columns="b", p=0.375, tol=0.25) # Expect 3 nulls, allow ±25% (range: 2.25-3.75)
5603
+ .interrogate()
5604
+ )
5605
+
5606
+ validation
5607
+ ```
5608
+
5609
+ This passes because 4 Null values falls within the acceptable range (3 ± 0.75 calculates
5610
+ to 2.25 to 3.75, which rounds down to 2 to 3 rows).
5611
+
5612
+ *Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and
5613
+ upper bounds as absolute values. With `tol=(0, 2)`, we allow no deviation below but up
5614
+ to 2 rows above the expected count.
5615
+
5616
+ ```python
5617
+ validation = (
5618
+ pb.Validate(data=tbl)
5619
+ .col_pct_null(columns="b", p=0.25, tol=(0, 2)) # Expect 2 Nulls, allow -0/+2 (range: 2-4)
5620
+ .interrogate()
5621
+ )
5622
+
5623
+ validation
5624
+ ```
5625
+
5626
+ This passes because 4 Null values falls within the acceptable range of 2 to 4.
5627
+
5628
+ *Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper
5629
+ bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the
5630
+ expected count.
5631
+
5632
+ ```python
5633
+ validation = (
5634
+ pb.Validate(data=tbl)
5635
+ .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30%
5636
+ .interrogate()
5637
+ )
5638
+
5639
+ validation
5640
+ ```
5641
+
5642
+ This passes because 4 Null values falls within the acceptable range (3 - 0.3 to 3 + 0.9
5643
+ calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
5644
+
5645
+
5405
5646
  col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5406
5647
 
5407
5648
  Do columns in the table (and their types) match a predefined schema?
@@ -8596,7 +8837,7 @@ set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str |
8596
8837
  ```
8597
8838
 
8598
8839
 
8599
- get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None) -> 'GT'
8840
+ get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None, incl_footer_timings: 'bool' = None, incl_footer_notes: 'bool' = None) -> 'GT'
8600
8841
 
8601
8842
  Validation report as a GT table.
8602
8843
 
@@ -8618,6 +8859,20 @@ get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool'
8618
8859
  name of the table as the title for the report. If no title is wanted, then `":none:"`
8619
8860
  can be used. Aside from keyword options, text can be provided for the title. This will
8620
8861
  be interpreted as Markdown text and transformed internally to HTML.
8862
+ incl_header
8863
+ Controls whether the header section should be displayed. If `None`, uses the global
8864
+ configuration setting. The header contains the table name, label, and threshold
8865
+ information.
8866
+ incl_footer
8867
+ Controls whether the footer section should be displayed. If `None`, uses the global
8868
+ configuration setting. The footer can contain validation timing information and notes.
8869
+ incl_footer_timings
8870
+ Controls whether validation timing information (start time, duration, end time) should
8871
+ be displayed in the footer. If `None`, uses the global configuration setting. Only
8872
+ applies when `incl_footer=True`.
8873
+ incl_footer_notes
8874
+ Controls whether notes from validation steps should be displayed in the footer. If
8875
+ `None`, uses the global configuration setting. Only applies when `incl_footer=True`.
8621
8876
 
8622
8877
  Returns
8623
8878
  -------
@@ -12310,7 +12565,7 @@ read_file(filepath: 'str | Path') -> 'Validate'
12310
12565
  to disk for later retrieval with this function.
12311
12566
 
12312
12567
 
12313
- config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
12568
+ config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, report_incl_footer_timings: 'bool' = True, report_incl_footer_notes: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
12314
12569
 
12315
12570
  Configuration settings for the Pointblank library.
12316
12571
 
@@ -12322,7 +12577,13 @@ config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, pre
12322
12577
  threshold levels (if set).
12323
12578
  report_incl_footer
12324
12579
  Should the footer of the validation table report be displayed? The footer contains the
12325
- starting and ending times of the interrogation.
12580
+ starting and ending times of the interrogation and any notes added to validation steps.
12581
+ report_incl_footer_timings
12582
+ Controls whether the validation timing information (start time, duration, and end time)
12583
+ should be displayed in the footer. Only applies when `report_incl_footer=True`.
12584
+ report_incl_footer_notes
12585
+ Controls whether the notes from validation steps should be displayed in the footer. Only
12586
+ applies when `report_incl_footer=True`.
12326
12587
  preview_incl_header
12327
12588
  Whether the header should be present in any preview table (generated via the
12328
12589
  [`preview()`](`pointblank.preview`) function).