pointblank 0.17.0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
pointblank/column.py CHANGED
@@ -2,12 +2,14 @@ from __future__ import annotations
2
2
 
3
3
  import re
4
4
  from dataclasses import dataclass
5
+ from typing import Any
5
6
 
6
7
  import narwhals as nw
7
8
  from narwhals.typing import IntoDataFrame
8
9
 
9
10
  __all__ = [
10
11
  "col",
12
+ "ref",
11
13
  "starts_with",
12
14
  "ends_with",
13
15
  "contains",
@@ -192,6 +194,22 @@ class ColumnLiteral(Column):
192
194
  return self.exprs
193
195
 
194
196
 
197
+ @dataclass
198
+ class ReferenceColumn:
199
+ """
200
+ A class to represent a column from the reference data.
201
+
202
+ This is used with aggregate validation methods (like `col_sum_eq`, `col_avg_gt`, etc.)
203
+ to compare the aggregate value of a column in the main data against the aggregate
204
+ value of a column in the reference data.
205
+ """
206
+
207
+ column_name: str
208
+
209
+ def __repr__(self):
210
+ return f"ref({self.column_name!r})"
211
+
212
+
195
213
  @dataclass
196
214
  class ColumnSelectorNarwhals(Column):
197
215
  """
@@ -211,11 +229,16 @@ class ColumnSelectorNarwhals(Column):
211
229
 
212
230
  exprs: nw.selectors.Selector
213
231
 
214
- def resolve(self, table) -> list[str]:
232
+ def resolve(
233
+ self, columns: list[str] | None = None, table: IntoDataFrame | None = None
234
+ ) -> list[str]:
235
+ # Note: columns parameter is unused - Narwhals selectors need the actual table
236
+ if table is None:
237
+ raise ValueError("ColumnSelectorNarwhals requires a table for resolution")
215
238
  # Convert the native table to a Narwhals DataFrame
216
239
  dfn = nw.from_native(table)
217
240
  # Use the selector to select columns and return their names
218
- selected_df = dfn.select(self.exprs.exprs)
241
+ selected_df = dfn.select(self.exprs.exprs) # type: ignore[attr-defined]
219
242
  # Use `collect_schema()` for LazyFrame to avoid performance warnings
220
243
  if hasattr(selected_df, "collect_schema"):
221
244
  return list(selected_df.collect_schema().keys())
@@ -224,7 +247,7 @@ class ColumnSelectorNarwhals(Column):
224
247
 
225
248
 
226
249
  def col(
227
- exprs: str | ColumnSelector | ColumnSelectorNarwhals,
250
+ exprs: str | ColumnSelector | ColumnSelectorNarwhals | nw.selectors.Selector,
228
251
  ) -> Column | ColumnLiteral | ColumnSelectorNarwhals:
229
252
  """
230
253
  Helper function for referencing a column in the input table.
@@ -522,6 +545,83 @@ def col(
522
545
  raise TypeError(f"Unsupported type: {type(exprs)}") # pragma: no cover
523
546
 
524
547
 
548
+ def ref(column_name: str) -> ReferenceColumn:
549
+ """
550
+ Reference a column from the reference data for aggregate comparisons.
551
+
552
+ This function is used with aggregate validation methods (like `col_sum_eq`, `col_avg_gt`, etc.)
553
+ to compare the aggregate value of a column in the main data against the aggregate value of
554
+ a column in the reference data.
555
+
556
+ To use this function, you must first set the reference data on the `Validate` object using
557
+ the `reference=` parameter in the constructor.
558
+
559
+ Parameters
560
+ ----------
561
+ column_name
562
+ The name of the column in the reference data to compute the aggregate from.
563
+
564
+ Returns
565
+ -------
566
+ ReferenceColumn
567
+ A reference column marker that indicates the value should be computed from the
568
+ reference data.
569
+
570
+ Examples
571
+ --------
572
+ ```{python}
573
+ #| echo: false
574
+ #| output: false
575
+ import pointblank as pb
576
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
577
+ ```
578
+
579
+ Suppose we have two DataFrames: a current data table and a reference (historical) table.
580
+ We want to validate that the sum of a column in the current data matches the sum of the
581
+ same column in the reference data.
582
+
583
+ ```{python}
584
+ import pointblank as pb
585
+ import polars as pl
586
+
587
+ # Current data
588
+ current_data = pl.DataFrame({"sales": [100, 200, 300]})
589
+
590
+ # Reference (historical) data
591
+ reference_data = pl.DataFrame({"sales": [100, 200, 300]})
592
+
593
+ validation = (
594
+ pb.Validate(data=current_data, reference=reference_data)
595
+ .col_sum_eq("sales", pb.ref("sales"))
596
+ .interrogate()
597
+ )
598
+
599
+ validation
600
+ ```
601
+
602
+ You can also compare different columns or use tolerance:
603
+
604
+ ```{python}
605
+ current_data = pl.DataFrame({"revenue": [105, 205, 305]})
606
+ reference_data = pl.DataFrame({"sales": [100, 200, 300]})
607
+
608
+ # Check if revenue sum is within 10% of sales sum
609
+ validation = (
610
+ pb.Validate(data=current_data, reference=reference_data)
611
+ .col_sum_eq("revenue", pb.ref("sales"), tol=0.1)
612
+ .interrogate()
613
+ )
614
+
615
+ validation
616
+ ```
617
+
618
+ See Also
619
+ --------
620
+ The [`col()`](`pointblank.col`) function for referencing columns within the same table.
621
+ """
622
+ return ReferenceColumn(column_name=column_name)
623
+
624
+
525
625
  def starts_with(text: str, case_sensitive: bool = False) -> StartsWith:
526
626
  """
527
627
  Select columns that start with specified text.
@@ -1634,13 +1734,25 @@ class ColumnExpression:
1634
1734
  Supports operations like >, <, +, etc. for creating backend-agnostic validation expressions.
1635
1735
  """
1636
1736
 
1637
- def __init__(self, column_name=None, operation=None, left=None, right=None):
1737
+ column_name: str | None
1738
+ operation: str | None
1739
+ left: ColumnExpression | None
1740
+ right: ColumnExpression | str | int | float | None
1741
+
1742
+ def __init__(
1743
+ self,
1744
+ column_name: str | None = None,
1745
+ operation: str | None = None,
1746
+ left: ColumnExpression | None = None,
1747
+ right: ColumnExpression | str | int | float | None = None,
1748
+ ):
1638
1749
  self.column_name = column_name # Name of the column (for leaf nodes)
1639
1750
  self.operation = operation # Operation type (gt, lt, add, etc.)
1640
1751
  self.left = left # Left operand (ColumnExpression or None for column reference)
1641
1752
  self.right = right # Right operand (ColumnExpression, value, or None)
1642
1753
 
1643
- def to_polars_expr(self):
1754
+ # TODO: This method would benefit from stronger typing
1755
+ def to_polars_expr(self) -> Any:
1644
1756
  """Convert this expression to a Polars expression."""
1645
1757
  import polars as pl
1646
1758
 
@@ -1650,16 +1762,16 @@ class ColumnExpression:
1650
1762
 
1651
1763
  # Handle unary operations like is_null
1652
1764
  if self.operation == "is_null":
1653
- left_expr = self.left
1765
+ left_expr: Any = self.left
1654
1766
  if isinstance(left_expr, ColumnExpression):
1655
1767
  left_expr = left_expr.to_polars_expr()
1656
- return left_expr.is_null()
1768
+ return left_expr.is_null() # type: ignore[union-attr]
1657
1769
 
1658
1770
  if self.operation == "is_not_null":
1659
1771
  left_expr = self.left
1660
1772
  if isinstance(left_expr, ColumnExpression):
1661
1773
  left_expr = left_expr.to_polars_expr()
1662
- return left_expr.is_not_null()
1774
+ return left_expr.is_not_null() # type: ignore[union-attr]
1663
1775
 
1664
1776
  # Handle nested expressions through recursive evaluation
1665
1777
  if self.operation is None:
@@ -1667,6 +1779,7 @@ class ColumnExpression:
1667
1779
  raise ValueError("Invalid expression state: No operation or column name")
1668
1780
 
1669
1781
  # Get the left operand
1782
+ left_expr: Any
1670
1783
  if self.left is None and self.column_name is not None:
1671
1784
  # Column name as left operand
1672
1785
  left_expr = pl.col(self.column_name) # pragma: no cover
@@ -1678,6 +1791,7 @@ class ColumnExpression:
1678
1791
  left_expr = self.left # pragma: no cover
1679
1792
 
1680
1793
  # Get the right operand
1794
+ right_expr: Any
1681
1795
  if isinstance(self.right, ColumnExpression):
1682
1796
  # Nested expression as right operand
1683
1797
  right_expr = self.right.to_polars_expr() # pragma: no cover
@@ -1688,35 +1802,35 @@ class ColumnExpression:
1688
1802
  # Literal value as right operand
1689
1803
  right_expr = self.right # pragma: no cover
1690
1804
 
1691
- # Apply the operation
1805
+ # Apply the operation (type ignore needed due to dynamic expression types)
1692
1806
  if self.operation == "gt":
1693
- return left_expr > right_expr
1807
+ return left_expr > right_expr # type: ignore[operator]
1694
1808
  elif self.operation == "lt":
1695
- return left_expr < right_expr
1809
+ return left_expr < right_expr # type: ignore[operator]
1696
1810
  elif self.operation == "eq":
1697
1811
  return left_expr == right_expr
1698
1812
  elif self.operation == "ne":
1699
1813
  return left_expr != right_expr
1700
1814
  elif self.operation == "ge":
1701
- return left_expr >= right_expr
1815
+ return left_expr >= right_expr # type: ignore[operator]
1702
1816
  elif self.operation == "le":
1703
- return left_expr <= right_expr
1817
+ return left_expr <= right_expr # type: ignore[operator]
1704
1818
  elif self.operation == "add":
1705
- return left_expr + right_expr
1819
+ return left_expr + right_expr # type: ignore[operator]
1706
1820
  elif self.operation == "sub":
1707
- return left_expr - right_expr
1821
+ return left_expr - right_expr # type: ignore[operator]
1708
1822
  elif self.operation == "mul":
1709
- return left_expr * right_expr
1823
+ return left_expr * right_expr # type: ignore[operator]
1710
1824
  elif self.operation == "div":
1711
- return left_expr / right_expr
1825
+ return left_expr / right_expr # type: ignore[operator]
1712
1826
  elif self.operation == "and":
1713
- return left_expr & right_expr
1827
+ return left_expr & right_expr # type: ignore[operator]
1714
1828
  elif self.operation == "or":
1715
- return left_expr | right_expr
1829
+ return left_expr | right_expr # type: ignore[operator]
1716
1830
  else:
1717
1831
  raise ValueError(f"Unsupported operation: {self.operation}")
1718
1832
 
1719
- def to_pandas_expr(self, df):
1833
+ def to_pandas_expr(self, df: Any) -> Any:
1720
1834
  """Convert this expression to a Pandas Series of booleans."""
1721
1835
 
1722
1836
  # Handle is_null as a special case - but raise an error
@@ -1739,43 +1853,43 @@ class ColumnExpression:
1739
1853
  return df[self.column_name]
1740
1854
 
1741
1855
  # For other operations, recursively process operands
1742
- left_expr = self.left
1856
+ left_expr: Any = self.left
1743
1857
  if isinstance(left_expr, ColumnExpression):
1744
1858
  left_expr = left_expr.to_pandas_expr(df)
1745
1859
  elif isinstance(left_expr, str) and left_expr in df.columns: # pragma: no cover
1746
1860
  left_expr = df[left_expr]
1747
1861
 
1748
- right_expr = self.right
1862
+ right_expr: Any = self.right
1749
1863
  if isinstance(right_expr, ColumnExpression):
1750
1864
  right_expr = right_expr.to_pandas_expr(df)
1751
1865
  elif isinstance(right_expr, str) and right_expr in df.columns: # pragma: no cover
1752
1866
  right_expr = df[right_expr]
1753
1867
 
1754
- # Apply the operation
1868
+ # Apply the operation (type ignore needed due to dynamic expression types)
1755
1869
  if self.operation == "gt":
1756
- return left_expr > right_expr
1870
+ return left_expr > right_expr # type: ignore[operator]
1757
1871
  elif self.operation == "lt":
1758
- return left_expr < right_expr
1872
+ return left_expr < right_expr # type: ignore[operator]
1759
1873
  elif self.operation == "eq":
1760
1874
  return left_expr == right_expr
1761
1875
  elif self.operation == "ne":
1762
1876
  return left_expr != right_expr
1763
1877
  elif self.operation == "ge":
1764
- return left_expr >= right_expr
1878
+ return left_expr >= right_expr # type: ignore[operator]
1765
1879
  elif self.operation == "le":
1766
- return left_expr <= right_expr
1880
+ return left_expr <= right_expr # type: ignore[operator]
1767
1881
  elif self.operation == "add":
1768
- return left_expr + right_expr
1882
+ return left_expr + right_expr # type: ignore[operator]
1769
1883
  elif self.operation == "sub":
1770
- return left_expr - right_expr
1884
+ return left_expr - right_expr # type: ignore[operator]
1771
1885
  elif self.operation == "mul":
1772
- return left_expr * right_expr
1886
+ return left_expr * right_expr # type: ignore[operator]
1773
1887
  elif self.operation == "div":
1774
- return left_expr / right_expr
1888
+ return left_expr / right_expr # type: ignore[operator]
1775
1889
  else:
1776
1890
  raise ValueError(f"Unsupported operation: {self.operation}")
1777
1891
 
1778
- def to_ibis_expr(self, table):
1892
+ def to_ibis_expr(self, table: Any) -> Any:
1779
1893
  """Convert this expression to an Ibis expression."""
1780
1894
 
1781
1895
  # Base case: simple column reference
@@ -1784,16 +1898,16 @@ class ColumnExpression:
1784
1898
 
1785
1899
  # Handle unary operations
1786
1900
  if self.operation == "is_null":
1787
- left_expr = self.left
1901
+ left_expr: Any = self.left
1788
1902
  if isinstance(left_expr, ColumnExpression):
1789
1903
  left_expr = left_expr.to_ibis_expr(table)
1790
- return left_expr.isnull()
1904
+ return left_expr.isnull() # type: ignore[union-attr]
1791
1905
 
1792
1906
  if self.operation == "is_not_null":
1793
1907
  left_expr = self.left
1794
1908
  if isinstance(left_expr, ColumnExpression):
1795
1909
  left_expr = left_expr.to_ibis_expr(table)
1796
- return ~left_expr.isnull()
1910
+ return ~left_expr.isnull() # type: ignore[union-attr,operator]
1797
1911
 
1798
1912
  # Handle nested expressions through recursive evaluation
1799
1913
  if self.operation is None:
@@ -1801,6 +1915,7 @@ class ColumnExpression:
1801
1915
  raise ValueError("Invalid expression state: No operation or column name")
1802
1916
 
1803
1917
  # Get the left operand
1918
+ left_expr: Any
1804
1919
  if self.left is None and self.column_name is not None:
1805
1920
  # Column name as left operand
1806
1921
  left_expr = table[self.column_name] # pragma: no cover
@@ -1812,6 +1927,7 @@ class ColumnExpression:
1812
1927
  left_expr = self.left # pragma: no cover
1813
1928
 
1814
1929
  # Get the right operand
1930
+ right_expr: Any
1815
1931
  if isinstance(self.right, ColumnExpression):
1816
1932
  # Nested expression as right operand
1817
1933
  right_expr = self.right.to_ibis_expr(table) # pragma: no cover
@@ -1822,31 +1938,31 @@ class ColumnExpression:
1822
1938
  # Literal value as right operand
1823
1939
  right_expr = self.right # pragma: no cover
1824
1940
 
1825
- # Apply the operation
1941
+ # Apply the operation (type ignore needed due to dynamic expression types)
1826
1942
  if self.operation == "gt":
1827
- return left_expr > right_expr
1943
+ return left_expr > right_expr # type: ignore[operator]
1828
1944
  elif self.operation == "lt":
1829
- return left_expr < right_expr
1945
+ return left_expr < right_expr # type: ignore[operator]
1830
1946
  elif self.operation == "eq":
1831
1947
  return left_expr == right_expr
1832
1948
  elif self.operation == "ne":
1833
1949
  return left_expr != right_expr
1834
1950
  elif self.operation == "ge":
1835
- return left_expr >= right_expr
1951
+ return left_expr >= right_expr # type: ignore[operator]
1836
1952
  elif self.operation == "le":
1837
- return left_expr <= right_expr
1953
+ return left_expr <= right_expr # type: ignore[operator]
1838
1954
  elif self.operation == "add":
1839
- return left_expr + right_expr
1955
+ return left_expr + right_expr # type: ignore[operator]
1840
1956
  elif self.operation == "sub":
1841
- return left_expr - right_expr
1957
+ return left_expr - right_expr # type: ignore[operator]
1842
1958
  elif self.operation == "mul":
1843
- return left_expr * right_expr
1959
+ return left_expr * right_expr # type: ignore[operator]
1844
1960
  elif self.operation == "div":
1845
- return left_expr / right_expr
1961
+ return left_expr / right_expr # type: ignore[operator]
1846
1962
  elif self.operation == "and":
1847
- return left_expr & right_expr
1963
+ return left_expr & right_expr # type: ignore[operator]
1848
1964
  elif self.operation == "or":
1849
- return left_expr | right_expr
1965
+ return left_expr | right_expr # type: ignore[operator]
1850
1966
  else:
1851
1967
  raise ValueError(f"Unsupported operation: {self.operation}")
1852
1968