pointblank 0.17.0__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_agg.py +120 -0
- pointblank/_constants.py +334 -55
- pointblank/_constants_translations.py +378 -0
- pointblank/_datascan_utils.py +28 -10
- pointblank/_interrogation.py +406 -149
- pointblank/_typing.py +12 -0
- pointblank/_utils.py +81 -44
- pointblank/_utils_ai.py +4 -5
- pointblank/_utils_check_args.py +3 -3
- pointblank/_utils_llms_txt.py +40 -2
- pointblank/actions.py +1 -1
- pointblank/assistant.py +2 -3
- pointblank/cli.py +1 -1
- pointblank/column.py +162 -46
- pointblank/data/api-docs.txt +2695 -49
- pointblank/datascan.py +17 -17
- pointblank/draft.py +2 -3
- pointblank/scan_profile.py +2 -1
- pointblank/schema.py +61 -20
- pointblank/thresholds.py +15 -13
- pointblank/validate.py +2034 -233
- pointblank/validate.pyi +1104 -0
- pointblank/yaml.py +10 -6
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/METADATA +2 -2
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/RECORD +30 -28
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/WHEEL +1 -1
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/licenses/LICENSE +1 -1
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.17.0.dist-info → pointblank-0.19.0.dist-info}/top_level.txt +0 -0
pointblank/column.py
CHANGED
|
@@ -2,12 +2,14 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
+
from typing import Any
|
|
5
6
|
|
|
6
7
|
import narwhals as nw
|
|
7
8
|
from narwhals.typing import IntoDataFrame
|
|
8
9
|
|
|
9
10
|
__all__ = [
|
|
10
11
|
"col",
|
|
12
|
+
"ref",
|
|
11
13
|
"starts_with",
|
|
12
14
|
"ends_with",
|
|
13
15
|
"contains",
|
|
@@ -192,6 +194,22 @@ class ColumnLiteral(Column):
|
|
|
192
194
|
return self.exprs
|
|
193
195
|
|
|
194
196
|
|
|
197
|
+
@dataclass
|
|
198
|
+
class ReferenceColumn:
|
|
199
|
+
"""
|
|
200
|
+
A class to represent a column from the reference data.
|
|
201
|
+
|
|
202
|
+
This is used with aggregate validation methods (like `col_sum_eq`, `col_avg_gt`, etc.)
|
|
203
|
+
to compare the aggregate value of a column in the main data against the aggregate
|
|
204
|
+
value of a column in the reference data.
|
|
205
|
+
"""
|
|
206
|
+
|
|
207
|
+
column_name: str
|
|
208
|
+
|
|
209
|
+
def __repr__(self):
|
|
210
|
+
return f"ref({self.column_name!r})"
|
|
211
|
+
|
|
212
|
+
|
|
195
213
|
@dataclass
|
|
196
214
|
class ColumnSelectorNarwhals(Column):
|
|
197
215
|
"""
|
|
@@ -211,11 +229,16 @@ class ColumnSelectorNarwhals(Column):
|
|
|
211
229
|
|
|
212
230
|
exprs: nw.selectors.Selector
|
|
213
231
|
|
|
214
|
-
def resolve(
|
|
232
|
+
def resolve(
|
|
233
|
+
self, columns: list[str] | None = None, table: IntoDataFrame | None = None
|
|
234
|
+
) -> list[str]:
|
|
235
|
+
# Note: columns parameter is unused - Narwhals selectors need the actual table
|
|
236
|
+
if table is None:
|
|
237
|
+
raise ValueError("ColumnSelectorNarwhals requires a table for resolution")
|
|
215
238
|
# Convert the native table to a Narwhals DataFrame
|
|
216
239
|
dfn = nw.from_native(table)
|
|
217
240
|
# Use the selector to select columns and return their names
|
|
218
|
-
selected_df = dfn.select(self.exprs.exprs)
|
|
241
|
+
selected_df = dfn.select(self.exprs.exprs) # type: ignore[attr-defined]
|
|
219
242
|
# Use `collect_schema()` for LazyFrame to avoid performance warnings
|
|
220
243
|
if hasattr(selected_df, "collect_schema"):
|
|
221
244
|
return list(selected_df.collect_schema().keys())
|
|
@@ -224,7 +247,7 @@ class ColumnSelectorNarwhals(Column):
|
|
|
224
247
|
|
|
225
248
|
|
|
226
249
|
def col(
|
|
227
|
-
exprs: str | ColumnSelector | ColumnSelectorNarwhals,
|
|
250
|
+
exprs: str | ColumnSelector | ColumnSelectorNarwhals | nw.selectors.Selector,
|
|
228
251
|
) -> Column | ColumnLiteral | ColumnSelectorNarwhals:
|
|
229
252
|
"""
|
|
230
253
|
Helper function for referencing a column in the input table.
|
|
@@ -522,6 +545,83 @@ def col(
|
|
|
522
545
|
raise TypeError(f"Unsupported type: {type(exprs)}") # pragma: no cover
|
|
523
546
|
|
|
524
547
|
|
|
548
|
+
def ref(column_name: str) -> ReferenceColumn:
|
|
549
|
+
"""
|
|
550
|
+
Reference a column from the reference data for aggregate comparisons.
|
|
551
|
+
|
|
552
|
+
This function is used with aggregate validation methods (like `col_sum_eq`, `col_avg_gt`, etc.)
|
|
553
|
+
to compare the aggregate value of a column in the main data against the aggregate value of
|
|
554
|
+
a column in the reference data.
|
|
555
|
+
|
|
556
|
+
To use this function, you must first set the reference data on the `Validate` object using
|
|
557
|
+
the `reference=` parameter in the constructor.
|
|
558
|
+
|
|
559
|
+
Parameters
|
|
560
|
+
----------
|
|
561
|
+
column_name
|
|
562
|
+
The name of the column in the reference data to compute the aggregate from.
|
|
563
|
+
|
|
564
|
+
Returns
|
|
565
|
+
-------
|
|
566
|
+
ReferenceColumn
|
|
567
|
+
A reference column marker that indicates the value should be computed from the
|
|
568
|
+
reference data.
|
|
569
|
+
|
|
570
|
+
Examples
|
|
571
|
+
--------
|
|
572
|
+
```{python}
|
|
573
|
+
#| echo: false
|
|
574
|
+
#| output: false
|
|
575
|
+
import pointblank as pb
|
|
576
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
577
|
+
```
|
|
578
|
+
|
|
579
|
+
Suppose we have two DataFrames: a current data table and a reference (historical) table.
|
|
580
|
+
We want to validate that the sum of a column in the current data matches the sum of the
|
|
581
|
+
same column in the reference data.
|
|
582
|
+
|
|
583
|
+
```{python}
|
|
584
|
+
import pointblank as pb
|
|
585
|
+
import polars as pl
|
|
586
|
+
|
|
587
|
+
# Current data
|
|
588
|
+
current_data = pl.DataFrame({"sales": [100, 200, 300]})
|
|
589
|
+
|
|
590
|
+
# Reference (historical) data
|
|
591
|
+
reference_data = pl.DataFrame({"sales": [100, 200, 300]})
|
|
592
|
+
|
|
593
|
+
validation = (
|
|
594
|
+
pb.Validate(data=current_data, reference=reference_data)
|
|
595
|
+
.col_sum_eq("sales", pb.ref("sales"))
|
|
596
|
+
.interrogate()
|
|
597
|
+
)
|
|
598
|
+
|
|
599
|
+
validation
|
|
600
|
+
```
|
|
601
|
+
|
|
602
|
+
You can also compare different columns or use tolerance:
|
|
603
|
+
|
|
604
|
+
```{python}
|
|
605
|
+
current_data = pl.DataFrame({"revenue": [105, 205, 305]})
|
|
606
|
+
reference_data = pl.DataFrame({"sales": [100, 200, 300]})
|
|
607
|
+
|
|
608
|
+
# Check if revenue sum is within 10% of sales sum
|
|
609
|
+
validation = (
|
|
610
|
+
pb.Validate(data=current_data, reference=reference_data)
|
|
611
|
+
.col_sum_eq("revenue", pb.ref("sales"), tol=0.1)
|
|
612
|
+
.interrogate()
|
|
613
|
+
)
|
|
614
|
+
|
|
615
|
+
validation
|
|
616
|
+
```
|
|
617
|
+
|
|
618
|
+
See Also
|
|
619
|
+
--------
|
|
620
|
+
The [`col()`](`pointblank.col`) function for referencing columns within the same table.
|
|
621
|
+
"""
|
|
622
|
+
return ReferenceColumn(column_name=column_name)
|
|
623
|
+
|
|
624
|
+
|
|
525
625
|
def starts_with(text: str, case_sensitive: bool = False) -> StartsWith:
|
|
526
626
|
"""
|
|
527
627
|
Select columns that start with specified text.
|
|
@@ -1634,13 +1734,25 @@ class ColumnExpression:
|
|
|
1634
1734
|
Supports operations like >, <, +, etc. for creating backend-agnostic validation expressions.
|
|
1635
1735
|
"""
|
|
1636
1736
|
|
|
1637
|
-
|
|
1737
|
+
column_name: str | None
|
|
1738
|
+
operation: str | None
|
|
1739
|
+
left: ColumnExpression | None
|
|
1740
|
+
right: ColumnExpression | str | int | float | None
|
|
1741
|
+
|
|
1742
|
+
def __init__(
|
|
1743
|
+
self,
|
|
1744
|
+
column_name: str | None = None,
|
|
1745
|
+
operation: str | None = None,
|
|
1746
|
+
left: ColumnExpression | None = None,
|
|
1747
|
+
right: ColumnExpression | str | int | float | None = None,
|
|
1748
|
+
):
|
|
1638
1749
|
self.column_name = column_name # Name of the column (for leaf nodes)
|
|
1639
1750
|
self.operation = operation # Operation type (gt, lt, add, etc.)
|
|
1640
1751
|
self.left = left # Left operand (ColumnExpression or None for column reference)
|
|
1641
1752
|
self.right = right # Right operand (ColumnExpression, value, or None)
|
|
1642
1753
|
|
|
1643
|
-
|
|
1754
|
+
# TODO: This method would benefit from stronger typing
|
|
1755
|
+
def to_polars_expr(self) -> Any:
|
|
1644
1756
|
"""Convert this expression to a Polars expression."""
|
|
1645
1757
|
import polars as pl
|
|
1646
1758
|
|
|
@@ -1650,16 +1762,16 @@ class ColumnExpression:
|
|
|
1650
1762
|
|
|
1651
1763
|
# Handle unary operations like is_null
|
|
1652
1764
|
if self.operation == "is_null":
|
|
1653
|
-
left_expr = self.left
|
|
1765
|
+
left_expr: Any = self.left
|
|
1654
1766
|
if isinstance(left_expr, ColumnExpression):
|
|
1655
1767
|
left_expr = left_expr.to_polars_expr()
|
|
1656
|
-
return left_expr.is_null()
|
|
1768
|
+
return left_expr.is_null() # type: ignore[union-attr]
|
|
1657
1769
|
|
|
1658
1770
|
if self.operation == "is_not_null":
|
|
1659
1771
|
left_expr = self.left
|
|
1660
1772
|
if isinstance(left_expr, ColumnExpression):
|
|
1661
1773
|
left_expr = left_expr.to_polars_expr()
|
|
1662
|
-
return left_expr.is_not_null()
|
|
1774
|
+
return left_expr.is_not_null() # type: ignore[union-attr]
|
|
1663
1775
|
|
|
1664
1776
|
# Handle nested expressions through recursive evaluation
|
|
1665
1777
|
if self.operation is None:
|
|
@@ -1667,6 +1779,7 @@ class ColumnExpression:
|
|
|
1667
1779
|
raise ValueError("Invalid expression state: No operation or column name")
|
|
1668
1780
|
|
|
1669
1781
|
# Get the left operand
|
|
1782
|
+
left_expr: Any
|
|
1670
1783
|
if self.left is None and self.column_name is not None:
|
|
1671
1784
|
# Column name as left operand
|
|
1672
1785
|
left_expr = pl.col(self.column_name) # pragma: no cover
|
|
@@ -1678,6 +1791,7 @@ class ColumnExpression:
|
|
|
1678
1791
|
left_expr = self.left # pragma: no cover
|
|
1679
1792
|
|
|
1680
1793
|
# Get the right operand
|
|
1794
|
+
right_expr: Any
|
|
1681
1795
|
if isinstance(self.right, ColumnExpression):
|
|
1682
1796
|
# Nested expression as right operand
|
|
1683
1797
|
right_expr = self.right.to_polars_expr() # pragma: no cover
|
|
@@ -1688,35 +1802,35 @@ class ColumnExpression:
|
|
|
1688
1802
|
# Literal value as right operand
|
|
1689
1803
|
right_expr = self.right # pragma: no cover
|
|
1690
1804
|
|
|
1691
|
-
# Apply the operation
|
|
1805
|
+
# Apply the operation (type ignore needed due to dynamic expression types)
|
|
1692
1806
|
if self.operation == "gt":
|
|
1693
|
-
return left_expr > right_expr
|
|
1807
|
+
return left_expr > right_expr # type: ignore[operator]
|
|
1694
1808
|
elif self.operation == "lt":
|
|
1695
|
-
return left_expr < right_expr
|
|
1809
|
+
return left_expr < right_expr # type: ignore[operator]
|
|
1696
1810
|
elif self.operation == "eq":
|
|
1697
1811
|
return left_expr == right_expr
|
|
1698
1812
|
elif self.operation == "ne":
|
|
1699
1813
|
return left_expr != right_expr
|
|
1700
1814
|
elif self.operation == "ge":
|
|
1701
|
-
return left_expr >= right_expr
|
|
1815
|
+
return left_expr >= right_expr # type: ignore[operator]
|
|
1702
1816
|
elif self.operation == "le":
|
|
1703
|
-
return left_expr <= right_expr
|
|
1817
|
+
return left_expr <= right_expr # type: ignore[operator]
|
|
1704
1818
|
elif self.operation == "add":
|
|
1705
|
-
return left_expr + right_expr
|
|
1819
|
+
return left_expr + right_expr # type: ignore[operator]
|
|
1706
1820
|
elif self.operation == "sub":
|
|
1707
|
-
return left_expr - right_expr
|
|
1821
|
+
return left_expr - right_expr # type: ignore[operator]
|
|
1708
1822
|
elif self.operation == "mul":
|
|
1709
|
-
return left_expr * right_expr
|
|
1823
|
+
return left_expr * right_expr # type: ignore[operator]
|
|
1710
1824
|
elif self.operation == "div":
|
|
1711
|
-
return left_expr / right_expr
|
|
1825
|
+
return left_expr / right_expr # type: ignore[operator]
|
|
1712
1826
|
elif self.operation == "and":
|
|
1713
|
-
return left_expr & right_expr
|
|
1827
|
+
return left_expr & right_expr # type: ignore[operator]
|
|
1714
1828
|
elif self.operation == "or":
|
|
1715
|
-
return left_expr | right_expr
|
|
1829
|
+
return left_expr | right_expr # type: ignore[operator]
|
|
1716
1830
|
else:
|
|
1717
1831
|
raise ValueError(f"Unsupported operation: {self.operation}")
|
|
1718
1832
|
|
|
1719
|
-
def to_pandas_expr(self, df):
|
|
1833
|
+
def to_pandas_expr(self, df: Any) -> Any:
|
|
1720
1834
|
"""Convert this expression to a Pandas Series of booleans."""
|
|
1721
1835
|
|
|
1722
1836
|
# Handle is_null as a special case - but raise an error
|
|
@@ -1739,43 +1853,43 @@ class ColumnExpression:
|
|
|
1739
1853
|
return df[self.column_name]
|
|
1740
1854
|
|
|
1741
1855
|
# For other operations, recursively process operands
|
|
1742
|
-
left_expr = self.left
|
|
1856
|
+
left_expr: Any = self.left
|
|
1743
1857
|
if isinstance(left_expr, ColumnExpression):
|
|
1744
1858
|
left_expr = left_expr.to_pandas_expr(df)
|
|
1745
1859
|
elif isinstance(left_expr, str) and left_expr in df.columns: # pragma: no cover
|
|
1746
1860
|
left_expr = df[left_expr]
|
|
1747
1861
|
|
|
1748
|
-
right_expr = self.right
|
|
1862
|
+
right_expr: Any = self.right
|
|
1749
1863
|
if isinstance(right_expr, ColumnExpression):
|
|
1750
1864
|
right_expr = right_expr.to_pandas_expr(df)
|
|
1751
1865
|
elif isinstance(right_expr, str) and right_expr in df.columns: # pragma: no cover
|
|
1752
1866
|
right_expr = df[right_expr]
|
|
1753
1867
|
|
|
1754
|
-
# Apply the operation
|
|
1868
|
+
# Apply the operation (type ignore needed due to dynamic expression types)
|
|
1755
1869
|
if self.operation == "gt":
|
|
1756
|
-
return left_expr > right_expr
|
|
1870
|
+
return left_expr > right_expr # type: ignore[operator]
|
|
1757
1871
|
elif self.operation == "lt":
|
|
1758
|
-
return left_expr < right_expr
|
|
1872
|
+
return left_expr < right_expr # type: ignore[operator]
|
|
1759
1873
|
elif self.operation == "eq":
|
|
1760
1874
|
return left_expr == right_expr
|
|
1761
1875
|
elif self.operation == "ne":
|
|
1762
1876
|
return left_expr != right_expr
|
|
1763
1877
|
elif self.operation == "ge":
|
|
1764
|
-
return left_expr >= right_expr
|
|
1878
|
+
return left_expr >= right_expr # type: ignore[operator]
|
|
1765
1879
|
elif self.operation == "le":
|
|
1766
|
-
return left_expr <= right_expr
|
|
1880
|
+
return left_expr <= right_expr # type: ignore[operator]
|
|
1767
1881
|
elif self.operation == "add":
|
|
1768
|
-
return left_expr + right_expr
|
|
1882
|
+
return left_expr + right_expr # type: ignore[operator]
|
|
1769
1883
|
elif self.operation == "sub":
|
|
1770
|
-
return left_expr - right_expr
|
|
1884
|
+
return left_expr - right_expr # type: ignore[operator]
|
|
1771
1885
|
elif self.operation == "mul":
|
|
1772
|
-
return left_expr * right_expr
|
|
1886
|
+
return left_expr * right_expr # type: ignore[operator]
|
|
1773
1887
|
elif self.operation == "div":
|
|
1774
|
-
return left_expr / right_expr
|
|
1888
|
+
return left_expr / right_expr # type: ignore[operator]
|
|
1775
1889
|
else:
|
|
1776
1890
|
raise ValueError(f"Unsupported operation: {self.operation}")
|
|
1777
1891
|
|
|
1778
|
-
def to_ibis_expr(self, table):
|
|
1892
|
+
def to_ibis_expr(self, table: Any) -> Any:
|
|
1779
1893
|
"""Convert this expression to an Ibis expression."""
|
|
1780
1894
|
|
|
1781
1895
|
# Base case: simple column reference
|
|
@@ -1784,16 +1898,16 @@ class ColumnExpression:
|
|
|
1784
1898
|
|
|
1785
1899
|
# Handle unary operations
|
|
1786
1900
|
if self.operation == "is_null":
|
|
1787
|
-
left_expr = self.left
|
|
1901
|
+
left_expr: Any = self.left
|
|
1788
1902
|
if isinstance(left_expr, ColumnExpression):
|
|
1789
1903
|
left_expr = left_expr.to_ibis_expr(table)
|
|
1790
|
-
return left_expr.isnull()
|
|
1904
|
+
return left_expr.isnull() # type: ignore[union-attr]
|
|
1791
1905
|
|
|
1792
1906
|
if self.operation == "is_not_null":
|
|
1793
1907
|
left_expr = self.left
|
|
1794
1908
|
if isinstance(left_expr, ColumnExpression):
|
|
1795
1909
|
left_expr = left_expr.to_ibis_expr(table)
|
|
1796
|
-
return ~left_expr.isnull()
|
|
1910
|
+
return ~left_expr.isnull() # type: ignore[union-attr,operator]
|
|
1797
1911
|
|
|
1798
1912
|
# Handle nested expressions through recursive evaluation
|
|
1799
1913
|
if self.operation is None:
|
|
@@ -1801,6 +1915,7 @@ class ColumnExpression:
|
|
|
1801
1915
|
raise ValueError("Invalid expression state: No operation or column name")
|
|
1802
1916
|
|
|
1803
1917
|
# Get the left operand
|
|
1918
|
+
left_expr: Any
|
|
1804
1919
|
if self.left is None and self.column_name is not None:
|
|
1805
1920
|
# Column name as left operand
|
|
1806
1921
|
left_expr = table[self.column_name] # pragma: no cover
|
|
@@ -1812,6 +1927,7 @@ class ColumnExpression:
|
|
|
1812
1927
|
left_expr = self.left # pragma: no cover
|
|
1813
1928
|
|
|
1814
1929
|
# Get the right operand
|
|
1930
|
+
right_expr: Any
|
|
1815
1931
|
if isinstance(self.right, ColumnExpression):
|
|
1816
1932
|
# Nested expression as right operand
|
|
1817
1933
|
right_expr = self.right.to_ibis_expr(table) # pragma: no cover
|
|
@@ -1822,31 +1938,31 @@ class ColumnExpression:
|
|
|
1822
1938
|
# Literal value as right operand
|
|
1823
1939
|
right_expr = self.right # pragma: no cover
|
|
1824
1940
|
|
|
1825
|
-
# Apply the operation
|
|
1941
|
+
# Apply the operation (type ignore needed due to dynamic expression types)
|
|
1826
1942
|
if self.operation == "gt":
|
|
1827
|
-
return left_expr > right_expr
|
|
1943
|
+
return left_expr > right_expr # type: ignore[operator]
|
|
1828
1944
|
elif self.operation == "lt":
|
|
1829
|
-
return left_expr < right_expr
|
|
1945
|
+
return left_expr < right_expr # type: ignore[operator]
|
|
1830
1946
|
elif self.operation == "eq":
|
|
1831
1947
|
return left_expr == right_expr
|
|
1832
1948
|
elif self.operation == "ne":
|
|
1833
1949
|
return left_expr != right_expr
|
|
1834
1950
|
elif self.operation == "ge":
|
|
1835
|
-
return left_expr >= right_expr
|
|
1951
|
+
return left_expr >= right_expr # type: ignore[operator]
|
|
1836
1952
|
elif self.operation == "le":
|
|
1837
|
-
return left_expr <= right_expr
|
|
1953
|
+
return left_expr <= right_expr # type: ignore[operator]
|
|
1838
1954
|
elif self.operation == "add":
|
|
1839
|
-
return left_expr + right_expr
|
|
1955
|
+
return left_expr + right_expr # type: ignore[operator]
|
|
1840
1956
|
elif self.operation == "sub":
|
|
1841
|
-
return left_expr - right_expr
|
|
1957
|
+
return left_expr - right_expr # type: ignore[operator]
|
|
1842
1958
|
elif self.operation == "mul":
|
|
1843
|
-
return left_expr * right_expr
|
|
1959
|
+
return left_expr * right_expr # type: ignore[operator]
|
|
1844
1960
|
elif self.operation == "div":
|
|
1845
|
-
return left_expr / right_expr
|
|
1961
|
+
return left_expr / right_expr # type: ignore[operator]
|
|
1846
1962
|
elif self.operation == "and":
|
|
1847
|
-
return left_expr & right_expr
|
|
1963
|
+
return left_expr & right_expr # type: ignore[operator]
|
|
1848
1964
|
elif self.operation == "or":
|
|
1849
|
-
return left_expr | right_expr
|
|
1965
|
+
return left_expr | right_expr # type: ignore[operator]
|
|
1850
1966
|
else:
|
|
1851
1967
|
raise ValueError(f"Unsupported operation: {self.operation}")
|
|
1852
1968
|
|