pointblank 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/__init__.py CHANGED
@@ -30,8 +30,10 @@ from pointblank.thresholds import Actions, FinalActions, Thresholds
 from pointblank.validate import (
     Validate,
     config,
+    connect_to_table,
     get_action_metadata,
     get_column_count,
+    get_data_path,
     get_row_count,
     get_validation_summary,
     load_dataset,
@@ -60,7 +62,9 @@ __all__ = [
     "first_n",
     "last_n",
     "load_dataset",
+    "get_data_path",
     "config",
+    "connect_to_table",
     "preview",
     "missing_vals_tbl",
     "get_action_metadata",
pointblank/_constants.py CHANGED
@@ -105,10 +105,12 @@ ROW_BASED_VALIDATION_TYPES = [
     "col_vals_regex",
     "col_vals_null",
     "col_vals_not_null",
+    "col_vals_expr",
     "conjointly",
 ]
 
 IBIS_BACKENDS = [
+    "bigquery",
     "databricks",
     "duckdb",
     "memtable",
@@ -158,6 +160,9 @@ MODEL_PROVIDERS = [
 TABLE_TYPE_STYLES = {
     "pandas": {"background": "#150458", "text": "#FFFFFF", "label": "Pandas"},
     "polars": {"background": "#0075FF", "text": "#FFFFFF", "label": "Polars"},
+    "polars-lazy": {"background": "#0075FF", "text": "#FFFFFF", "label": "Polars (LazyFrame)"},
+    "narwhals": {"background": "#78BEAF", "text": "#222222", "label": "Narwhals"},
+    "narwhals-lazy": {"background": "#78BEAF", "text": "#222222", "label": "Narwhals (LazyFrame)"},
     "duckdb": {"background": "#000000", "text": "#FFFFFF", "label": "DuckDB"},
     "mysql": {"background": "#EBAD40", "text": "#222222", "label": "MySQL"},
     "postgres": {"background": "#3E638B", "text": "#FFFFFF", "label": "PostgreSQL"},
@@ -165,6 +170,7 @@ TABLE_TYPE_STYLES = {
     "parquet": {"background": "#3F9FF9", "text": "#FFFFFF", "label": "Parquet"},
     "memtable": {"background": "#2C3E50", "text": "#FFFFFF", "label": "Ibis memtable"},
     "mssql": {"background": "#E2E2E2", "text": "#222222", "label": "MSSQL"},
+    "bigquery": {"background": "#4285F4", "text": "#FFFFFF", "label": "BigQuery"},
     "pyspark": {"background": "#E66F21", "text": "#FFFFFF", "label": "Spark DataFrame"},
     "databricks": {"background": "#FF3621", "text": "#FFFFFF", "label": "Databricks"},
 }
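
The new `TABLE_TYPE_STYLES` entries give Polars LazyFrames, Narwhals tables, and BigQuery their own header badges. A small illustrative lookup over this mapping (not pointblank's actual rendering code, which lives in `_utils_html.py`; the fallback style is invented for the example):

```python
from pointblank._constants import TABLE_TYPE_STYLES

def badge_style(tbl_type: str) -> dict[str, str]:
    # Fall back to a neutral badge for table types without a dedicated style.
    return TABLE_TYPE_STYLES.get(
        tbl_type, {"background": "#BFBFBF", "text": "#222222", "label": tbl_type}
    )

badge_style("narwhals-lazy")["label"]  # "Narwhals (LazyFrame)"
badge_style("bigquery")["label"]       # "BigQuery"
```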
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+from math import floor, log10
+from typing import TYPE_CHECKING
+
+from great_tables.vals import fmt_integer, fmt_number, fmt_scientific
+
+if TYPE_CHECKING:
+    pass
+
+
+def _round_to_sig_figs(value: float, sig_figs: int) -> float:
+    if value == 0:
+        return 0
+    return round(value, sig_figs - int(floor(log10(abs(value)))) - 1)
+
+
+def _compact_integer_fmt(value: float | int) -> str:
+    if value == 0:
+        formatted = "0"
+    elif abs(value) >= 1 and abs(value) < 10_000:
+        formatted = fmt_integer(value, use_seps=False)[0]
+    else:
+        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
+
+    return formatted
+
+
+def _compact_decimal_fmt(value: float | int) -> str:
+    if value == 0:
+        formatted = "0.00"
+    elif abs(value) < 1 and abs(value) >= 0.01:
+        formatted = fmt_number(value, decimals=2)[0]
+    elif abs(value) < 0.01:
+        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
+    elif abs(value) >= 1 and abs(value) < 1000:
+        formatted = fmt_number(value, n_sigfig=3)[0]
+    elif abs(value) >= 1000 and abs(value) < 10_000:
+        formatted = fmt_number(value, decimals=0, use_seps=False)[0]
+    else:
+        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
+
+    return formatted
+
+
+def _compact_0_1_fmt(value: float | int | None) -> str | None:
+    if value is None:
+        return value
+
+    if value == 0:
+        return " 0.00"
+
+    if value == 1:
+        return " 1.00"
+
+    if abs(value) < 1 and abs(value) >= 0.01:
+        return " " + fmt_number(value, decimals=2)[0]
+
+    if abs(value) < 0.01:
+        return "<0.01"
+
+    if abs(value) > 0.99:
+        return ">0.99"
+
+    return fmt_number(value, n_sigfig=3)[0]
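
This hunk adds a new helper module of compact number formatters built on `great_tables.vals` (the file's name isn't shown in this diff). The significant-figure rounding is plain arithmetic and can be checked in isolation; a standalone sketch of the same logic:

```python
from math import floor, log10

def round_to_sig_figs(value: float, sig_figs: int) -> float:
    # Same logic as _round_to_sig_figs above, reproduced here for illustration.
    if value == 0:
        return 0
    return round(value, sig_figs - int(floor(log10(abs(value)))) - 1)

round_to_sig_figs(0.012345, 3)  # 0.0123
round_to_sig_figs(98765, 2)     # 99000
```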
pointblank/_utils.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import inspect
 import re
+from collections import defaultdict
 from typing import TYPE_CHECKING, Any
 
 import narwhals as nw
@@ -12,9 +13,28 @@ from narwhals.typing import FrameT
 from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES
 
 if TYPE_CHECKING:
+    from collections.abc import Mapping
+
     from pointblank._typing import AbsoluteBounds, Tolerance
 
 
+def transpose_dicts(list_of_dicts: list[dict[str, Any]]) -> dict[str, list[Any]]:
+    if not list_of_dicts:
+        return {}
+
+    # Get all unique keys across all dictionaries
+    all_keys = set()
+    for d in list_of_dicts:
+        all_keys.update(d.keys())
+
+    result = defaultdict(list)
+    for d in list_of_dicts:
+        for key in all_keys:
+            result[key].append(d.get(key))  # None is default for missing keys
+
+    return dict(result)
+
+
 def _derive_single_bound(ref: int, tol: int | float) -> int:
     """Derive a single bound using the reference."""
     if not isinstance(tol, float | int):
@@ -88,6 +108,29 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     return "unknown"  # pragma: no cover
 
 
+def _is_narwhals_table(data: any) -> bool:
+    # Check if the data is a Narwhals DataFrame
+    type_str = str(type(data)).lower()
+
+    if "narwhals" in type_str:
+        # If the object is not a Narwhals DataFrame, return False
+        return True
+
+    return False
+
+
+def _is_lazy_frame(data: any) -> bool:
+    # Check if the data is a Polars or Narwhals DataFrame
+    type_str = str(type(data)).lower()
+
+    if "polars" not in type_str and "narwhals" not in type_str:
+        # If the object is neither a Polars nor a Narwhals DataFrame, return False
+        return False
+
+    # Check if the data is a lazy frame
+    return "lazy" in type_str
+
+
 def _is_lib_present(lib_name: str) -> bool:
     import importlib
 
@@ -186,6 +229,77 @@ def _check_column_exists(dfn: nw.DataFrame, column: str) -> None:
         raise ValueError(f"Column '{column}' not found in DataFrame.")
 
 
+def _count_true_values_in_column(
+    tbl: FrameT,
+    column: str,
+    inverse: bool = False,
+) -> int:
+    """
+    Count the number of `True` values in a specified column of a table.
+
+    Parameters
+    ----------
+    tbl
+        A Narwhals-compatible DataFrame or table-like object.
+    column
+        The column in which to count the `True` values.
+    inverse
+        If `True`, count the number of `False` values instead.
+
+    Returns
+    -------
+    int
+        The count of `True` (or `False`) values in the specified column.
+    """
+
+    # Convert the DataFrame to a Narwhals DataFrame (no detrimental effect if
+    # already a Narwhals DataFrame)
+    tbl_nw = nw.from_native(tbl)
+
+    # Filter the table based on the column and whether we want to count True or False values
+    tbl_filtered = tbl_nw.filter(nw.col(column) if not inverse else ~nw.col(column))
+
+    # Always collect table if it is a LazyFrame; this is required to get the row count
+    if _is_lazy_frame(tbl_filtered):
+        tbl_filtered = tbl_filtered.collect()
+
+    return len(tbl_filtered)
+
+
+def _count_null_values_in_column(
+    tbl: FrameT,
+    column: str,
+) -> int:
+    """
+    Count the number of Null values in a specified column of a table.
+
+    Parameters
+    ----------
+    tbl
+        A Narwhals-compatible DataFrame or table-like object.
+    column
+        The column in which to count the Null values.
+
+    Returns
+    -------
+    int
+        The count of Null values in the specified column.
+    """
+
+    # Convert the DataFrame to a Narwhals DataFrame (no detrimental effect if
+    # already a Narwhals DataFrame)
+    tbl_nw = nw.from_native(tbl)
+
+    # Filter the table to get rows where the specified column is Null
+    tbl_filtered = tbl_nw.filter(nw.col(column).is_null())
+
+    # Always collect table if it is a LazyFrame; this is required to get the row count
+    if _is_lazy_frame(tbl_filtered):
+        tbl_filtered = tbl_filtered.collect()
+
+    return len(tbl_filtered)
+
+
 def _is_numeric_dtype(dtype: str) -> bool:
     """
     Check if a given data type string represents a numeric type.
@@ -514,6 +628,8 @@ def _get_api_text() -> str:
         "Validate.get_data_extracts",
         "Validate.all_passed",
         "Validate.assert_passing",
+        "Validate.assert_below_threshold",
+        "Validate.above_threshold",
         "Validate.n",
         "Validate.n_passed",
         "Validate.n_failed",
@@ -531,6 +647,7 @@ def _get_api_text() -> str:
         "missing_vals_tbl",
         "assistant",
         "load_dataset",
+        "get_data_path",
     ]
 
     utility_exported = [
@@ -782,3 +899,14 @@ def _format_to_float_value(
     formatted_vals = _get_column_of_values(gt, column_name="x", context="html")
 
     return formatted_vals[0]
+
+
+def _pivot_to_dict(col_dict: Mapping[str, Any]):  # TODO : Type hint and unit test
+    result_dict = {}
+    for col, sub_dict in col_dict.items():
+        for key, value in sub_dict.items():
+            # add columns fields not present
+            if key not in result_dict:
+                result_dict[key] = [None] * len(col_dict)
+            result_dict[key][list(col_dict.keys()).index(col)] = value
+    return result_dict
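
`transpose_dicts()` and `_pivot_to_dict()` both reshape nested dictionaries into column-major form, while the new `_count_*` helpers collect lazy frames before taking `len()`. A quick sketch of the reshaping helpers (key order in `transpose_dicts()` comes from a `set`, so it isn't guaranteed):

```python
from pointblank._utils import _pivot_to_dict, transpose_dicts

rows = [{"a": 1, "b": 2}, {"a": 3}]
transpose_dicts(rows)
# {"a": [1, 3], "b": [2, None]}  -- missing keys are filled with None

col_stats = {"x": {"mean": 1.0, "max": 3.0}, "y": {"mean": 2.0}}
_pivot_to_dict(col_stats)
# {"mean": [1.0, 2.0], "max": [3.0, None]}
```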
pointblank/_utils_html.py CHANGED
@@ -1,9 +1,49 @@
 from __future__ import annotations
 
+from typing import Any
+
+from great_tables import html
+
 from pointblank._constants import TABLE_TYPE_STYLES
 from pointblank._utils import _format_to_integer_value
 
 
+def _fmt_frac(vec) -> list[str | None]:
+    res: list[str | None] = []
+    for x in vec:
+        if x is None:
+            res.append(x)
+            continue
+
+        if x == 0:
+            res.append("0")
+            continue
+
+        if x < 0.01:
+            res.append("<.01")
+            continue
+
+        try:
+            intx: int = int(x)
+        except ValueError:  # generic object, ie. NaN
+            res.append(str(x))
+            continue
+
+        if intx == x:  # can remove trailing 0s w/o loss
+            res.append(str(intx))
+            continue
+
+        res.append(str(round(x, 2)))
+
+    return res
+
+
+def _make_sublabel(major: str, minor: str) -> Any:
+    return html(
+        f'{major!s}<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">{minor!s}</span>'
+    )
+
+
 def _create_table_type_html(
     tbl_type: str | None, tbl_name: str | None, font_size: str = "10px"
 ) -> str:
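
`_fmt_frac()` renders small fractions compactly (values below 0.01 become `"<.01"`, whole-number floats drop their trailing `.0`), and `_make_sublabel()` wraps a subscript in a `<span>` via `great_tables.html()`. For illustration, the fraction formatter should behave as follows:

```python
from pointblank._utils_html import _fmt_frac

_fmt_frac([None, 0, 0.004, 2.0, 0.256, float("nan")])
# [None, "0", "<.01", "2", "0.26", "nan"]
```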
pointblank/actions.py CHANGED
@@ -216,7 +216,7 @@ def send_slack_notification(
             thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
             actions=pb.Actions(critical=notify_slack),
         )
-        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
+        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
         .col_vals_gt(columns="item_revenue", value=0.05)
         .col_vals_gt(columns="session_duration", value=15)
         .interrogate()
@@ -248,7 +248,7 @@ def send_slack_notification(
             thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
             final_actions=pb.FinalActions(notify_slack),
         )
-        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
+        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
         .col_vals_gt(columns="item_revenue", value=0.05)
         .col_vals_gt(columns="session_duration", value=15)
         .interrogate()
@@ -316,7 +316,7 @@ def send_slack_notification(
             actions=pb.Actions(default=notify_slack),
             final_actions=pb.FinalActions(notify_slack),
         )
-        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
+        .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
         .col_vals_gt(columns="item_revenue", value=0.05)
         .col_vals_gt(columns="session_duration", value=15)
         .interrogate()
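
The only change here swaps `\d` for the explicit `[0-9]` class in the docstring examples. For the IDs shown, the two patterns are interchangeable; a quick check with Python's `re`:

```python
import re

player_id = "ABCDEFGHIJKL123"
assert re.fullmatch(r"[A-Z]{12}\d{3}", player_id)
assert re.fullmatch(r"[A-Z]{12}[0-9]{3}", player_id)
```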
pointblank/assistant.py CHANGED
@@ -176,9 +176,7 @@ def assistant(
     if data is not None:
         scan = DataScan(data=data)
 
-        scan_dict = scan.to_dict()
-
-        tbl_type = scan_dict["tbl_type"]
+        tbl_type: str = scan.profile.implementation.name.lower()
         tbl_json = scan.to_json()
 
         if tbl_name is not None:
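
The table type is now read from the scan's profile instead of `DataScan.to_dict()`. The lower-cased name follows Narwhals' `Implementation` enum; a sketch of the equivalent lookup done directly on a native frame (assuming the profile stores that enum):

```python
import narwhals as nw
import polars as pl

df = pl.DataFrame({"x": [1, 2, 3]})
impl = nw.from_native(df).implementation  # e.g. Implementation.POLARS
impl.name.lower()                         # "polars"
```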
pointblank/column.py CHANGED
@@ -1007,7 +1007,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
     `[rev_01, rev_02, profit_01, profit_02, age]`
 
     and you want to validate columns that have two digits at the end of the name, you can use
-    `columns=matches(r"\d{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
+    `columns=matches(r"[0-9]{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
     `profit_02` columns.
 
     There will be a validation step created for every resolved column. Note that if there aren't any
@@ -1061,7 +1061,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
     [`col()`](`pointblank.col`) function, like this:
 
     ```python
-    col(matches(r"^\d{5}") & ends_with("_id"))
+    col(matches(r"^[0-9]{5}") & ends_with("_id"))
     ```
 
     There are four operators that can be used to compose column selectors:
@@ -1107,7 +1107,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
 
     validation = (
         pb.Validate(data=tbl)
-        .col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID\d{4}")
+        .col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID[0-9]{4}")
         .interrogate()
     )
 
@@ -1115,7 +1115,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
     ```
 
     From the results of the validation table we get two validation steps, one for `id_old` and one
-    for `new_identifier`. The values in both columns all match the pattern `"ID\d{4}"`.
+    for `new_identifier`. The values in both columns all match the pattern `"ID[0-9]{4}"`.
 
     We can also use the `matches()` function in combination with other column selectors (within
     [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select
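
As in `actions.py`, the docstrings now spell out `[0-9]` instead of `\d`. One practical difference worth noting (whether it motivated the change isn't stated in the diff): in Python's `re`, `\d` also matches non-ASCII decimal digits, while `[0-9]` does not:

```python
import re

arabic_three = "\u0663"  # ARABIC-INDIC DIGIT THREE
bool(re.fullmatch(r"\d", arabic_three))     # True
bool(re.fullmatch(r"[0-9]", arabic_three))  # False
```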
pointblank/compare.py ADDED
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pointblank import DataScan
+
+if TYPE_CHECKING:
+    from narwhals.typing import IntoFrame
+
+
+class Compare:
+    def __init__(self, a: IntoFrame, b: IntoFrame) -> None:
+        self.a: IntoFrame = a
+        self.b: IntoFrame = b
+
+    def compare(self) -> None:
+        ## Scan both frames
+        self._scana = DataScan(self.a)
+        self._scanb = DataScan(self.b)
+
+        ## Get summary outs
+        summarya = self._scana.summary_data
+        summaryb = self._scana.summary_data
+
+        summarya.columns
+
+        self._scana.profile
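
`Compare` is an early work-in-progress: `compare()` scans both inputs with `DataScan` but doesn't yet return a result (and, as written, reads `summary_data` from the first scan twice). A usage sketch against the class as added, reaching into its private scan attributes for illustration:

```python
import polars as pl
from pointblank.compare import Compare

a = pl.DataFrame({"x": [1, 2, 3]})
b = pl.DataFrame({"x": [1, 2, 4]})

cmp = Compare(a, b)
cmp.compare()                        # runs DataScan over both frames; returns None for now
summary_a = cmp._scana.summary_data  # per-column summary of `a`
```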