pointblank 0.9.6__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +4 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +126 -0
- pointblank/_utils_html.py +40 -0
- pointblank/assistant.py +1 -3
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +518 -125
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/validate.py +1425 -202
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/METADATA +4 -3
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/RECORD +18 -14
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/WHEEL +1 -1
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/top_level.txt +0 -0
pointblank/__init__.py
CHANGED
|
@@ -30,8 +30,10 @@ from pointblank.thresholds import Actions, FinalActions, Thresholds
|
|
|
30
30
|
from pointblank.validate import (
|
|
31
31
|
Validate,
|
|
32
32
|
config,
|
|
33
|
+
connect_to_table,
|
|
33
34
|
get_action_metadata,
|
|
34
35
|
get_column_count,
|
|
36
|
+
get_data_path,
|
|
35
37
|
get_row_count,
|
|
36
38
|
get_validation_summary,
|
|
37
39
|
load_dataset,
|
|
@@ -60,7 +62,9 @@ __all__ = [
|
|
|
60
62
|
"first_n",
|
|
61
63
|
"last_n",
|
|
62
64
|
"load_dataset",
|
|
65
|
+
"get_data_path",
|
|
63
66
|
"config",
|
|
67
|
+
"connect_to_table",
|
|
64
68
|
"preview",
|
|
65
69
|
"missing_vals_tbl",
|
|
66
70
|
"get_action_metadata",
|
pointblank/_constants.py
CHANGED
|
@@ -105,6 +105,7 @@ ROW_BASED_VALIDATION_TYPES = [
|
|
|
105
105
|
"col_vals_regex",
|
|
106
106
|
"col_vals_null",
|
|
107
107
|
"col_vals_not_null",
|
|
108
|
+
"col_vals_expr",
|
|
108
109
|
"conjointly",
|
|
109
110
|
]
|
|
110
111
|
|
|
@@ -159,6 +160,9 @@ MODEL_PROVIDERS = [
|
|
|
159
160
|
TABLE_TYPE_STYLES = {
|
|
160
161
|
"pandas": {"background": "#150458", "text": "#FFFFFF", "label": "Pandas"},
|
|
161
162
|
"polars": {"background": "#0075FF", "text": "#FFFFFF", "label": "Polars"},
|
|
163
|
+
"polars-lazy": {"background": "#0075FF", "text": "#FFFFFF", "label": "Polars (LazyFrame)"},
|
|
164
|
+
"narwhals": {"background": "#78BEAF", "text": "#222222", "label": "Narwhals"},
|
|
165
|
+
"narwhals-lazy": {"background": "#78BEAF", "text": "#222222", "label": "Narwhals (LazyFrame)"},
|
|
162
166
|
"duckdb": {"background": "#000000", "text": "#FFFFFF", "label": "DuckDB"},
|
|
163
167
|
"mysql": {"background": "#EBAD40", "text": "#222222", "label": "MySQL"},
|
|
164
168
|
"postgres": {"background": "#3E638B", "text": "#FFFFFF", "label": "PostgreSQL"},
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from math import floor, log10
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from great_tables.vals import fmt_integer, fmt_number, fmt_scientific
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
pass
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _round_to_sig_figs(value: float, sig_figs: int) -> float:
|
|
13
|
+
if value == 0:
|
|
14
|
+
return 0
|
|
15
|
+
return round(value, sig_figs - int(floor(log10(abs(value)))) - 1)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _compact_integer_fmt(value: float | int) -> str:
|
|
19
|
+
if value == 0:
|
|
20
|
+
formatted = "0"
|
|
21
|
+
elif abs(value) >= 1 and abs(value) < 10_000:
|
|
22
|
+
formatted = fmt_integer(value, use_seps=False)[0]
|
|
23
|
+
else:
|
|
24
|
+
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
|
|
25
|
+
|
|
26
|
+
return formatted
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _compact_decimal_fmt(value: float | int) -> str:
|
|
30
|
+
if value == 0:
|
|
31
|
+
formatted = "0.00"
|
|
32
|
+
elif abs(value) < 1 and abs(value) >= 0.01:
|
|
33
|
+
formatted = fmt_number(value, decimals=2)[0]
|
|
34
|
+
elif abs(value) < 0.01:
|
|
35
|
+
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
|
|
36
|
+
elif abs(value) >= 1 and abs(value) < 1000:
|
|
37
|
+
formatted = fmt_number(value, n_sigfig=3)[0]
|
|
38
|
+
elif abs(value) >= 1000 and abs(value) < 10_000:
|
|
39
|
+
formatted = fmt_number(value, decimals=0, use_seps=False)[0]
|
|
40
|
+
else:
|
|
41
|
+
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
|
|
42
|
+
|
|
43
|
+
return formatted
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _compact_0_1_fmt(value: float | int | None) -> str | None:
|
|
47
|
+
if value is None:
|
|
48
|
+
return value
|
|
49
|
+
|
|
50
|
+
if value == 0:
|
|
51
|
+
return " 0.00"
|
|
52
|
+
|
|
53
|
+
if value == 1:
|
|
54
|
+
return " 1.00"
|
|
55
|
+
|
|
56
|
+
if abs(value) < 1 and abs(value) >= 0.01:
|
|
57
|
+
return " " + fmt_number(value, decimals=2)[0]
|
|
58
|
+
|
|
59
|
+
if abs(value) < 0.01:
|
|
60
|
+
return "<0.01"
|
|
61
|
+
|
|
62
|
+
if abs(value) > 0.99:
|
|
63
|
+
return ">0.99"
|
|
64
|
+
|
|
65
|
+
return fmt_number(value, n_sigfig=3)[0]
|
pointblank/_utils.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import inspect
|
|
4
4
|
import re
|
|
5
|
+
from collections import defaultdict
|
|
5
6
|
from typing import TYPE_CHECKING, Any
|
|
6
7
|
|
|
7
8
|
import narwhals as nw
|
|
@@ -12,9 +13,28 @@ from narwhals.typing import FrameT
|
|
|
12
13
|
from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES
|
|
13
14
|
|
|
14
15
|
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Mapping
|
|
17
|
+
|
|
15
18
|
from pointblank._typing import AbsoluteBounds, Tolerance
|
|
16
19
|
|
|
17
20
|
|
|
21
|
+
def transpose_dicts(list_of_dicts: list[dict[str, Any]]) -> dict[str, list[Any]]:
    """
    Turn a list of row-like dictionaries into a dictionary of column-like lists.

    Parameters
    ----------
    list_of_dicts
        The dictionaries to transpose. They do not need to share the same keys.

    Returns
    -------
    dict[str, list[Any]]
        One entry per key seen in any input dictionary, in first-seen order. Each list has one
        element per input dictionary, with `None` where that dictionary lacks the key. An empty
        input gives an empty dictionary.
    """
    if not list_of_dicts:
        return {}

    # Collect the unique keys in a dict rather than a set so that the key order of the result
    # is deterministic (first-seen) instead of depending on string hashing
    all_keys: dict[str, None] = {}
    for d in list_of_dicts:
        all_keys.update(dict.fromkeys(d))

    # `dict.get()` yields None for keys missing from a given dictionary
    return {key: [d.get(key) for d in list_of_dicts] for key in all_keys}
|
|
36
|
+
|
|
37
|
+
|
|
18
38
|
def _derive_single_bound(ref: int, tol: int | float) -> int:
|
|
19
39
|
"""Derive a single bound using the reference."""
|
|
20
40
|
if not isinstance(tol, float | int):
|
|
@@ -88,6 +108,29 @@ def _get_tbl_type(data: FrameT | Any) -> str:
|
|
|
88
108
|
return "unknown" # pragma: no cover
|
|
89
109
|
|
|
90
110
|
|
|
111
|
+
def _is_narwhals_table(data: any) -> bool:
|
|
112
|
+
# Check if the data is a Narwhals DataFrame
|
|
113
|
+
type_str = str(type(data)).lower()
|
|
114
|
+
|
|
115
|
+
if "narwhals" in type_str:
|
|
116
|
+
# If the object is not a Narwhals DataFrame, return False
|
|
117
|
+
return True
|
|
118
|
+
|
|
119
|
+
return False
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _is_lazy_frame(data: any) -> bool:
|
|
123
|
+
# Check if the data is a Polars or Narwhals DataFrame
|
|
124
|
+
type_str = str(type(data)).lower()
|
|
125
|
+
|
|
126
|
+
if "polars" not in type_str and "narwhals" not in type_str:
|
|
127
|
+
# If the object is neither a Polars nor a Narwhals DataFrame, return False
|
|
128
|
+
return False
|
|
129
|
+
|
|
130
|
+
# Check if the data is a lazy frame
|
|
131
|
+
return "lazy" in type_str
|
|
132
|
+
|
|
133
|
+
|
|
91
134
|
def _is_lib_present(lib_name: str) -> bool:
|
|
92
135
|
import importlib
|
|
93
136
|
|
|
@@ -186,6 +229,77 @@ def _check_column_exists(dfn: nw.DataFrame, column: str) -> None:
|
|
|
186
229
|
raise ValueError(f"Column '{column}' not found in DataFrame.")
|
|
187
230
|
|
|
188
231
|
|
|
232
|
+
def _count_true_values_in_column(
    tbl: FrameT,
    column: str,
    inverse: bool = False,
) -> int:
    """
    Count the rows of a table where a column is `True` (or `False`).

    Parameters
    ----------
    tbl
        A Narwhals-compatible DataFrame or table-like object.
    column
        Name of the column to inspect.
    inverse
        When `True`, the negated column is used as the filter, so `False` values are counted.

    Returns
    -------
    int
        Number of rows that remain after filtering on the column (or on its negation).
    """

    # Wrapping with Narwhals is harmless if the table is already a Narwhals object
    frame = nw.from_native(tbl)

    # Build the filter predicate: the column itself, or its negation when `inverse` is set
    predicate = ~nw.col(column) if inverse else nw.col(column)
    matching = frame.filter(predicate)

    # A LazyFrame has to be collected before its rows can be counted
    if _is_lazy_frame(matching):
        matching = matching.collect()

    return len(matching)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _count_null_values_in_column(
    tbl: FrameT,
    column: str,
) -> int:
    """
    Count the rows of a table where a column is Null.

    Parameters
    ----------
    tbl
        A Narwhals-compatible DataFrame or table-like object.
    column
        Name of the column to inspect.

    Returns
    -------
    int
        Number of rows whose value in `column` is Null.
    """

    # Wrapping with Narwhals is harmless if the table is already a Narwhals object
    frame = nw.from_native(tbl)

    # Keep only the rows where the column is Null
    is_missing = nw.col(column).is_null()
    null_rows = frame.filter(is_missing)

    # A LazyFrame has to be collected before its rows can be counted
    if _is_lazy_frame(null_rows):
        null_rows = null_rows.collect()

    return len(null_rows)
|
|
301
|
+
|
|
302
|
+
|
|
189
303
|
def _is_numeric_dtype(dtype: str) -> bool:
|
|
190
304
|
"""
|
|
191
305
|
Check if a given data type string represents a numeric type.
|
|
@@ -533,6 +647,7 @@ def _get_api_text() -> str:
|
|
|
533
647
|
"missing_vals_tbl",
|
|
534
648
|
"assistant",
|
|
535
649
|
"load_dataset",
|
|
650
|
+
"get_data_path",
|
|
536
651
|
]
|
|
537
652
|
|
|
538
653
|
utility_exported = [
|
|
@@ -784,3 +899,14 @@ def _format_to_float_value(
|
|
|
784
899
|
formatted_vals = _get_column_of_values(gt, column_name="x", context="html")
|
|
785
900
|
|
|
786
901
|
return formatted_vals[0]
|
|
902
|
+
|
|
903
|
+
|
|
904
|
+
def _pivot_to_dict(col_dict: Mapping[str, Any]): # TODO : Type hint and unit test
|
|
905
|
+
result_dict = {}
|
|
906
|
+
for col, sub_dict in col_dict.items():
|
|
907
|
+
for key, value in sub_dict.items():
|
|
908
|
+
# add columns fields not present
|
|
909
|
+
if key not in result_dict:
|
|
910
|
+
result_dict[key] = [None] * len(col_dict)
|
|
911
|
+
result_dict[key][list(col_dict.keys()).index(col)] = value
|
|
912
|
+
return result_dict
|
pointblank/_utils_html.py
CHANGED
|
@@ -1,9 +1,49 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from great_tables import html
|
|
6
|
+
|
|
3
7
|
from pointblank._constants import TABLE_TYPE_STYLES
|
|
4
8
|
from pointblank._utils import _format_to_integer_value
|
|
5
9
|
|
|
6
10
|
|
|
11
|
+
def _fmt_frac(vec) -> list[str | None]:
|
|
12
|
+
res: list[str | None] = []
|
|
13
|
+
for x in vec:
|
|
14
|
+
if x is None:
|
|
15
|
+
res.append(x)
|
|
16
|
+
continue
|
|
17
|
+
|
|
18
|
+
if x == 0:
|
|
19
|
+
res.append("0")
|
|
20
|
+
continue
|
|
21
|
+
|
|
22
|
+
if x < 0.01:
|
|
23
|
+
res.append("<.01")
|
|
24
|
+
continue
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
intx: int = int(x)
|
|
28
|
+
except ValueError: # generic object, ie. NaN
|
|
29
|
+
res.append(str(x))
|
|
30
|
+
continue
|
|
31
|
+
|
|
32
|
+
if intx == x: # can remove trailing 0s w/o loss
|
|
33
|
+
res.append(str(intx))
|
|
34
|
+
continue
|
|
35
|
+
|
|
36
|
+
res.append(str(round(x, 2)))
|
|
37
|
+
|
|
38
|
+
return res
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _make_sublabel(major: str, minor: str) -> Any:
    """
    Build a label made of `major` followed by `minor` in a smaller, subscript-aligned span.

    The combined markup is passed through `html()` from Great Tables and that result is
    returned.
    """
    markup = f"{major!s}"
    markup += f'<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">{minor!s}</span>'
    return html(markup)
|
|
45
|
+
|
|
46
|
+
|
|
7
47
|
def _create_table_type_html(
|
|
8
48
|
tbl_type: str | None, tbl_name: str | None, font_size: str = "10px"
|
|
9
49
|
) -> str:
|
pointblank/assistant.py
CHANGED
|
@@ -176,9 +176,7 @@ def assistant(
|
|
|
176
176
|
if data is not None:
|
|
177
177
|
scan = DataScan(data=data)
|
|
178
178
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
tbl_type = scan_dict["tbl_type"]
|
|
179
|
+
tbl_type: str = scan.profile.implementation.name.lower()
|
|
182
180
|
tbl_json = scan.to_json()
|
|
183
181
|
|
|
184
182
|
if tbl_name is not None:
|
pointblank/compare.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from pointblank import DataScan
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from narwhals.typing import IntoFrame
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Compare:
    """
    Hold two frames and scan each of them with `DataScan`.

    NOTE(review): `compare()` currently only builds the two scans and reads their summaries;
    it does not yet produce or return a comparison result.

    Parameters
    ----------
    a
        The first frame.
    b
        The second frame.
    """

    def __init__(self, a: IntoFrame, b: IntoFrame) -> None:
        self.a: IntoFrame = a
        self.b: IntoFrame = b

    def compare(self) -> None:
        """Scan both frames and read the summary data of each scan."""
        ## Scan both frames
        self._scana = DataScan(self.a)
        self._scanb = DataScan(self.b)

        ## Get summary outs
        summarya = self._scana.summary_data
        # The second summary must come from the scan of `b`; it was previously read from the
        # scan of `a`
        summaryb = self._scanb.summary_data  # noqa: F841 -- TODO: use in the comparison

        # NOTE(review): these bare attribute reads are placeholders and have no visible effect
        # in this file; kept in case the attributes are computed on access -- confirm
        summarya.columns

        self._scana.profile
|