pointblank-0.9.5-py3-none-any.whl → pointblank-0.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +6 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +128 -0
- pointblank/_utils_html.py +40 -0
- pointblank/actions.py +3 -3
- pointblank/assistant.py +1 -3
- pointblank/column.py +4 -4
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +769 -138
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/thresholds.py +2 -2
- pointblank/validate.py +1594 -207
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/METADATA +6 -3
- pointblank-0.10.0.dist-info/RECORD +37 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/WHEEL +1 -1
- pointblank-0.9.5.dist-info/RECORD +0 -33
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/top_level.txt +0 -0
pointblank/__init__.py
CHANGED
|
@@ -30,8 +30,10 @@ from pointblank.thresholds import Actions, FinalActions, Thresholds
|
|
|
30
30
|
from pointblank.validate import (
|
|
31
31
|
Validate,
|
|
32
32
|
config,
|
|
33
|
+
connect_to_table,
|
|
33
34
|
get_action_metadata,
|
|
34
35
|
get_column_count,
|
|
36
|
+
get_data_path,
|
|
35
37
|
get_row_count,
|
|
36
38
|
get_validation_summary,
|
|
37
39
|
load_dataset,
|
|
@@ -60,7 +62,9 @@ __all__ = [
|
|
|
60
62
|
"first_n",
|
|
61
63
|
"last_n",
|
|
62
64
|
"load_dataset",
|
|
65
|
+
"get_data_path",
|
|
63
66
|
"config",
|
|
67
|
+
"connect_to_table",
|
|
64
68
|
"preview",
|
|
65
69
|
"missing_vals_tbl",
|
|
66
70
|
"get_action_metadata",
|
pointblank/_constants.py
CHANGED
|
@@ -105,10 +105,12 @@ ROW_BASED_VALIDATION_TYPES = [
|
|
|
105
105
|
"col_vals_regex",
|
|
106
106
|
"col_vals_null",
|
|
107
107
|
"col_vals_not_null",
|
|
108
|
+
"col_vals_expr",
|
|
108
109
|
"conjointly",
|
|
109
110
|
]
|
|
110
111
|
|
|
111
112
|
IBIS_BACKENDS = [
|
|
113
|
+
"bigquery",
|
|
112
114
|
"databricks",
|
|
113
115
|
"duckdb",
|
|
114
116
|
"memtable",
|
|
@@ -158,6 +160,9 @@ MODEL_PROVIDERS = [
|
|
|
158
160
|
TABLE_TYPE_STYLES = {
|
|
159
161
|
"pandas": {"background": "#150458", "text": "#FFFFFF", "label": "Pandas"},
|
|
160
162
|
"polars": {"background": "#0075FF", "text": "#FFFFFF", "label": "Polars"},
|
|
163
|
+
"polars-lazy": {"background": "#0075FF", "text": "#FFFFFF", "label": "Polars (LazyFrame)"},
|
|
164
|
+
"narwhals": {"background": "#78BEAF", "text": "#222222", "label": "Narwhals"},
|
|
165
|
+
"narwhals-lazy": {"background": "#78BEAF", "text": "#222222", "label": "Narwhals (LazyFrame)"},
|
|
161
166
|
"duckdb": {"background": "#000000", "text": "#FFFFFF", "label": "DuckDB"},
|
|
162
167
|
"mysql": {"background": "#EBAD40", "text": "#222222", "label": "MySQL"},
|
|
163
168
|
"postgres": {"background": "#3E638B", "text": "#FFFFFF", "label": "PostgreSQL"},
|
|
@@ -165,6 +170,7 @@ TABLE_TYPE_STYLES = {
|
|
|
165
170
|
"parquet": {"background": "#3F9FF9", "text": "#FFFFFF", "label": "Parquet"},
|
|
166
171
|
"memtable": {"background": "#2C3E50", "text": "#FFFFFF", "label": "Ibis memtable"},
|
|
167
172
|
"mssql": {"background": "#E2E2E2", "text": "#222222", "label": "MSSQL"},
|
|
173
|
+
"bigquery": {"background": "#4285F4", "text": "#FFFFFF", "label": "BigQuery"},
|
|
168
174
|
"pyspark": {"background": "#E66F21", "text": "#FFFFFF", "label": "Spark DataFrame"},
|
|
169
175
|
"databricks": {"background": "#FF3621", "text": "#FFFFFF", "label": "Databricks"},
|
|
170
176
|
}
|
pointblank/_datascan_utils.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from math import floor, log10
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from great_tables.vals import fmt_integer, fmt_number, fmt_scientific
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
pass
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _round_to_sig_figs(value: float, sig_figs: int) -> float:
|
|
13
|
+
if value == 0:
|
|
14
|
+
return 0
|
|
15
|
+
return round(value, sig_figs - int(floor(log10(abs(value)))) - 1)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _compact_integer_fmt(value: float | int) -> str:
|
|
19
|
+
if value == 0:
|
|
20
|
+
formatted = "0"
|
|
21
|
+
elif abs(value) >= 1 and abs(value) < 10_000:
|
|
22
|
+
formatted = fmt_integer(value, use_seps=False)[0]
|
|
23
|
+
else:
|
|
24
|
+
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
|
|
25
|
+
|
|
26
|
+
return formatted
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _compact_decimal_fmt(value: float | int) -> str:
|
|
30
|
+
if value == 0:
|
|
31
|
+
formatted = "0.00"
|
|
32
|
+
elif abs(value) < 1 and abs(value) >= 0.01:
|
|
33
|
+
formatted = fmt_number(value, decimals=2)[0]
|
|
34
|
+
elif abs(value) < 0.01:
|
|
35
|
+
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
|
|
36
|
+
elif abs(value) >= 1 and abs(value) < 1000:
|
|
37
|
+
formatted = fmt_number(value, n_sigfig=3)[0]
|
|
38
|
+
elif abs(value) >= 1000 and abs(value) < 10_000:
|
|
39
|
+
formatted = fmt_number(value, decimals=0, use_seps=False)[0]
|
|
40
|
+
else:
|
|
41
|
+
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
|
|
42
|
+
|
|
43
|
+
return formatted
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _compact_0_1_fmt(value: float | int | None) -> str | None:
|
|
47
|
+
if value is None:
|
|
48
|
+
return value
|
|
49
|
+
|
|
50
|
+
if value == 0:
|
|
51
|
+
return " 0.00"
|
|
52
|
+
|
|
53
|
+
if value == 1:
|
|
54
|
+
return " 1.00"
|
|
55
|
+
|
|
56
|
+
if abs(value) < 1 and abs(value) >= 0.01:
|
|
57
|
+
return " " + fmt_number(value, decimals=2)[0]
|
|
58
|
+
|
|
59
|
+
if abs(value) < 0.01:
|
|
60
|
+
return "<0.01"
|
|
61
|
+
|
|
62
|
+
if abs(value) > 0.99:
|
|
63
|
+
return ">0.99"
|
|
64
|
+
|
|
65
|
+
return fmt_number(value, n_sigfig=3)[0]
|
pointblank/_utils.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import inspect
|
|
4
4
|
import re
|
|
5
|
+
from collections import defaultdict
|
|
5
6
|
from typing import TYPE_CHECKING, Any
|
|
6
7
|
|
|
7
8
|
import narwhals as nw
|
|
@@ -12,9 +13,28 @@ from narwhals.typing import FrameT
|
|
|
12
13
|
from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES
|
|
13
14
|
|
|
14
15
|
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Mapping
|
|
17
|
+
|
|
15
18
|
from pointblank._typing import AbsoluteBounds, Tolerance
|
|
16
19
|
|
|
17
20
|
|
|
21
|
+
def transpose_dicts(list_of_dicts: list[dict[str, Any]]) -> dict[str, list[Any]]:
|
|
22
|
+
if not list_of_dicts:
|
|
23
|
+
return {}
|
|
24
|
+
|
|
25
|
+
# Get all unique keys across all dictionaries
|
|
26
|
+
all_keys = set()
|
|
27
|
+
for d in list_of_dicts:
|
|
28
|
+
all_keys.update(d.keys())
|
|
29
|
+
|
|
30
|
+
result = defaultdict(list)
|
|
31
|
+
for d in list_of_dicts:
|
|
32
|
+
for key in all_keys:
|
|
33
|
+
result[key].append(d.get(key)) # None is default for missing keys
|
|
34
|
+
|
|
35
|
+
return dict(result)
|
|
36
|
+
|
|
37
|
+
|
|
18
38
|
def _derive_single_bound(ref: int, tol: int | float) -> int:
|
|
19
39
|
"""Derive a single bound using the reference."""
|
|
20
40
|
if not isinstance(tol, float | int):
|
|
@@ -88,6 +108,29 @@ def _get_tbl_type(data: FrameT | Any) -> str:
|
|
|
88
108
|
return "unknown" # pragma: no cover
|
|
89
109
|
|
|
90
110
|
|
|
111
|
+
def _is_narwhals_table(data: any) -> bool:
|
|
112
|
+
# Check if the data is a Narwhals DataFrame
|
|
113
|
+
type_str = str(type(data)).lower()
|
|
114
|
+
|
|
115
|
+
if "narwhals" in type_str:
|
|
116
|
+
# If the object is not a Narwhals DataFrame, return False
|
|
117
|
+
return True
|
|
118
|
+
|
|
119
|
+
return False
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _is_lazy_frame(data: any) -> bool:
|
|
123
|
+
# Check if the data is a Polars or Narwhals DataFrame
|
|
124
|
+
type_str = str(type(data)).lower()
|
|
125
|
+
|
|
126
|
+
if "polars" not in type_str and "narwhals" not in type_str:
|
|
127
|
+
# If the object is neither a Polars nor a Narwhals DataFrame, return False
|
|
128
|
+
return False
|
|
129
|
+
|
|
130
|
+
# Check if the data is a lazy frame
|
|
131
|
+
return "lazy" in type_str
|
|
132
|
+
|
|
133
|
+
|
|
91
134
|
def _is_lib_present(lib_name: str) -> bool:
|
|
92
135
|
import importlib
|
|
93
136
|
|
|
@@ -186,6 +229,77 @@ def _check_column_exists(dfn: nw.DataFrame, column: str) -> None:
|
|
|
186
229
|
raise ValueError(f"Column '{column}' not found in DataFrame.")
|
|
187
230
|
|
|
188
231
|
|
|
232
|
+
def _count_true_values_in_column(
|
|
233
|
+
tbl: FrameT,
|
|
234
|
+
column: str,
|
|
235
|
+
inverse: bool = False,
|
|
236
|
+
) -> int:
|
|
237
|
+
"""
|
|
238
|
+
Count the number of `True` values in a specified column of a table.
|
|
239
|
+
|
|
240
|
+
Parameters
|
|
241
|
+
----------
|
|
242
|
+
tbl
|
|
243
|
+
A Narwhals-compatible DataFrame or table-like object.
|
|
244
|
+
column
|
|
245
|
+
The column in which to count the `True` values.
|
|
246
|
+
inverse
|
|
247
|
+
If `True`, count the number of `False` values instead.
|
|
248
|
+
|
|
249
|
+
Returns
|
|
250
|
+
-------
|
|
251
|
+
int
|
|
252
|
+
The count of `True` (or `False`) values in the specified column.
|
|
253
|
+
"""
|
|
254
|
+
|
|
255
|
+
# Convert the DataFrame to a Narwhals DataFrame (no detrimental effect if
|
|
256
|
+
# already a Narwhals DataFrame)
|
|
257
|
+
tbl_nw = nw.from_native(tbl)
|
|
258
|
+
|
|
259
|
+
# Filter the table based on the column and whether we want to count True or False values
|
|
260
|
+
tbl_filtered = tbl_nw.filter(nw.col(column) if not inverse else ~nw.col(column))
|
|
261
|
+
|
|
262
|
+
# Always collect table if it is a LazyFrame; this is required to get the row count
|
|
263
|
+
if _is_lazy_frame(tbl_filtered):
|
|
264
|
+
tbl_filtered = tbl_filtered.collect()
|
|
265
|
+
|
|
266
|
+
return len(tbl_filtered)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _count_null_values_in_column(
|
|
270
|
+
tbl: FrameT,
|
|
271
|
+
column: str,
|
|
272
|
+
) -> int:
|
|
273
|
+
"""
|
|
274
|
+
Count the number of Null values in a specified column of a table.
|
|
275
|
+
|
|
276
|
+
Parameters
|
|
277
|
+
----------
|
|
278
|
+
tbl
|
|
279
|
+
A Narwhals-compatible DataFrame or table-like object.
|
|
280
|
+
column
|
|
281
|
+
The column in which to count the Null values.
|
|
282
|
+
|
|
283
|
+
Returns
|
|
284
|
+
-------
|
|
285
|
+
int
|
|
286
|
+
The count of Null values in the specified column.
|
|
287
|
+
"""
|
|
288
|
+
|
|
289
|
+
# Convert the DataFrame to a Narwhals DataFrame (no detrimental effect if
|
|
290
|
+
# already a Narwhals DataFrame)
|
|
291
|
+
tbl_nw = nw.from_native(tbl)
|
|
292
|
+
|
|
293
|
+
# Filter the table to get rows where the specified column is Null
|
|
294
|
+
tbl_filtered = tbl_nw.filter(nw.col(column).is_null())
|
|
295
|
+
|
|
296
|
+
# Always collect table if it is a LazyFrame; this is required to get the row count
|
|
297
|
+
if _is_lazy_frame(tbl_filtered):
|
|
298
|
+
tbl_filtered = tbl_filtered.collect()
|
|
299
|
+
|
|
300
|
+
return len(tbl_filtered)
|
|
301
|
+
|
|
302
|
+
|
|
189
303
|
def _is_numeric_dtype(dtype: str) -> bool:
|
|
190
304
|
"""
|
|
191
305
|
Check if a given data type string represents a numeric type.
|
|
@@ -514,6 +628,8 @@ def _get_api_text() -> str:
|
|
|
514
628
|
"Validate.get_data_extracts",
|
|
515
629
|
"Validate.all_passed",
|
|
516
630
|
"Validate.assert_passing",
|
|
631
|
+
"Validate.assert_below_threshold",
|
|
632
|
+
"Validate.above_threshold",
|
|
517
633
|
"Validate.n",
|
|
518
634
|
"Validate.n_passed",
|
|
519
635
|
"Validate.n_failed",
|
|
@@ -531,6 +647,7 @@ def _get_api_text() -> str:
|
|
|
531
647
|
"missing_vals_tbl",
|
|
532
648
|
"assistant",
|
|
533
649
|
"load_dataset",
|
|
650
|
+
"get_data_path",
|
|
534
651
|
]
|
|
535
652
|
|
|
536
653
|
utility_exported = [
|
|
@@ -782,3 +899,14 @@ def _format_to_float_value(
|
|
|
782
899
|
formatted_vals = _get_column_of_values(gt, column_name="x", context="html")
|
|
783
900
|
|
|
784
901
|
return formatted_vals[0]
|
|
902
|
+
|
|
903
|
+
|
|
904
|
+
def _pivot_to_dict(col_dict: Mapping[str, Any]): # TODO : Type hint and unit test
|
|
905
|
+
result_dict = {}
|
|
906
|
+
for col, sub_dict in col_dict.items():
|
|
907
|
+
for key, value in sub_dict.items():
|
|
908
|
+
# add columns fields not present
|
|
909
|
+
if key not in result_dict:
|
|
910
|
+
result_dict[key] = [None] * len(col_dict)
|
|
911
|
+
result_dict[key][list(col_dict.keys()).index(col)] = value
|
|
912
|
+
return result_dict
|
pointblank/_utils_html.py
CHANGED
|
@@ -1,9 +1,49 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from great_tables import html
|
|
6
|
+
|
|
3
7
|
from pointblank._constants import TABLE_TYPE_STYLES
|
|
4
8
|
from pointblank._utils import _format_to_integer_value
|
|
5
9
|
|
|
6
10
|
|
|
11
|
+
def _fmt_frac(vec) -> list[str | None]:
|
|
12
|
+
res: list[str | None] = []
|
|
13
|
+
for x in vec:
|
|
14
|
+
if x is None:
|
|
15
|
+
res.append(x)
|
|
16
|
+
continue
|
|
17
|
+
|
|
18
|
+
if x == 0:
|
|
19
|
+
res.append("0")
|
|
20
|
+
continue
|
|
21
|
+
|
|
22
|
+
if x < 0.01:
|
|
23
|
+
res.append("<.01")
|
|
24
|
+
continue
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
intx: int = int(x)
|
|
28
|
+
except ValueError: # generic object, ie. NaN
|
|
29
|
+
res.append(str(x))
|
|
30
|
+
continue
|
|
31
|
+
|
|
32
|
+
if intx == x: # can remove trailing 0s w/o loss
|
|
33
|
+
res.append(str(intx))
|
|
34
|
+
continue
|
|
35
|
+
|
|
36
|
+
res.append(str(round(x, 2)))
|
|
37
|
+
|
|
38
|
+
return res
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _make_sublabel(major: str, minor: str) -> Any:
|
|
42
|
+
return html(
|
|
43
|
+
f'{major!s}<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">{minor!s}</span>'
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
|
|
7
47
|
def _create_table_type_html(
|
|
8
48
|
tbl_type: str | None, tbl_name: str | None, font_size: str = "10px"
|
|
9
49
|
) -> str:
|
pointblank/actions.py
CHANGED
|
@@ -216,7 +216,7 @@ def send_slack_notification(
|
|
|
216
216
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
217
217
|
actions=pb.Actions(critical=notify_slack),
|
|
218
218
|
)
|
|
219
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
|
|
219
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
220
220
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
221
221
|
.col_vals_gt(columns="session_duration", value=15)
|
|
222
222
|
.interrogate()
|
|
@@ -248,7 +248,7 @@ def send_slack_notification(
|
|
|
248
248
|
thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
|
|
249
249
|
final_actions=pb.FinalActions(notify_slack),
|
|
250
250
|
)
|
|
251
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
|
|
251
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
252
252
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
253
253
|
.col_vals_gt(columns="session_duration", value=15)
|
|
254
254
|
.interrogate()
|
|
@@ -316,7 +316,7 @@ def send_slack_notification(
|
|
|
316
316
|
actions=pb.Actions(default=notify_slack),
|
|
317
317
|
final_actions=pb.FinalActions(notify_slack),
|
|
318
318
|
)
|
|
319
|
-
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}")
|
|
319
|
+
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
320
320
|
.col_vals_gt(columns="item_revenue", value=0.05)
|
|
321
321
|
.col_vals_gt(columns="session_duration", value=15)
|
|
322
322
|
.interrogate()
|
pointblank/assistant.py
CHANGED
|
@@ -176,9 +176,7 @@ def assistant(
|
|
|
176
176
|
if data is not None:
|
|
177
177
|
scan = DataScan(data=data)
|
|
178
178
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
tbl_type = scan_dict["tbl_type"]
|
|
179
|
+
tbl_type: str = scan.profile.implementation.name.lower()
|
|
182
180
|
tbl_json = scan.to_json()
|
|
183
181
|
|
|
184
182
|
if tbl_name is not None:
|
pointblank/column.py
CHANGED
|
@@ -1007,7 +1007,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1007
1007
|
`[rev_01, rev_02, profit_01, profit_02, age]`
|
|
1008
1008
|
|
|
1009
1009
|
and you want to validate columns that have two digits at the end of the name, you can use
|
|
1010
|
-
`columns=matches(r"\d{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
|
|
1010
|
+
`columns=matches(r"[0-9]{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
|
|
1011
1011
|
`profit_02` columns.
|
|
1012
1012
|
|
|
1013
1013
|
There will be a validation step created for every resolved column. Note that if there aren't any
|
|
@@ -1061,7 +1061,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1061
1061
|
[`col()`](`pointblank.col`) function, like this:
|
|
1062
1062
|
|
|
1063
1063
|
```python
|
|
1064
|
-
col(matches(r"^\d{5}") & ends_with("_id"))
|
|
1064
|
+
col(matches(r"^[0-9]{5}") & ends_with("_id"))
|
|
1065
1065
|
```
|
|
1066
1066
|
|
|
1067
1067
|
There are four operators that can be used to compose column selectors:
|
|
@@ -1107,7 +1107,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1107
1107
|
|
|
1108
1108
|
validation = (
|
|
1109
1109
|
pb.Validate(data=tbl)
|
|
1110
|
-
.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID\d{4}")
|
|
1110
|
+
.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID[0-9]{4}")
|
|
1111
1111
|
.interrogate()
|
|
1112
1112
|
)
|
|
1113
1113
|
|
|
@@ -1115,7 +1115,7 @@ def matches(pattern: str, case_sensitive: bool = False) -> Matches:
|
|
|
1115
1115
|
```
|
|
1116
1116
|
|
|
1117
1117
|
From the results of the validation table we get two validation steps, one for `id_old` and one
|
|
1118
|
-
for `new_identifier`. The values in both columns all match the pattern `"ID\d{4}"`.
|
|
1118
|
+
for `new_identifier`. The values in both columns all match the pattern `"ID[0-9]{4}"`.
|
|
1119
1119
|
|
|
1120
1120
|
We can also use the `matches()` function in combination with other column selectors (within
|
|
1121
1121
|
[`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select
|
pointblank/compare.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from pointblank import DataScan
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from narwhals.typing import IntoFrame
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Compare:
|
|
12
|
+
def __init__(self, a: IntoFrame, b: IntoFrame) -> None:
|
|
13
|
+
self.a: IntoFrame = a
|
|
14
|
+
self.b: IntoFrame = b
|
|
15
|
+
|
|
16
|
+
def compare(self) -> None:
|
|
17
|
+
## Scan both frames
|
|
18
|
+
self._scana = DataScan(self.a)
|
|
19
|
+
self._scanb = DataScan(self.b)
|
|
20
|
+
|
|
21
|
+
## Get summary outs
|
|
22
|
+
summarya = self._scana.summary_data
|
|
23
|
+
summaryb = self._scana.summary_data
|
|
24
|
+
|
|
25
|
+
summarya.columns
|
|
26
|
+
|
|
27
|
+
self._scana.profile
|