pointblank 0.11.5__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +0 -1
- pointblank/_interrogation.py +181 -38
- pointblank/_utils.py +29 -2
- pointblank/assistant.py +9 -0
- pointblank/cli.py +39 -24
- pointblank/data/api-docs.txt +658 -29
- pointblank/schema.py +17 -0
- pointblank/segments.py +163 -0
- pointblank/validate.py +320 -57
- pointblank/yaml.py +162 -19
- {pointblank-0.11.5.dist-info → pointblank-0.12.0.dist-info}/METADATA +58 -5
- {pointblank-0.11.5.dist-info → pointblank-0.12.0.dist-info}/RECORD +17 -16
- {pointblank-0.11.5.dist-info → pointblank-0.12.0.dist-info}/WHEEL +0 -0
- {pointblank-0.11.5.dist-info → pointblank-0.12.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.5.dist-info → pointblank-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.5.dist-info → pointblank-0.12.0.dist-info}/top_level.txt +0 -0
pointblank/__init__.py
CHANGED
@@ -25,6 +25,7 @@ from pointblank.column import (
 from pointblank.datascan import DataScan, col_summary_tbl
 from pointblank.draft import DraftValidation
 from pointblank.schema import Schema
+from pointblank.segments import seg_group
 from pointblank.tf import TF
 from pointblank.thresholds import Actions, FinalActions, Thresholds
 from pointblank.validate import (
@@ -76,6 +77,7 @@ __all__ = [
     "get_validation_summary",
     "get_column_count",
     "get_row_count",
+    "seg_group",
     "send_slack_notification",
     # YAML functionality
     "yaml_interrogate",
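The newly exported `seg_group()` helper is the user-facing piece of the `segments.py` additions. A hedged usage sketch (the tuple form of `segments=` and the `small_table` dataset come from the existing API, not from this diff):

```python
import pointblank as pb

# Hypothetical usage: treat the values "low" and "mid" of column "f" as one
# combined segment when interrogating; seg_group()'s exact signature is assumed.
validation = (
    pb.Validate(data=pb.load_dataset("small_table"))
    .col_vals_gt(
        columns="d",
        value=100,
        segments=("f", pb.seg_group(["low", "mid"])),
    )
    .interrogate()
)
```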
pointblank/_constants.py
CHANGED
pointblank/_interrogation.py
CHANGED
@@ -23,6 +23,74 @@ if TYPE_CHECKING:
     from pointblank._typing import AbsoluteTolBounds


+def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
+    """
+    Safely modify datetime comparison values for LazyFrame compatibility.
+
+    This function handles the case where we can't directly slice LazyFrames
+    to get column dtypes for datetime conversion.
+    """
+    try:
+        # First try to get column dtype from schema for LazyFrames
+        column_dtype = None
+
+        if hasattr(data_frame, "collect_schema"):
+            schema = data_frame.collect_schema()
+            column_dtype = schema.get(column)
+        elif hasattr(data_frame, "schema"):
+            schema = data_frame.schema
+            column_dtype = schema.get(column)
+
+        # If we got a dtype from schema, use it
+        if column_dtype is not None:
+            # Create a mock column object for _modify_datetime_compare_val
+            class MockColumn:
+                def __init__(self, dtype):
+                    self.dtype = dtype
+
+            mock_column = MockColumn(column_dtype)
+            return _modify_datetime_compare_val(tgt_column=mock_column, compare_val=compare_val)
+
+        # Fallback: try collecting a small sample if possible
+        try:
+            sample = data_frame.head(1).collect()
+            if hasattr(sample, "dtypes") and column in sample.columns:
+                # For pandas-like dtypes
+                column_dtype = sample.dtypes[column] if hasattr(sample, "dtypes") else None
+                if column_dtype:
+
+                    class MockColumn:
+                        def __init__(self, dtype):
+                            self.dtype = dtype
+
+                    mock_column = MockColumn(column_dtype)
+                    return _modify_datetime_compare_val(
+                        tgt_column=mock_column, compare_val=compare_val
+                    )
+        except Exception:
+            pass
+
+        # Final fallback: try direct access (for eager DataFrames)
+        try:
+            if hasattr(data_frame, "dtypes") and column in data_frame.columns:
+                column_dtype = data_frame.dtypes[column]
+
+                class MockColumn:
+                    def __init__(self, dtype):
+                        self.dtype = dtype
+
+                mock_column = MockColumn(column_dtype)
+                return _modify_datetime_compare_val(tgt_column=mock_column, compare_val=compare_val)
+        except Exception:
+            pass
+
+    except Exception:
+        pass
+
+    # If all else fails, return the original compare_val
+    return compare_val
+
+
 @dataclass
 class Interrogator:
     """
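The helper above reads dtypes from the frame's schema because LazyFrames cannot be sliced like eager DataFrames. A minimal Polars illustration of that constraint (not package code):

```python
import polars as pl

lf = pl.LazyFrame({"ts": ["2024-01-01", "2024-06-15"]}).with_columns(
    pl.col("ts").str.to_date()
)

# lf["ts"] raises a TypeError on a LazyFrame, so the dtype must come from the
# schema; no rows are materialized for this lookup.
print(lf.collect_schema().get("ts"))  # Date
```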
@@ -136,9 +204,7 @@ class Interrogator:

         compare_expr = _get_compare_expr_nw(compare=self.compare)

-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         return (
             self.x.with_columns(
@@ -211,9 +277,7 @@ class Interrogator:

         compare_expr = _get_compare_expr_nw(compare=self.compare)

-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         return (
             self.x.with_columns(
@@ -329,9 +393,7 @@ class Interrogator:
         else:
             compare_expr = _get_compare_expr_nw(compare=self.compare)

-            compare_expr = _modify_datetime_compare_val(
-                tgt_column=self.x[self.column], compare_val=compare_expr
-            )
+            compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

             tbl = self.x.with_columns(
                 pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
@@ -421,9 +483,7 @@ class Interrogator:
             ).to_native()

         else:
-            compare_expr = _modify_datetime_compare_val(
-                tgt_column=self.x[self.column], compare_val=self.compare
-            )
+            compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)

             return self.x.with_columns(
                 pb_is_good_=nw.col(self.column) != nw.lit(compare_expr),
@@ -544,9 +604,7 @@ class Interrogator:
         if ref_col_has_null_vals:
             # Create individual cases for Pandas and Polars

-            compare_expr = _modify_datetime_compare_val(
-                tgt_column=self.x[self.column], compare_val=self.compare
-            )
+            compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)

             if is_pandas_dataframe(self.x.to_native()):
                 tbl = self.x.with_columns(
@@ -584,6 +642,25 @@ class Interrogator:

                 return tbl

+            else:
+                # Generic case for other DataFrame types (PySpark, etc.)
+                # Use similar logic to Polars but handle potential differences
+                tbl = self.x.with_columns(
+                    pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
+                    pb_is_good_2=nw.lit(self.na_pass),  # Pass if any Null in val or compare
+                )
+
+                tbl = tbl.with_columns(pb_is_good_3=nw.col(self.column) != nw.lit(compare_expr))
+
+                tbl = tbl.with_columns(
+                    pb_is_good_=(
+                        (nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
+                        | (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
+                    )
+                )
+
+                return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
+
     def ge(self) -> FrameT | Any:
         # Ibis backends ---------------------------------------------

@@ -629,9 +706,7 @@ class Interrogator:

         compare_expr = _get_compare_expr_nw(compare=self.compare)

-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         tbl = (
             self.x.with_columns(
@@ -702,9 +777,7 @@ class Interrogator:

         compare_expr = _get_compare_expr_nw(compare=self.compare)

-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         return (
             self.x.with_columns(
@@ -834,10 +907,8 @@ class Interrogator:
         low_val = _get_compare_expr_nw(compare=self.low)
         high_val = _get_compare_expr_nw(compare=self.high)

-        low_val =
-        high_val = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=high_val
-        )
+        low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
+        high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)

         tbl = self.x.with_columns(
             pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
@@ -1026,10 +1097,8 @@ class Interrogator:
         low_val = _get_compare_expr_nw(compare=self.low)
         high_val = _get_compare_expr_nw(compare=self.high)

-        low_val =
-        high_val = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=high_val
-        )
+        low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
+        high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)

         tbl = self.x.with_columns(
             pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
@@ -1209,14 +1278,15 @@ class Interrogator:
         else:
             columns_subset = self.columns_subset

-            # Create a
-
+            # Create a count of duplicates using group_by approach like Ibis backend
+            # Group by the columns of interest and count occurrences
+            count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))

-            #
-
+            # Join back to original table to get count for each row
+            tbl = tbl.join(count_tbl, on=columns_subset, how="left")

-            #
-            tbl = tbl.with_columns(pb_is_good_
+            # Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
+            tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")

         return tbl.to_native()

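The duplicate check above now counts group sizes and joins them back onto the rows. A standalone narwhals/Polars sketch of the same pattern, with made-up data:

```python
import narwhals as nw
import polars as pl

tbl = nw.from_native(pl.DataFrame({"x": [1, 1, 2], "y": ["a", "a", "b"]}))
columns_subset = ["x", "y"]

# Count occurrences per group, join the counts back, and flag rows whose group
# count is exactly 1 as passing (i.e., not duplicated).
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
tbl = tbl.join(count_tbl, on=columns_subset, how="left")
tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
print(tbl.to_native())
```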
@@ -2088,6 +2158,8 @@ class ConjointlyValidation:
             return self._get_pandas_results()
         elif "duckdb" in self.tbl_type or "ibis" in self.tbl_type:
             return self._get_ibis_results()
+        elif "pyspark" in self.tbl_type:
+            return self._get_pyspark_results()
         else:  # pragma: no cover
             raise NotImplementedError(f"Support for {self.tbl_type} is not yet implemented")

@@ -2247,6 +2319,53 @@ class ConjointlyValidation:
         results_tbl = self.data_tbl.mutate(pb_is_good_=ibis.literal(True))
         return results_tbl

+    def _get_pyspark_results(self):
+        """Process expressions for PySpark DataFrames."""
+        from pyspark.sql import functions as F
+
+        pyspark_columns = []
+
+        for expr_fn in self.expressions:
+            try:
+                # First try direct evaluation with PySpark DataFrame
+                expr_result = expr_fn(self.data_tbl)
+
+                # Check if it's a PySpark Column
+                if hasattr(expr_result, "_jc"):  # PySpark Column has _jc attribute
+                    pyspark_columns.append(expr_result)
+                else:
+                    raise TypeError(
+                        f"Expression returned {type(expr_result)}, expected PySpark Column"
+                    )
+
+            except Exception as e:
+                try:
+                    # Try as a ColumnExpression (for pb.expr_col style)
+                    col_expr = expr_fn(None)
+
+                    if hasattr(col_expr, "to_pyspark_expr"):
+                        # Convert to PySpark expression
+                        pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
+                        pyspark_columns.append(pyspark_expr)
+                    else:
+                        raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
+                except Exception as nested_e:
+                    print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
+
+        # Combine results with AND logic
+        if pyspark_columns:
+            final_result = pyspark_columns[0]
+            for col in pyspark_columns[1:]:
+                final_result = final_result & col
+
+            # Create results table with boolean column
+            results_tbl = self.data_tbl.withColumn("pb_is_good_", final_result)
+            return results_tbl
+
+        # Default case
+        results_tbl = self.data_tbl.withColumn("pb_is_good_", F.lit(True))
+        return results_tbl
+

 class SpeciallyValidation:
     def __init__(self, data_tbl, expression, threshold, tbl_type):
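With this branch, `conjointly()` expressions can resolve to native PySpark `Column` objects or to `pb.expr_col()` expressions that get converted. A hedged end-to-end sketch (the DataFrame construction and column names are invented; whether every PySpark path works beyond what this diff shows is not confirmed here):

```python
import pointblank as pb
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame([(1, 10), (2, 20)], ["a", "b"])

# Each callable must yield a boolean expression; the PySpark branch combines
# them with AND logic into a single pb_is_good_ column.
validation = (
    pb.Validate(data=spark_df)
    .conjointly(
        lambda df: pb.expr_col("a") > 0,
        lambda df: pb.expr_col("b") < 100,
    )
    .interrogate()
)
```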
@@ -2359,13 +2478,22 @@ class NumberOfTestUnits:
     column: str

     def get_test_units(self, tbl_type: str) -> int:
-        if tbl_type == "pandas" or tbl_type == "polars" or tbl_type == "local":
+        if (
+            tbl_type == "pandas"
+            or tbl_type == "polars"
+            or tbl_type == "pyspark"
+            or tbl_type == "local"
+        ):
             # Convert the DataFrame to a format that narwhals can work with and:
             # - check if the column exists
             dfn = _column_test_prep(
                 df=self.df, column=self.column, allowed_types=None, check_exists=False
             )

+            # Handle LazyFrames which don't have len()
+            if hasattr(dfn, "collect"):
+                dfn = dfn.collect()
+
             return len(dfn)

         if tbl_type in IBIS_BACKENDS:
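The added `collect()` step exists because lazy frames have no length. A small Polars illustration (not package code):

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3]})

# len(lf) raises a TypeError because a LazyFrame has no length until it is
# collected into an eager DataFrame.
print(len(lf.collect()))  # 3
```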
@@ -2383,7 +2511,22 @@ def _get_compare_expr_nw(compare: Any) -> Any:


 def _column_has_null_values(table: FrameT, column: str) -> bool:
-    null_count = (table.select(column).null_count())[column][0]
+    try:
+        # Try the standard null_count() method
+        null_count = (table.select(column).null_count())[column][0]
+    except AttributeError:
+        # For LazyFrames, collect first then get null count
+        try:
+            collected = table.select(column).collect()
+            null_count = (collected.null_count())[column][0]
+        except Exception:
+            # Fallback: check if any values are null
+            try:
+                result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
+                null_count = result["null_count"][0]
+            except Exception:
+                # Last resort: return False (assume no nulls)
+                return False

     if null_count is None or null_count == 0:
         return False
@@ -2414,7 +2557,7 @@ def _check_nulls_across_columns_nw(table, columns_subset):

     # Build the expression by combining each column's `is_null()` with OR operations
     null_expr = functools.reduce(
-        lambda acc, col: acc |
+        lambda acc, col: acc | nw.col(col).is_null() if acc is not None else nw.col(col).is_null(),
         column_names,
         None,
     )
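The OR-fold above can be reproduced outside the package; a self-contained narwhals/Polars sketch with made-up data:

```python
import functools

import narwhals as nw
import polars as pl

df = nw.from_native(pl.DataFrame({"a": [1, None, 3], "b": [None, 2, 3]}))

# Fold each column's is_null() into one expression with OR, starting from None.
null_expr = functools.reduce(
    lambda acc, col: acc | nw.col(col).is_null() if acc is not None else nw.col(col).is_null(),
    df.columns,
    None,
)

print(df.with_columns(any_null=null_expr).to_native())
```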
pointblank/_utils.py
CHANGED
@@ -66,11 +66,13 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     except Exception as e:
         raise TypeError("The `data` object is not a DataFrame or Ibis Table.") from e

-    # Detect through regex if the table is a polars or pandas DataFrame
+    # Detect through regex if the table is a polars, pandas, or Spark DataFrame
     if re.search(r"polars", df_ns_str, re.IGNORECASE):
         return "polars"
     elif re.search(r"pandas", df_ns_str, re.IGNORECASE):
         return "pandas"
+    elif re.search(r"pyspark", df_ns_str, re.IGNORECASE):
+        return "pyspark"

     # If ibis is present, then get the table's backend name
     ibis_present = _is_lib_present(lib_name="ibis")
@@ -164,7 +166,7 @@ def _check_any_df_lib(method_used: str) -> None:
 def _is_value_a_df(value: Any) -> bool:
     try:
         ns = nw.get_native_namespace(value)
-        if "polars" in str(ns) or "pandas" in str(ns):
+        if "polars" in str(ns) or "pandas" in str(ns) or "pyspark" in str(ns):
             return True
         else:  # pragma: no cover
             return False
@@ -619,6 +621,10 @@ def _get_api_text() -> str:
         "expr_col",
     ]

+    segments_exported = [
+        "seg_group",
+    ]
+
     interrogation_exported = [
         "Validate.interrogate",
         "Validate.get_tabular_report",
@@ -648,6 +654,12 @@ def _get_api_text() -> str:
         "assistant",
         "load_dataset",
         "get_data_path",
+        "connect_to_table",
+    ]
+
+    yaml_exported = [
+        "yaml_interrogate",
+        "validate_yaml",
     ]

     utility_exported = [
@@ -679,6 +691,10 @@ many steps). Furthermore, the `col()` function can be used to declare a comparis
 for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
 for comparison."""

+    segments_desc = (
+        """Combine multiple values into a single segment using `seg_*()` helper functions."""
+    )
+
     interrogation_desc = """The validation plan is put into action when `interrogate()` is called.
 The workflow for performing a comprehensive validation is then: (1) `Validate()`, (2) adding
 validation steps, (3) `interrogate()`. After interrogation of the data, we can view a validation
@@ -694,6 +710,11 @@ datasets included in the package can be accessed via the `load_dataset()` functi
 `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
 the `assistant()` function to get help with Pointblank."""

+    yaml_desc = """The *YAML* group contains functions that allow for the use of YAML to orchestrate
+validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow from
+YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
+passes its own validity checks."""
+
     utility_desc = """The Utility Functions group contains functions that are useful for accessing
 metadata about the target data. Use `get_column_count()` or `get_row_count()` to get the number of
 columns or rows in a table. The `get_action_metadata()` function is useful when building custom
@@ -718,12 +739,18 @@ table information, and timing details."""
     api_text += f"""\n## The Column Selection family\n\n{column_selection_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=column_selection_exported)

+    api_text += f"""\n## The Segments family\n\n{segments_desc}\n\n"""
+    api_text += get_api_details(module=pointblank, exported_list=segments_exported)
+
     api_text += f"""\n## The Interrogation and Reporting family\n\n{interrogation_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=interrogation_exported)

     api_text += f"""\n## The Inspection and Assistance family\n\n{inspect_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=inspect_exported)

+    api_text += f"""\n## The YAML family\n\n{yaml_desc}\n\n"""
+    api_text += get_api_details(module=pointblank, exported_list=yaml_exported)
+
     api_text += f"""\n## The Utility Functions family\n\n{utility_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=utility_exported)

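The new YAML family centers on `yaml_interrogate()` and `validate_yaml()`. A hedged sketch of the workflow; the key names in the YAML document (`tbl`, `steps`) are assumptions about the configuration schema, which this diff does not spell out:

```python
import pointblank as pb

# Hypothetical YAML configuration; the key names are assumed, not taken from
# this diff. A file path can be passed instead of a string.
yaml_config = """
tbl: small_table
steps:
  - rows_distinct
  - col_vals_gt:
      columns: d
      value: 100
"""

result = pb.yaml_interrogate(yaml_config)
```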
pointblank/assistant.py
CHANGED
@@ -138,10 +138,15 @@ def assistant(

     - Polars DataFrame (`"polars"`)
     - Pandas DataFrame (`"pandas"`)
+    - PySpark table (`"pyspark"`)
     - DuckDB table (`"duckdb"`)*
     - MySQL table (`"mysql"`)*
     - PostgreSQL table (`"postgresql"`)*
     - SQLite table (`"sqlite"`)*
+    - Microsoft SQL Server table (`"mssql"`)*
+    - Snowflake table (`"snowflake"`)*
+    - Databricks table (`"databricks"`)*
+    - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
     - CSV files (string path or `pathlib.Path` object with `.csv` extension)
     - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
@@ -152,6 +157,10 @@ def assistant(
     `ibis.expr.types.relations.Table`). Furthermore, using `assistant()` with these types of tables
     requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
     Pandas DataFrame, the availability of Ibis is not needed.
+
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
     """

     # Check that the chatlas package is installed
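The loading preference stated above (Polars first, then Pandas) can be pictured with a short, hedged sketch; `_load_csv` is a made-up name, not the package's actual loader:

```python
def _load_csv(path):
    # Illustrative only: prefer Polars when installed, otherwise fall back to
    # Pandas, mirroring the preference described in the docstring.
    try:
        import polars as pl

        return pl.read_csv(path)
    except ImportError:
        import pandas as pd

        return pd.read_csv(path)


# Example: frame = _load_csv("data.csv")
```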
pointblank/cli.py
CHANGED
@@ -1360,10 +1360,10 @@ def preview(
     For tables with many columns, use these options to control which columns are displayed:

     \b
-    - --columns: Specify exact columns (
-    - --col-range: Select column range (
-    - --col-first: Show first N columns (
-    - --col-last: Show last N columns (
+    - --columns: Specify exact columns (--columns "name,age,email")
+    - --col-range: Select column range (--col-range "1:10", --col-range "5:", --col-range ":15")
+    - --col-first: Show first N columns (--col-first 5)
+    - --col-last: Show last N columns (--col-last 3)

     Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
     """
@@ -1920,31 +1920,43 @@ def validate(

     AVAILABLE CHECK_TYPES:

-
-
-    The default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows.
+    Require no additional options:

     \b
     - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
     - rows-complete: Check if all rows are complete (no missing values in any column)
-
-
-
-
-    - col-
-    - col-vals-
-
+
+    Require --column:
+
+    \b
+    - col-exists: Check if a specific column exists in the dataset
+    - col-vals-not-null: Check if all values in a column are not null/missing
+
+    Require --column and --value:
+
+    \b
+    - col-vals-gt: Check if column values are greater than a fixed value
+    - col-vals-ge: Check if column values are greater than or equal to a fixed value
+    - col-vals-lt: Check if column values are less than a fixed value
+    - col-vals-le: Check if column values are less than or equal to a fixed value
+
+    Require --column and --set:
+
+    \b
+    - col-vals-in-set: Check if column values are in an allowed set
+
+    Use --list-checks to see all available validation methods with examples. The default CHECK_TYPE
+    is 'rows-distinct' which checks for duplicate rows.

     Examples:

     \b
-    pb validate data.csv
-    pb validate data.csv --list-checks
+    pb validate data.csv                     # Uses default validation (rows-distinct)
+    pb validate data.csv --list-checks       # Show all available checks
     pb validate data.csv --check rows-distinct
     pb validate data.csv --check rows-distinct --show-extract
     pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
     pb validate data.csv --check rows-distinct --exit-code
-    pb validate data.csv --check rows-complete
     pb validate data.csv --check col-exists --column price
     pb validate data.csv --check col-vals-not-null --column email
     pb validate data.csv --check col-vals-gt --column score --value 50
@@ -1952,7 +1964,6 @@ def validate(

     Multiple validations in one command:
     pb validate data.csv --check rows-distinct --check rows-complete
-    pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
     """
     try:
         import sys
@@ -4627,36 +4638,40 @@ def pl(
     pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
     pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"

+    \b
     # Multi-line with editor (supports multiple statements)
     pb pl --edit

+    \b
     # Multi-statement code example in editor:
     # csv = pl.read_csv('data.csv')
     # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)

+    \b
     # Multi-line with a specific editor
     pb pl --edit --editor nano
     pb pl --edit --editor code
     pb pl --edit --editor micro

+    \b
     # From file
     pb pl --file query.py

-
-
+    \b
+    Piping to other pb commands
+    pb pl "pl.read_csv('data.csv').head(20)" --pipe | pb validate --check rows-distinct
     pb pl --edit --pipe | pb preview --head 10
     pb pl --edit --pipe | pb scan --output-html report.html
     pb pl --edit --pipe | pb missing --output-html missing_report.html

-    Use --output-format to change how results are displayed:
-
     \b
+    Use --output-format to change how results are displayed:
     pb pl "pl.read_csv('data.csv')" --output-format scan
     pb pl "pl.read_csv('data.csv')" --output-format missing
     pb pl "pl.read_csv('data.csv')" --output-format info

-    Note: For multi-statement code, assign your final result to a variable like
-    '
+    Note: For multi-statement code, assign your final result to a variable like 'result', 'df',
+    'data', or ensure it's the last expression.
     """
     try:
         # Check if Polars is available