pointblank 0.11.0__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/assistant.py +14 -3
- pointblank/cli.py +2418 -1511
- pointblank/compare.py +9 -0
- pointblank/datascan.py +25 -3
- pointblank/validate.py +346 -37
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/METADATA +16 -10
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/RECORD +11 -11
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/WHEEL +0 -0
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/top_level.txt +0 -0
pointblank/cli.py
CHANGED
@@ -15,6 +15,81 @@ from pointblank._utils import _get_tbl_type, _is_lib_present
 console = Console()


+class OrderedGroup(click.Group):
+    """A Click Group that displays commands in a custom order."""
+
+    def list_commands(self, ctx):
+        """Return commands in the desired logical order."""
+        # Define the desired order
+        desired_order = [
+            # Data Discovery/Exploration
+            "info",
+            "preview",
+            "scan",
+            "missing",
+            # Validation
+            "validate",
+            "run",
+            "make-template",
+            # Utilities
+            "datasets",
+            "requirements",
+        ]
+
+        # Get all available commands
+        available_commands = super().list_commands(ctx)
+
+        # Return commands in desired order, followed by any not in the list
+        ordered = []
+        for cmd in desired_order:
+            if cmd in available_commands:
+                ordered.append(cmd)
+
+        # Add any commands not in our desired order (safety fallback)
+        for cmd in available_commands:
+            if cmd not in ordered:
+                ordered.append(cmd)
+
+        return ordered
+
+
+def _load_data_source(data_source: str) -> Any:
+    """
+    Centralized data loading function for CLI that handles all supported data source types.
+
+    This function provides a consistent way to load data across all CLI commands by leveraging
+    the _process_data() utility function and adding support for pointblank dataset names.
+
+    Parameters
+    ----------
+    data_source : str
+        The data source which could be:
+        - A pointblank dataset name (small_table, game_revenue, nycflights, global_sales)
+        - A GitHub URL pointing to a CSV or Parquet file
+        - A database connection string (e.g., "duckdb:///path/to/file.ddb::table_name")
+        - A CSV file path (string or Path object with .csv extension)
+        - A Parquet file path, glob pattern, directory, or partitioned dataset
+
+    Returns
+    -------
+    Any
+        Loaded data as a DataFrame or other data object
+
+    Raises
+    ------
+    ValueError
+        If the pointblank dataset name is not recognized
+    """
+    # Check if it's a pointblank dataset name first
+    if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+        return pb.load_dataset(data_source)
+
+    # Otherwise, use the centralized _process_data() function for all other data sources
+    from pointblank.validate import _process_data
+
+    return _process_data(data_source)
+
+
 def _format_cell_value(
     value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
 ) -> str:
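The OrderedGroup added above replaces click's default alphabetical ordering of subcommands in `pb --help`. A minimal standalone sketch of the same pattern (not part of the package; the demo group and stub commands below are hypothetical, only the list_commands override mirrors the hunk):

import click


class OrderedGroup(click.Group):
    """List commands in a curated order instead of click's alphabetical default."""

    def list_commands(self, ctx):
        desired = ["info", "preview", "scan", "missing", "validate"]  # order from the hunk above
        available = super().list_commands(ctx)
        # Known commands first, in the curated order; anything else appended as a fallback.
        return [c for c in desired if c in available] + [c for c in available if c not in desired]


@click.group(cls=OrderedGroup)
def cli():
    """Demo group."""


@cli.command()
def preview():
    """Stub command."""


@cli.command()
def info():
    """Stub command."""


if __name__ == "__main__":
    cli()  # `--help` now lists info before preview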
@@ -206,173 +281,443 @@ def _format_dtype_compact(dtype_str: str) -> str:
     return dtype_str


-def
-
+def _rich_print_scan_table(
+    scan_result: Any,
+    data_source: str,
+    source_type: str,
+    table_type: str,
+    total_rows: int | None = None,
+    total_columns: int | None = None,
+) -> None:
+    """
+    Display scan results as a Rich table in the terminal with statistical measures.

     Args:
-
-
-
-
-
-
+        scan_result: The GT object from col_summary_tbl()
+        data_source: Name of the data source being scanned
+        source_type: Type of data source (e.g., "Pointblank dataset: small_table")
+        table_type: Type of table (e.g., "polars.LazyFrame")
+        total_rows: Total number of rows in the dataset
+        total_columns: Total number of columns in the dataset
     """
     try:
-
-        df = None
+        import re

-
-
-        if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
-            df = gt_table._tbl_data
-        elif (
-            hasattr(gt_table, "_body")
-            and hasattr(gt_table._body, "body")
-            and gt_table._body.body is not None
-        ):
-            df = gt_table._body.body
-        elif hasattr(gt_table, "_data") and gt_table._data is not None:
-            df = gt_table._data
-        elif hasattr(gt_table, "data") and gt_table.data is not None:
-            df = gt_table.data
+        import narwhals as nw
+        from rich.box import SIMPLE_HEAD

-
-
-
+        # Extract the underlying DataFrame from the GT object
+        # The GT object has a _tbl_data attribute that contains the DataFrame
+        gt_data = scan_result._tbl_data

-
-
-        if preview_info and "source_type" in preview_info and "table_type" in preview_info:
-            source_type = preview_info["source_type"]
-            table_type = preview_info["table_type"]
-            table_title = f"Data Preview / {source_type} / {table_type}"
+        # Convert to Narwhals DataFrame for consistent handling
+        nw_data = nw.from_native(gt_data)

-
-
-            show_header=True,
-            header_style="bold magenta",
-            box=SIMPLE_HEAD,
-            title_style="bold cyan",
-            title_justify="left",
-        )
+        # Convert to dictionary for easier access
+        data_dict = nw_data.to_dict(as_series=False)

-
-
-
-            columns = list(df.columns)
-        elif hasattr(df, "schema"):  # pragma: no cover
-            columns = list(df.schema.names)
-        elif hasattr(df, "column_names"):  # pragma: no cover
-            columns = list(df.column_names)
+        # Create main scan table with missing data table styling
+        # Create a comprehensive title with data source, source type, and table type
+        title_text = f"Column Summary / {source_type} / {table_type}"

-
-
-
-        if hasattr(df, "to_dicts") and len(df) > 0:
-            first_dict = df.to_dicts()[0]
-            columns = list(first_dict.keys())
-        elif hasattr(df, "to_dict") and len(df) > 0:
-            first_dict = df.to_dict("records")[0]
-            columns = list(first_dict.keys())
-    except Exception:  # pragma: no cover
-        columns = [f"Column {i + 1}" for i in range(10)]  # Default fallback
+        # Add dimensions subtitle in gray if available
+        if total_rows is not None and total_columns is not None:
+            title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"

-
-
-
+        scan_table = Table(
+            title=title_text,
+            show_header=True,
+            header_style="bold magenta",
+            box=SIMPLE_HEAD,
+            title_style="bold cyan",
+            title_justify="left",
+        )

-
-
-
-
-
-
-
-
-
-        max_col_width = min(30, terminal_width // 8)
-    except Exception:  # pragma: no cover
-        # Fallback if we can't get terminal width
-        max_col_width = 40 if len(columns) <= 10 else 25
+        # Add columns with specific styling and appropriate widths
+        scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
+        scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
+        scan_table.add_column(
+            "NA", style="red", width=6, justify="right"
+        )  # Adjusted for better formatting
+        scan_table.add_column(
+            "UQ", style="green", width=8, justify="right"
+        )  # Adjusted for boolean values

-
-
-
-
+        # Add statistical columns if they exist with appropriate widths
+        stat_columns = []
+        column_mapping = {
+            "mean": ("Mean", "blue", 9),
+            "std": ("SD", "blue", 9),
+            "min": ("Min", "yellow", 9),
+            "median": ("Med", "yellow", 9),
+            "max": ("Max", "yellow", 9),
+            "q_1": ("Q₁", "magenta", 8),
+            "q_3": ("Q₃", "magenta", 9),
+            "iqr": ("IQR", "magenta", 8),
+        }

-
+        for col_key, (display_name, color, width) in column_mapping.items():
+            if col_key in data_dict:
+                scan_table.add_column(display_name, style=color, width=width, justify="right")
+                stat_columns.append(col_key)

-
-
-
-
-
-
-
+        # Helper function to extract column name and type from HTML
+        def extract_column_info(html_content: str) -> tuple[str, str]:
+            """Extract column name and type from HTML formatted content."""
+            # Extract column name from first div
+            name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
+            column_name = name_match.group(1) if name_match else "Unknown"
+
+            # Extract data type from second div (with gray color)
+            type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
+            if type_match:
+                data_type = type_match.group(1)
+                # Convert to compact format using the existing function
+                compact_type = _format_dtype_compact(data_type)
+                data_type = compact_type
             else:
-
+                data_type = "unknown"

-
-        dtypes_dict = _get_column_dtypes(df, columns)
+            return column_name, data_type

-
-
-
-
-
-
-
-                    if data_dict:
-                        row_nums = [row.get("_row_num_", 0) for row in data_dict]
-                        max_row_num = max(row_nums) if row_nums else 0
-                        row_num_width = max(len(str(max_row_num)) + 1, 6)  # +1 for padding
-                elif hasattr(df, "to_dict"):
-                    data_dict = df.to_dict("records")
-                    if data_dict:
-                        row_nums = [row.get("_row_num_", 0) for row in data_dict]
-                        max_row_num = max(row_nums) if row_nums else 0
-                        row_num_width = max(len(str(max_row_num)) + 1, 6)  # +1 for padding
-            except Exception:  # pragma: no cover
-                # If we can't determine max row number, use default
-                row_num_width = 8  # Slightly larger default for safety
+        # Helper function to format values with improved number formatting
+        def format_value(
+            value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
+        ) -> str:
+            """Format values for display with smart number formatting and HTML cleanup."""
+            if value is None or (isinstance(value, str) and value.strip() == ""):
+                return "[dim]—[/dim]"

-
-
-
-        rich_table.add_column("···", style="dim", width=3, no_wrap=True)
-    else:
-        # Handle row number column specially
-        if col == "_row_num_":
-            # Row numbers get no header, right alignment, and dim gray style
-            # Use dynamic width to prevent truncation
-            rich_table.add_column(
-                "", style="dim", justify="right", no_wrap=True, width=row_num_width
-            )
-        else:
-            display_col = str(col)
+            # Handle missing values indicator
+            if is_missing and str(value) == "0":
+                return "[green]●[/green]"  # No missing values

-
-
-            dtype_display = f"<{dtypes_dict[col]}>"
-            # Create header with column name and data type
-            header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
-        else:
-            header_text = display_col
+            # Clean up HTML formatting from the raw data
+            str_val = str(value)

-
-
-
-
-
-
-
+            # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
+            if "<br>" in str_val:
+                str_val = str_val.split("<br>")[0].strip()
+                # For unique values, we want just the integer part
+                if is_unique:
+                    try:
+                        # Try to extract just the integer part for unique counts
+                        num_val = float(str_val)
+                        return str(int(num_val))
+                    except (ValueError, TypeError):
+                        pass

-            #
-
-
-
-
-
+            # Now handle HTML content (especially from boolean unique values)
+            if "<" in str_val and ">" in str_val:
+                # Remove HTML tags completely for cleaner display
+                str_val = re.sub(r"<[^>]+>", "", str_val).strip()
+                # Clean up extra whitespace
+                str_val = re.sub(r"\s+", " ", str_val).strip()
+
+            # Handle values like "2<.01" - extract the first number
+            if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
+                # Extract number before the < symbol
+                before_lt = str_val.split("<")[0].strip()
+                if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
+                    str_val = before_lt
+
+            # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
+            if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
+                # Extract T and F values
+                t_match = re.search(r"T(\d+\.\d+)", str_val)
+                f_match = re.search(r"F(\d+\.\d+)", str_val)
+                if t_match and f_match:
+                    t_val = float(t_match.group(1))
+                    f_val = float(f_match.group(1))
+                    # Show as "T0.62F0.38" but truncated if needed
+                    formatted = f"T{t_val:.2f}F{f_val:.2f}"
+                    if len(formatted) > max_width:
+                        # Truncate to fit, showing dominant value
+                        if t_val > f_val:
+                            return f"T{t_val:.1f}"
+                        else:
+                            return f"F{f_val:.1f}"
+                    return formatted
+
+            # Try to parse as a number for better formatting
+            try:
+                # Try to convert to float first
+                num_val = float(str_val)
+
+                # Handle special cases
+                if num_val == 0:
+                    return "0"
+                elif abs(num_val) == int(abs(num_val)) and abs(num_val) < 10000:
+                    # Simple integers under 10000
+                    return str(int(num_val))
+                elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
+                    # Likely dates in YYYYMMDD format - format as date-like
+                    int_val = int(num_val)
+                    if 19000101 <= int_val <= 29991231:  # Reasonable date range
+                        str_date = str(int_val)
+                        if len(str_date) == 8:
+                            return (
+                                f"{str_date[:4]}-{str_date[4:6]}-{str_date[6:]}"[: max_width - 1]
+                                + "…"
+                            )
+                    # Otherwise treat as large number
+                    return f"{num_val / 1000000:.1f}M"
+                elif abs(num_val) >= 1000000:
+                    # Large numbers - use scientific notation or M/k notation
+
+                    if abs(num_val) >= 1000000000:
+                        return f"{num_val:.1e}"
+                    else:
+                        return f"{num_val / 1000000:.1f}M"
+                elif abs(num_val) >= 10000:
+                    # Numbers >= 10k - use compact notation
+                    return f"{num_val / 1000:.1f}k"
+                elif abs(num_val) >= 100:
+                    # Numbers 100-9999 - show with minimal decimals
+                    return f"{num_val:.1f}"
+                elif abs(num_val) >= 10:
+                    # Numbers 10-99 - show with one decimal
+                    return f"{num_val:.1f}"
+                elif abs(num_val) >= 1:
+                    # Numbers 1-9 - show with two decimals
+                    return f"{num_val:.2f}"
+                elif abs(num_val) >= 0.01:
+                    # Small numbers - show with appropriate precision
+                    return f"{num_val:.2f}"
+                else:
+                    # Very small numbers - use scientific notation
+
+                    return f"{num_val:.1e}"
+
+            except (ValueError, TypeError):
+                # Not a number, handle as string
+                pass
+
+            # Handle date/datetime strings - show abbreviated format
+            if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
+                # Likely a date/datetime, show abbreviated
+                if len(str_val) > max_width:
+                    return str_val[: max_width - 1] + "…"
+
+            # General string truncation with ellipsis
+            if len(str_val) > max_width:
+                return str_val[: max_width - 1] + "…"
+
+            return str_val
+
+        # Populate table rows
+        num_rows = len(data_dict["colname"])
+        for i in range(num_rows):
+            row_data = []
+
+            # Column name and type from HTML content
+            colname_html = data_dict["colname"][i]
+            column_name, data_type = extract_column_info(colname_html)
+            row_data.append(column_name)
+            row_data.append(data_type)
+
+            # Missing values (NA)
+            missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
+            row_data.append(format_value(missing_val, is_missing=True, max_width=6))
+
+            # Unique values (UQ)
+            unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
+            row_data.append(format_value(unique_val, is_unique=True, max_width=8))
+
+            # Statistical columns
+            for stat_col in stat_columns:
+                stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
+                # Use appropriate width based on column type
+                if stat_col in ["q_1", "iqr"]:
+                    width = 8
+                elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
+                    width = 9
+                else:
+                    width = 8
+                row_data.append(format_value(stat_val, max_width=width))
+
+            scan_table.add_row(*row_data)
+
+        # Display the results
+        console.print()
+        console.print(scan_table)
+
+    except Exception as e:
+        # Fallback to simple message if table creation fails
+        console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
+        console.print(f"[red]Error displaying table: {str(e)}[/red]")
+
+
+def _rich_print_gt_table(
+    gt_table: Any, preview_info: dict | None = None, show_summary: bool = True
+) -> None:
+    """Convert a GT table to Rich table and display it in the terminal.
+
+    Args:
+        gt_table: The GT table object to display
+        preview_info: Optional dict with preview context info:
+            - total_rows: Total rows in the dataset
+            - head_rows: Number of head rows shown
+            - tail_rows: Number of tail rows shown
+            - is_complete: Whether the entire dataset is shown
+        show_summary: Whether to show the row count summary at the bottom
+    """
+    try:
+        # Try to extract the underlying data from the GT table
+        df = None
+
+        # Great Tables stores the original data in different places depending on how it was created
+        # Let's try multiple approaches to get the data
+        if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
+            df = gt_table._tbl_data
+        elif (
+            hasattr(gt_table, "_body")
+            and hasattr(gt_table._body, "body")
+            and gt_table._body.body is not None
+        ):
+            df = gt_table._body.body
+        elif hasattr(gt_table, "_data") and gt_table._data is not None:
+            df = gt_table._data
+        elif hasattr(gt_table, "data") and gt_table.data is not None:
+            df = gt_table.data
+
+        if df is not None:
+            # Create a Rich table with horizontal lines
+            from rich.box import SIMPLE_HEAD
+
+            # Create enhanced title if preview_info contains metadata
+            table_title = None
+            if preview_info and "source_type" in preview_info and "table_type" in preview_info:
+                source_type = preview_info["source_type"]
+                table_type = preview_info["table_type"]
+                table_title = f"Data Preview / {source_type} / {table_type}"
+
+            rich_table = Table(
+                title=table_title,
+                show_header=True,
+                header_style="bold magenta",
+                box=SIMPLE_HEAD,
+                title_style="bold cyan",
+                title_justify="left",
+            )
+
+            # Get column names
+            columns = []
+            if hasattr(df, "columns"):
+                columns = list(df.columns)
+            elif hasattr(df, "schema"):  # pragma: no cover
+                columns = list(df.schema.names)
+            elif hasattr(df, "column_names"):  # pragma: no cover
+                columns = list(df.column_names)
+
+            if not columns:  # pragma: no cover
+                # Fallback: try to determine columns from first row
+                try:
+                    if hasattr(df, "to_dicts") and len(df) > 0:
+                        first_dict = df.to_dicts()[0]
+                        columns = list(first_dict.keys())
+                    elif hasattr(df, "to_dict") and len(df) > 0:
+                        first_dict = df.to_dict("records")[0]
+                        columns = list(first_dict.keys())
+                except Exception:  # pragma: no cover
+                    columns = [f"Column {i + 1}" for i in range(10)]  # Default fallback
+
+            # Add columns to Rich table
+            # Handle wide tables by limiting columns displayed
+            max_terminal_cols = 15  # Reasonable limit for terminal display
+
+            # Get terminal width to adjust column behavior
+            try:
+                terminal_width = console.size.width
+                # Estimate max column width based on terminal size and number of columns
+                if len(columns) <= 5:
+                    max_col_width = min(60, terminal_width // 4)
+                elif len(columns) <= 10:
+                    max_col_width = min(40, terminal_width // 6)
+                else:
+                    max_col_width = min(30, terminal_width // 8)
+            except Exception:  # pragma: no cover
+                # Fallback if we can't get terminal width
+                max_col_width = 40 if len(columns) <= 10 else 25
+
+            if len(columns) > max_terminal_cols:
+                # For wide tables, show first few, middle indicator, and last few columns
+                first_cols = 7
+                last_cols = 7
+
+                display_columns = columns[:first_cols] + ["...more..."] + columns[-last_cols:]
+
+                console.print(
+                    f"\n[yellow]⚠ Table has {len(columns)} columns. Showing first {first_cols} and last {last_cols} columns.[/yellow]"
+                )
+                console.print("[dim]Use --columns to specify which columns to display.[/dim]")
+                console.print(
+                    f"[dim]Full column list: {', '.join(columns[:5])}...{', '.join(columns[-5:])}[/dim]\n"
+                )
+            else:
+                display_columns = columns
+
+            # Get data types for columns
+            dtypes_dict = _get_column_dtypes(df, columns)
+
+            # Calculate row number column width if needed
+            row_num_width = 6  # Default width
+            if "_row_num_" in columns:
+                try:
+                    # Get the maximum row number to calculate appropriate width
+                    if hasattr(df, "to_dicts"):
+                        data_dict = df.to_dicts()
+                        if data_dict:
+                            row_nums = [row.get("_row_num_", 0) for row in data_dict]
+                            max_row_num = max(row_nums) if row_nums else 0
+                            row_num_width = max(len(str(max_row_num)) + 1, 6)  # +1 for padding
+                    elif hasattr(df, "to_dict"):
+                        data_dict = df.to_dict("records")
+                        if data_dict:
+                            row_nums = [row.get("_row_num_", 0) for row in data_dict]
+                            max_row_num = max(row_nums) if row_nums else 0
+                            row_num_width = max(len(str(max_row_num)) + 1, 6)  # +1 for padding
+                except Exception:  # pragma: no cover
+                    # If we can't determine max row number, use default
+                    row_num_width = 8  # Slightly larger default for safety
+
+            for i, col in enumerate(display_columns):
+                if col == "...more...":
+                    # Add a special indicator column
+                    rich_table.add_column("···", style="dim", width=3, no_wrap=True)
+                else:
+                    # Handle row number column specially
+                    if col == "_row_num_":
+                        # Row numbers get no header, right alignment, and dim gray style
+                        # Use dynamic width to prevent truncation
+                        rich_table.add_column(
+                            "", style="dim", justify="right", no_wrap=True, width=row_num_width
+                        )
+                    else:
+                        display_col = str(col)
+
+                        # Get data type for this column (if available)
+                        if col in dtypes_dict:
+                            dtype_display = f"<{dtypes_dict[col]}>"
+                            # Create header with column name and data type
+                            header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
+                        else:
+                            header_text = display_col
+
+                        rich_table.add_column(
+                            header_text,
+                            style="cyan",
+                            no_wrap=False,
+                            overflow="ellipsis",
+                            max_width=max_col_width,
+                        )
+
+            # Convert data to list of rows
+            rows = []
+            try:
+                if hasattr(df, "to_dicts"):
+                    # Polars interface
+                    data_dict = df.to_dicts()
                     if len(columns) > max_terminal_cols:
                         # For wide tables, extract only the displayed columns
                         display_data_columns = (
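The format_value helper in the hunk above compacts cell values so they fit fixed-width terminal columns. A condensed, runnable restatement of just its magnitude rules (an assumption-level simplification: the HTML cleanup, T/F boolean handling, and YYYYMMDD date special case are omitted):

def compact(num: float) -> str:
    # Mirrors the numeric branches of format_value above.
    if num == 0:
        return "0"
    a = abs(num)
    if a == int(a) and a < 10_000:
        return str(int(num))  # plain integers under 10k pass through
    if a >= 1_000_000_000:
        return f"{num:.1e}"  # billions: scientific notation
    if a >= 1_000_000:
        return f"{num / 1_000_000:.1f}M"  # millions: M suffix
    if a >= 10_000:
        return f"{num / 1_000:.1f}k"  # tens of thousands: k suffix
    if a >= 10:
        return f"{num:.1f}"  # 10-9999 (non-integer): one decimal
    if a >= 0.01:
        return f"{num:.2f}"  # 0.01-10: two decimals
    return f"{num:.1e}"  # very small values: scientific notation


assert compact(0) == "0"
assert compact(9_999) == "9999"
assert compact(12_345.6) == "12.3k"
assert compact(2_500_000) == "2.5M"
assert compact(0.0004) == "4.0e-04"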
@@ -519,44 +864,45 @@ def _rich_print_gt_table(gt_table: Any, preview_info: dict | None = None) -> None:
             console.print()
             console.print(rich_table)

-            # Show summary info
-
+            # Show summary info (conditionally)
+            if show_summary:
+                total_rows = len(rows)

-
-
-
-
-
-
+                # Use preview info if available, otherwise fall back to old logic
+                if preview_info:
+                    total_dataset_rows = preview_info.get("total_rows", total_rows)
+                    head_rows = preview_info.get("head_rows", 0)
+                    tail_rows = preview_info.get("tail_rows", 0)
+                    is_complete = preview_info.get("is_complete", False)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            else:
-                # Original logic as fallback
-                max_rows = 50  # This should match the limit used above
-                if total_rows > max_rows:
-                    console.print(
-                        f"\n[dim]Showing first {max_rows} of {total_rows} rows. Use --output-html to see all data.[/dim]"
-                    )
+                    if is_complete:
+                        console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
+                    elif head_rows > 0 and tail_rows > 0:
+                        console.print(
+                            f"\n[dim]Showing first {head_rows} and last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
+                        )
+                    elif head_rows > 0:
+                        console.print(
+                            f"\n[dim]Showing first {head_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
+                        )
+                    elif tail_rows > 0:
+                        console.print(
+                            f"\n[dim]Showing last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
+                        )
+                    else:
+                        # Fallback for other cases
+                        console.print(
+                            f"\n[dim]Showing {total_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
+                        )
                 else:
-
+                    # Original logic as fallback
+                    max_rows = 50  # This should match the limit used above
+                    if total_rows > max_rows:
+                        console.print(
+                            f"\n[dim]Showing first {max_rows} of {total_rows} rows. Use --output-html to see all data.[/dim]"
+                        )
+                    else:
+                        console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")

         else:
             # If we can't extract data, show the success message
@@ -693,73 +1039,71 @@ def _display_validation_summary(validation: Any) -> None:
         console.print(f"[dim]{traceback.format_exc()}[/dim]")  # pragma: no cover


-@click.group()
+@click.group(cls=OrderedGroup)
 @click.version_option(version=pb.__version__, prog_name="pb")
 def cli():
     """
     Pointblank CLI - Data validation and quality tools for data engineers.

-    Use this CLI to
+    Use this CLI to run validation scripts, preview tables, and generate reports
     directly from the command line.
     """
     pass


 @cli.command()
-
-
-    List available built-in datasets.
+@click.argument("data_source", type=str)
+def info(data_source: str):
     """
-
-        ("small_table", "13 rows × 8 columns", "Small demo dataset for testing"),
-        ("game_revenue", "2,000 rows × 11 columns", "Game development company revenue data"),
-        ("nycflights", "336,776 rows × 18 columns", "NYC airport flights data from 2013"),
-        ("global_sales", "50,000 rows × 20 columns", "Global sales data across regions"),
-    ]
+    Display information about a data source.

-    table
-        title="Available Pointblank Datasets", show_header=True, header_style="bold magenta"
-    )
-    table.add_column("Dataset Name", style="cyan", no_wrap=True)
-    table.add_column("Dimensions", style="green")
-    table.add_column("Description", style="white")
+    Shows table type, dimensions, column names, and data types.

-
-        table.add_row(name, dims, desc)
+    DATA_SOURCE can be:

-
-
-
+    \b
+    - CSV file path (e.g., data.csv)
+    - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+    - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
+    - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
+    - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
+    """
+    try:
+        with console.status("[bold green]Loading data..."):
+            # Load the data source using the centralized function
+            data = _load_data_source(data_source)

+            # Get table information
+            tbl_type = _get_tbl_type(data)
+            row_count = pb.get_row_count(data)
+            col_count = pb.get_column_count(data)

-
-
-    """
-    Check installed dependencies and their availability.
-    """
-    dependencies = [
-        ("polars", "Polars DataFrame support"),
-        ("pandas", "Pandas DataFrame support"),
-        ("ibis", "Ibis backend support (DuckDB, etc.)"),
-        ("duckdb", "DuckDB database support"),
-        ("pyarrow", "Parquet file support"),
-    ]
+        # Import the box style
+        from rich.box import SIMPLE_HEAD

-
-
-
-
+        # Create info table
+        info_table = Table(
+            title="Data Source Information",
+            show_header=True,
+            header_style="bold magenta",
+            box=SIMPLE_HEAD,
+            title_style="bold cyan",
+            title_justify="left",
+        )
+        info_table.add_column("Property", style="cyan", no_wrap=True)
+        info_table.add_column("Value", style="green")

-
-
-
-
-        status = "[red]✗ Not installed[/red]"
+        info_table.add_row("Source", data_source)
+        info_table.add_row("Table Type", tbl_type)
+        info_table.add_row("Rows", f"{row_count:,}")
+        info_table.add_row("Columns", f"{col_count:,}")

-
+        console.print()
+        console.print(info_table)

-
-
+    except Exception as e:
+        console.print(f"[red]Error:[/red] {e}")
+        sys.exit(1)


 @cli.command()
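With info now loading through _load_data_source, the command accepts the same source strings as preview and scan. A hedged sketch of exercising it with click's test runner (assumes the cli group is importable from pointblank.cli, as this diff implies, and that the bundled small_table dataset is available):

from click.testing import CliRunner

from pointblank.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["info", "small_table"])
print(result.output)     # expected: the "Data Source Information" table with Source/Table Type/Rows/Columns
print(result.exit_code)  # 0 on success; the command calls sys.exit(1) on error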
@@ -799,6 +1143,7 @@ def preview(
     \b
     - CSV file path (e.g., data.csv)
     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+    - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)

@@ -816,14 +1161,10 @@ def preview(
     """
     try:
         with console.status("[bold green]Loading data..."):
-            #
-
-
-
-            else:
-                # Assume it's a file path or connection string
-                data = data_source
-                console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+            # Load the data source using the centralized function
+            data = _load_data_source(data_source)
+
+        console.print(f"[green]✓[/green] Loaded data source: {data_source}")

         # Parse columns if provided
         columns_list = None
@@ -832,18 +1173,8 @@ def preview(

         # If data has _row_num_ and it's not explicitly included, add it at the beginning
         try:
-
-                _process_connection_string,
-                _process_csv_input,
-                _process_parquet_input,
-            )
-
-            # Process the data source to get actual data object to check for _row_num_
+            # Data is already processed, just use it directly
             processed_data = data
-            if isinstance(data, str):
-                processed_data = _process_connection_string(data)
-                processed_data = _process_csv_input(processed_data)
-                processed_data = _process_parquet_input(processed_data)

             # Get column names from the processed data
             all_columns = []
@@ -860,19 +1191,8 @@ def preview(
             pass
         elif col_range or col_first or col_last:
             # Need to get column names to apply range/first/last selection
-            #
-            from pointblank.validate import (
-                _process_connection_string,
-                _process_csv_input,
-                _process_parquet_input,
-            )
-
-            # Process the data source to get actual data object
+            # Data is already processed, just use it directly
             processed_data = data
-            if isinstance(data, str):
-                processed_data = _process_connection_string(data)
-                processed_data = _process_csv_input(processed_data)
-                processed_data = _process_parquet_input(processed_data)

             # Get column names from the processed data
             all_columns = []
@@ -934,20 +1254,11 @@ def preview(
         with console.status("[bold green]Generating preview..."):
             # Get total dataset size before preview and gather metadata
             try:
-                #
-                from pointblank.validate import (
-                    _process_connection_string,
-                    _process_csv_input,
-                    _process_parquet_input,
-                )
-
+                # Data is already processed, just use it directly
                 processed_data = data
-                if isinstance(data, str):
-                    processed_data = _process_connection_string(data)
-                    processed_data = _process_csv_input(processed_data)
-                    processed_data = _process_parquet_input(processed_data)

                 total_dataset_rows = pb.get_row_count(processed_data)
+                total_dataset_columns = pb.get_column_count(processed_data)

             # Determine source type and table type for enhanced preview title
             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
@@ -959,6 +1270,7 @@ def preview(
             except Exception:
                 # If we can't get metadata, set defaults
                 total_dataset_rows = None
+                total_dataset_columns = None
                 source_type = f"Data source: {data_source}"
                 table_type = "unknown"

@@ -989,6 +1301,7 @@ def preview(

             preview_info = {
                 "total_rows": total_dataset_rows,
+                "total_columns": total_dataset_columns,
                 "head_rows": head,
                 "tail_rows": tail,
                 "is_complete": is_complete,
@@ -1003,71 +1316,6 @@ def preview(
         sys.exit(1)  # pragma: no cover


-@cli.command()
-@click.argument("data_source", type=str)
-def info(data_source: str):
-    """
-    Display information about a data source.
-
-    Shows table type, dimensions, column names, and data types.
-    """
-    try:
-        with console.status("[bold green]Loading data..."):
-            # Try to load as a pointblank dataset first
-            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
-                data = pb.load_dataset(data_source)
-                source_type = f"Pointblank dataset: {data_source}"
-                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
-            else:
-                # Assume it's a file path or connection string
-                data = data_source
-                source_type = f"External source: {data_source}"
-
-            # Process the data to get actual table object for inspection
-            from pointblank.validate import (
-                _process_connection_string,
-                _process_csv_input,
-                _process_parquet_input,
-            )
-
-            data = _process_connection_string(data)
-            data = _process_csv_input(data)
-            data = _process_parquet_input(data)
-            console.print(f"[green]✓[/green] Loaded data source: {data_source}")
-
-            # Get table information
-            tbl_type = _get_tbl_type(data)
-            row_count = pb.get_row_count(data)
-            col_count = pb.get_column_count(data)
-
-        # Import the box style for consistent styling with scan table
-        from rich.box import SIMPLE_HEAD
-
-        # Create info table with same styling as scan table
-        info_table = Table(
-            title="Data Source Information",
-            show_header=True,
-            header_style="bold magenta",
-            box=SIMPLE_HEAD,
-            title_style="bold cyan",
-            title_justify="left",
-        )
-        info_table.add_column("Property", style="cyan", no_wrap=True)
-        info_table.add_column("Value", style="green")
-
-        info_table.add_row("Source", source_type)
-        info_table.add_row("Table Type", tbl_type)
-        info_table.add_row("Rows", f"{row_count:,}")
-        info_table.add_row("Columns", f"{col_count:,}")
-
-        console.print()
-        console.print(info_table)
-
-    except Exception as e:
-        console.print(f"[red]Error:[/red] {e}")
-        sys.exit(1)
-
-
 @cli.command()
 @click.argument("data_source", type=str)
 @click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
@@ -1093,6 +1341,7 @@ def scan(
     \b
     - CSV file path (e.g., data.csv)
     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+    - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
     """
@@ -1102,14 +1351,10 @@ def scan(
         start_time = time.time()

         with console.status("[bold green]Loading data..."):
-            #
-
-
-
-            else:
-                # Assume it's a file path or connection string
-                data = data_source
-                console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+            # Load the data source using the centralized function
+            data = _load_data_source(data_source)
+
+        console.print(f"[green]✓[/green] Loaded data source: {data_source}")

         # Parse columns if provided
         columns_list = None
@@ -1119,35 +1364,22 @@ def scan(
         # Generate data scan
         with console.status("[bold green]Generating data scan..."):
             # Use col_summary_tbl for comprehensive column scanning
+            # Data is already processed by _load_data_source
+            scan_result = pb.col_summary_tbl(data=data)
+
             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
-                # For pointblank datasets, data is already the loaded dataframe
-                scan_result = pb.col_summary_tbl(data=data)
                 source_type = f"Pointblank dataset: {data_source}"
-                table_type = _get_tbl_type(data)
-                # Get row count for footer
-                try:
-                    total_rows = pb.get_row_count(data)
-                except Exception:
-                    total_rows = None
             else:
-                # For file paths and connection strings, load the data first
-                from pointblank.validate import (
-                    _process_connection_string,
-                    _process_csv_input,
-                    _process_parquet_input,
-                )
-
-                processed_data = _process_connection_string(data)
-                processed_data = _process_csv_input(processed_data)
-                processed_data = _process_parquet_input(processed_data)
-                scan_result = pb.col_summary_tbl(data=processed_data)
                 source_type = f"External source: {data_source}"
-
-
-
-
-
-
+
+            table_type = _get_tbl_type(data)
+            # Get row count and column count for header
+            try:
+                total_rows = pb.get_row_count(data)
+                total_columns = pb.get_column_count(data)
+            except Exception:
+                total_rows = None
+                total_columns = None

         scan_time = time.time() - start_time

@@ -1167,7 +1399,7 @@ def scan(
         # Display detailed column summary using rich formatting
         try:
             _rich_print_scan_table(
-                scan_result, data_source, source_type, table_type, total_rows
+                scan_result, data_source, source_type, table_type, total_rows, total_columns
             )

         except Exception as e:
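Stripped of the console status spinners and error handling, the scan command's reworked flow in the hunks above reduces to roughly this sequence (a sketch; the calls are the ones visible in the diff):

import pointblank as pb

data = pb.load_dataset("small_table")        # _load_data_source short-circuits known dataset names
scan_result = pb.col_summary_tbl(data=data)  # built once, up front, for every source type
total_rows = pb.get_row_count(data)          # row/column counts now feed the table header
total_columns = pb.get_column_count(data)
print(total_rows, total_columns)             # 13 8 for small_table, per the dataset listing removed earlier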
@@ -1190,40 +1422,23 @@ def missing(data_source: str, output_html: str | None):
     \b
     - CSV file path (e.g., data.csv)
     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+    - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
     """
     try:
         with console.status("[bold green]Loading data..."):
-            #
-
-
-
-            else:
-                # Assume it's a file path or connection string
-                data = data_source
-                console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+            # Load the data source using the centralized function
+            data = _load_data_source(data_source)
+
+        console.print(f"[green]✓[/green] Loaded data source: {data_source}")

         # Generate missing values table
         with console.status("[bold green]Analyzing missing values..."):
             gt_table = pb.missing_vals_tbl(data)

-            #
+            # Data is already processed, just use it directly
             original_data = data
-            if isinstance(data, str):
-                # Process the data to get the actual data object
-                from pointblank.validate import (
-                    _process_connection_string,
-                    _process_csv_input,
-                    _process_parquet_input,
-                )
-
-                try:
-                    original_data = _process_connection_string(data)
-                    original_data = _process_csv_input(original_data)
-                    original_data = _process_parquet_input(original_data)
-                except Exception:  # pragma: no cover
-                    pass  # Use the string data as fallback

         if output_html:
             # Save HTML to file
@@ -1239,556 +1454,505 @@ def missing(data_source: str, output_html: str | None):
             sys.exit(1)


-@cli.command()
-@click.argument("output_file", type=click.Path())
-def validate_example(output_file: str):
-    """
-    Generate an example validation script.
-
-    Creates a sample Python script showing how to use Pointblank for validation.
-    """
-    example_script = '''"""
-Example Pointblank validation script.
-
-This script demonstrates how to create validation rules for your data.
-Modify the validation rules below to match your data requirements.
-"""
-
-import pointblank as pb
-
-# Create a validation object
-# The 'data' variable is automatically provided by the CLI
-validation = (
-    pb.Validate(
-        data=data,
-        tbl_name="Example Data",
-        label="CLI Validation Example",
-        thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
-    )
-    # Add your validation rules here
-    # Example rules (modify these based on your data structure):
-
-    # Check that specific columns exist
-    # .col_exists(["column1", "column2"])
-
-    # Check for null values
-    # .col_vals_not_null(columns="important_column")
-
-    # Check value ranges
-    # .col_vals_gt(columns="amount", value=0)
-    # .col_vals_between(columns="score", left=0, right=100)
-
-    # Check string patterns
-    # .col_vals_regex(columns="email", pattern=r"^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$")
-
-    # Check unique values
-    # .col_vals_unique(columns="id")
-
-    # Finalize the validation
-    .interrogate()
-)
-
-# The validation object will be automatically used by the CLI
-'''
-
-    Path(output_file).write_text(example_script)
-    console.print(f"[green]✓[/green] Example validation script created: {output_file}")
-    console.print("\nEdit the script to add your validation rules, then run:")
-    console.print(f"[cyan]pb validate your_data.csv {output_file}[/cyan]")
-
-
-@cli.command()
+@cli.command(name="validate")
 @click.argument("data_source", type=str)
-@click.
-
-
-
+@click.option(
+    "--check",
+    "checks",  # Changed to collect multiple values
+    type=click.Choice(
+        [
+            "rows-distinct",
+            "col-vals-not-null",
+            "rows-complete",
+            "col-exists",
+            "col-vals-in-set",
+            "col-vals-gt",
+            "col-vals-ge",
+            "col-vals-lt",
+            "col-vals-le",
+        ]
+    ),
+    multiple=True,  # Allow multiple --check options
+    help="Type of validation check to perform. Can be used multiple times for multiple checks.",
+)
+@click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
+@click.option(
+    "--column",
+    "columns",  # Changed to collect multiple values
+    multiple=True,  # Allow multiple --column options
+    help="Column name or integer position as #N (1-based index) for validation.",
+)
+@click.option(
+    "--set",
+    "sets",  # Changed to collect multiple values
+    multiple=True,  # Allow multiple --set options
+    help="Comma-separated allowed values for col-vals-in-set checks.",
+)
+@click.option(
+    "--value",
+    "values",  # Changed to collect multiple values
+    type=float,
+    multiple=True,  # Allow multiple --value options
+    help="Numeric value for comparison checks.",
+)
+@click.option(
+    "--show-extract", is_flag=True, help="Show extract of failing rows if validation fails"
+)
+@click.option(
+    "--write-extract", type=str, help="Save failing rows to folder. Provide base name for folder."
+)
+@click.option(
+    "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
+)
+@click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
+@click.pass_context
 def validate(
+    ctx: click.Context,
     data_source: str,
-
-
-
-
+    checks: tuple[str, ...],  # Changed to tuple
+    columns: tuple[str, ...],  # Changed to tuple
+    sets: tuple[str, ...],  # Changed to tuple
+    values: tuple[float, ...],  # Changed to tuple
+    show_extract: bool,
+    write_extract: str | None,
+    limit: int,
+    exit_code: bool,
+    list_checks: bool,
 ):
     """
-
+    Perform single or multiple data validations.
+
+    Run one or more validation checks on your data in a single command.
+    Use multiple --check options to perform multiple validations.

     DATA_SOURCE can be:

     \b
     - CSV file path (e.g., data.csv)
     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+    - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)

-
-    See 'pb validate-example' for a sample script.
-    """
-    try:
-        with console.status("[bold green]Loading data..."):
-            # Try to load as a pointblank dataset first
-            if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
-                data = pb.load_dataset(data_source)
-                console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
-            else:
-                # Assume it's a file path or connection string
-                data = data_source
-                console.print(f"[green]✓[/green] Loaded data source: {data_source}")
-
-        # Execute the validation script
-        with console.status("[bold green]Running validation..."):
-            # Read and execute the validation script
-            script_content = Path(validation_script).read_text()
-
-            # Create a namespace with pointblank and the data
-            namespace = {
-                "pb": pb,
-                "pointblank": pb,
-                "data": data,
-                "__name__": "__main__",
-            }
-
-            # Execute the script
-            try:
-                exec(script_content, namespace)
-            except Exception as e:
-                console.print(f"[red]Error executing validation script:[/red] {e}")
-                sys.exit(1)
-
-            # Look for a validation object in the namespace
-            validation = None
-
-            # Try to find the 'validation' variable specifically first
-            if "validation" in namespace:
-                validation = namespace["validation"]
-            else:
-                # Look for any validation object in the namespace
-                for key, value in namespace.items():
-                    if hasattr(value, "interrogate") and hasattr(value, "validation_info"):
-                        validation = value
-                        break
-                    # Also check if it's a Validate object that has been interrogated
-                    elif str(type(value)).find("Validate") != -1:
-                        validation = value
-                        break
-
-            if validation is None:
-                raise ValueError(
-                    "No validation object found in script. "
-                    "Script should create a Validate object and assign it to a variable named 'validation'."
-                )
-
-        console.print("[green]✓[/green] Validation completed")
-
-        # Display summary
-        _display_validation_summary(validation)
-
-        # Save outputs
-        if output_html:
-            try:
-                # Get HTML representation
-                html_content = validation._repr_html_()
-                Path(output_html).write_text(html_content, encoding="utf-8")
-                console.print(f"[green]✓[/green] HTML report saved to: {output_html}")
-            except Exception as e:
-                console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
-
-        if output_json:
-            try:
-                # Get JSON report
-                json_report = validation.get_json_report()
-                Path(output_json).write_text(json_report, encoding="utf-8")
-                console.print(f"[green]✓[/green] JSON summary saved to: {output_json}")
-            except Exception as e:
-                console.print(f"[yellow]Warning: Could not save JSON report: {e}[/yellow]")
-
-        # Check if we should fail on error
-        if fail_on_error:
-            try:
-                if (
-                    hasattr(validation, "validation_info")
-                    and validation.validation_info is not None
-                ):
-                    info = validation.validation_info
-                    n_critical = sum(1 for step in info if step.critical)
-                    n_error = sum(1 for step in info if step.error)
-
-                    if n_critical > 0 or n_error > 0:
-                        severity = "critical" if n_critical > 0 else "error"
-                        console.print(
-                            f"[red]Exiting with error due to {severity} validation failures[/red]"
-                        )
-                        sys.exit(1)
-            except Exception as e:
-                console.print(
-                    f"[yellow]Warning: Could not check validation status for fail-on-error: {e}[/yellow]"
-                )
-
-    except Exception as e:
-        console.print(f"[red]Error:[/red] {e}")
-        sys.exit(1)
-
-
-@cli.command()
-@click.argument("data_source", type=str)
-@click.argument("validation_script", type=click.Path(exists=True))
-@click.argument("step_number", type=int)
-@click.option(
-    "--limit", "-l", default=100, help="Maximum number of failing rows to show (default: 100)"
-)
-@click.option("--output-csv", type=click.Path(), help="Save failing rows to CSV file")
-@click.option("--output-html", type=click.Path(), help="Save failing rows table to HTML file")
-def extract(
-    data_source: str,
-    validation_script: str,
-    step_number: int,
-    limit: int,
-    output_csv: str | None,
-    output_html: str | None,
-):
-    """
-    Extract failing rows from a specific validation step.
-
-    This command runs a validation and extracts the rows that failed
-    a specific validation step, which is useful for debugging data quality issues.
-
-    DATA_SOURCE: Same as validate command
-    VALIDATION_SCRIPT: Path to validation script
-    STEP_NUMBER: The step number to extract failing rows from (1-based)
-    """
-    try:
-        with console.status("[bold green]Loading data..."):
-            # Try to load as a pointblank dataset first
|
|
1462
|
-
if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
1463
|
-
data = pb.load_dataset(data_source)
|
|
1464
|
-
console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
|
|
1465
|
-
else:
|
|
1466
|
-
# Assume it's a file path or connection string
|
|
1467
|
-
data = data_source
|
|
1468
|
-
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1469
|
-
|
|
1470
|
-
# Execute the validation script
|
|
1471
|
-
with console.status("[bold green]Running validation..."):
|
|
1472
|
-
# Read and execute the validation script
|
|
1473
|
-
script_content = Path(validation_script).read_text()
|
|
1474
|
-
|
|
1475
|
-
# Create a namespace with pointblank and the data
|
|
1476
|
-
namespace = {
|
|
1477
|
-
"pb": pb,
|
|
1478
|
-
"pointblank": pb,
|
|
1479
|
-
"data": data,
|
|
1480
|
-
"__name__": "__main__",
|
|
1481
|
-
}
|
|
1482
|
-
|
|
1483
|
-
# Execute the script
|
|
1484
|
-
try:
|
|
1485
|
-
exec(script_content, namespace)
|
|
1486
|
-
except Exception as e:
|
|
1487
|
-
console.print(f"[red]Error executing validation script:[/red] {e}")
|
|
1488
|
-
sys.exit(1)
|
|
1489
|
-
|
|
1490
|
-
# Look for a validation object in the namespace
|
|
1491
|
-
validation = None
|
|
1492
|
-
if "validation" in namespace:
|
|
1493
|
-
validation = namespace["validation"]
|
|
1494
|
-
else:
|
|
1495
|
-
# Look for any validation object in the namespace
|
|
1496
|
-
for key, value in namespace.items():
|
|
1497
|
-
if hasattr(value, "interrogate") and hasattr(value, "validation_info"):
|
|
1498
|
-
validation = value
|
|
1499
|
-
break
|
|
1500
|
-
elif str(type(value)).find("Validate") != -1:
|
|
1501
|
-
validation = value
|
|
1502
|
-
break
|
|
1503
|
-
|
|
1504
|
-
if validation is None:
|
|
1505
|
-
raise ValueError(
|
|
1506
|
-
"No validation object found in script. "
|
|
1507
|
-
"Script should create a Validate object and assign it to a variable named 'validation'."
|
|
1508
|
-
)
|
|
1509
|
-
|
|
1510
|
-
console.print("[green]✓[/green] Validation completed")
|
|
1511
|
-
|
|
1512
|
-
# Extract failing rows from the specified step
|
|
1513
|
-
with console.status(f"[bold green]Extracting failing rows from step {step_number}..."):
|
|
1514
|
-
try:
|
|
1515
|
-
# Get the data extracts for the specific step
|
|
1516
|
-
step_extract = validation.get_data_extracts(i=step_number, frame=True)
|
|
1537
|
+
AVAILABLE CHECKS:
|
|
1517
1538
|
|
|
1518
|
-
|
|
1519
|
-
console.print(f"[yellow]No failing rows found for step {step_number}[/yellow]")
|
|
1520
|
-
return
|
|
1539
|
+
Use --list-checks to see all available validation methods with examples.
|
|
1521
1540
|
|
|
1522
|
-
|
|
1523
|
-
if len(step_extract) > limit:
|
|
1524
|
-
step_extract = step_extract.head(limit)
|
|
1525
|
-
console.print(f"[yellow]Limited to first {limit} failing rows[/yellow]")
|
|
1541
|
+
The default check is 'rows-distinct' which checks for duplicate rows.
|
|
1526
1542
|
|
|
1527
|
-
|
|
1543
|
+
\b
|
|
1544
|
+
- rows-distinct: Check if all rows in the dataset are unique (no duplicates)
|
|
1545
|
+
- rows-complete: Check if all rows are complete (no missing values in any column)
|
|
1546
|
+
- col-exists: Check if a specific column exists in the dataset (requires --column)
|
|
1547
|
+
- col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
|
|
1548
|
+
- col-vals-gt: Check if all values in a column are greater than a threshold (requires --column and --value)
|
|
1549
|
+
- col-vals-ge: Check if all values in a column are greater than or equal to a threshold (requires --column and --value)
|
|
1550
|
+
- col-vals-lt: Check if all values in a column are less than a threshold (requires --column and --value)
|
|
1551
|
+
- col-vals-le: Check if all values in a column are less than or equal to a threshold (requires --column and --value)
|
|
1552
|
+
- col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
|
|
1528
1553
|
|
|
1529
|
-
|
|
1530
|
-
if output_csv:
|
|
1531
|
-
if hasattr(step_extract, "write_csv"):
|
|
1532
|
-
step_extract.write_csv(output_csv)
|
|
1533
|
-
else:
|
|
1534
|
-
step_extract.to_csv(output_csv, index=False)
|
|
1535
|
-
console.print(f"[green]✓[/green] Failing rows saved to CSV: {output_csv}")
|
|
1554
|
+
Examples:
|
|
1536
1555
|
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1556
|
+
\b
|
|
1557
|
+
pb validate data.csv # Uses default validation (rows-distinct)
|
|
1558
|
+
pb validate data.csv --list-checks # Show all available checks
|
|
1559
|
+
pb validate data.csv --check rows-distinct
|
|
1560
|
+
pb validate data.csv --check rows-distinct --show-extract
|
|
1561
|
+
pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
|
|
1562
|
+
pb validate data.csv --check rows-distinct --exit-code
|
|
1563
|
+
pb validate data.csv --check rows-complete
|
|
1564
|
+
pb validate data.csv --check col-exists --column price
|
|
1565
|
+
pb validate data.csv --check col-vals-not-null --column email
|
|
1566
|
+
pb validate data.csv --check col-vals-gt --column score --value 50
|
|
1567
|
+
pb validate data.csv --check col-vals-in-set --column status --set "active,inactive,pending"
|
|
1568
|
+
|
|
1569
|
+
Multiple validations in one command:
|
|
1570
|
+
pb validate data.csv --check rows-distinct --check rows-complete
|
|
1571
|
+
pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
|
|
1572
|
+
"""
|
|
1573
|
+
try:
|
|
1574
|
+
# Handle backward compatibility and parameter conversion
|
|
1575
|
+
import sys
|
|
1576
|
+
|
|
1577
|
+
# Convert parameter tuples to lists, handling default case
|
|
1578
|
+
if not checks:
|
|
1579
|
+
# No --check options provided, use default
|
|
1580
|
+
checks_list = ["rows-distinct"]
|
|
1581
|
+
is_using_default_check = True
|
|
1582
|
+
else:
|
|
1583
|
+
checks_list = list(checks)
|
|
1584
|
+
is_using_default_check = False
|
|
1547
1585
|
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
title=f"Failing Rows - Step {step_number}",
|
|
1552
|
-
show_header=True,
|
|
1553
|
-
header_style="bold red",
|
|
1554
|
-
)
|
|
1555
|
-
info_table.add_column("Property", style="cyan")
|
|
1556
|
-
info_table.add_column("Value", style="white")
|
|
1586
|
+
columns_list = list(columns) if columns else []
|
|
1587
|
+
sets_list = list(sets) if sets else []
|
|
1588
|
+
values_list = list(values) if values else []
|
|
1557
1589
|
|
|
1558
|
-
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
)
|
|
1590
|
+
# Map parameters to checks intelligently
|
|
1591
|
+
mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
|
|
1592
|
+
checks_list, columns_list, sets_list, values_list
|
|
1593
|
+
)
|
|
1563
1594
|
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1595
|
+
# Handle --list-checks option
|
|
1596
|
+
if list_checks:
|
|
1597
|
+
console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
|
|
1598
|
+
console.print()
|
|
1599
|
+
console.print("[bold magenta]Basic checks:[/bold magenta]")
|
|
1600
|
+
console.print(
|
|
1601
|
+
" • [bold cyan]rows-distinct[/bold cyan] Check for duplicate rows [yellow](default)[/yellow]"
|
|
1602
|
+
)
|
|
1603
|
+
console.print(
|
|
1604
|
+
" • [bold cyan]rows-complete[/bold cyan] Check for missing values in any column"
|
|
1605
|
+
)
|
|
1606
|
+
console.print()
|
|
1607
|
+
console.print(
|
|
1608
|
+
"[bold magenta]Column-specific checks [bright_black](require --column)[/bright_black]:[/bold magenta]"
|
|
1609
|
+
)
|
|
1610
|
+
console.print(" • [bold cyan]col-exists[/bold cyan] Check if a column exists")
|
|
1611
|
+
console.print(
|
|
1612
|
+
" • [bold cyan]col-vals-not-null[/bold cyan] Check for null values in a column"
|
|
1613
|
+
)
|
|
1614
|
+
console.print()
|
|
1615
|
+
console.print(
|
|
1616
|
+
"[bold magenta]Value comparison checks [bright_black](require --column and --value)[/bright_black]:[/bold magenta]"
|
|
1617
|
+
)
|
|
1618
|
+
console.print(
|
|
1619
|
+
" • [bold cyan]col-vals-gt[/bold cyan] Values greater than threshold"
|
|
1620
|
+
)
|
|
1621
|
+
console.print(
|
|
1622
|
+
" • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to threshold"
|
|
1623
|
+
)
|
|
1624
|
+
console.print(" • [bold cyan]col-vals-lt[/bold cyan] Values less than threshold")
|
|
1625
|
+
console.print(
|
|
1626
|
+
" • [bold cyan]col-vals-le[/bold cyan] Values less than or equal to threshold"
|
|
1627
|
+
)
|
|
1628
|
+
console.print()
|
|
1629
|
+
console.print(
|
|
1630
|
+
"[bold magenta]Set validation check [bright_black](requires --column and --set)[/bright_black]:[/bold magenta]"
|
|
1631
|
+
)
|
|
1632
|
+
console.print(
|
|
1633
|
+
" • [bold cyan]col-vals-in-set[/bold cyan] Values must be in allowed set"
|
|
1634
|
+
)
|
|
1635
|
+
console.print()
|
|
1636
|
+
console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
|
|
1637
|
+
console.print(
|
|
1638
|
+
f" [bright_blue]pb validate {data_source} --check rows-distinct[/bright_blue]"
|
|
1639
|
+
)
|
|
1640
|
+
console.print(
|
|
1641
|
+
f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
|
|
1642
|
+
)
|
|
1643
|
+
console.print(
|
|
1644
|
+
f" [bright_blue]pb validate {data_source} --check col-vals-gt --column age --value 18[/bright_blue]"
|
|
1645
|
+
)
|
|
1646
|
+
import sys
|
|
1568
1647
|
|
|
1569
|
-
|
|
1570
|
-
console.print(f"[red]Error extracting failing rows:[/red] {e}")
|
|
1571
|
-
# Try to provide helpful information
|
|
1572
|
-
if hasattr(validation, "validation_info") and validation.validation_info:
|
|
1573
|
-
max_step = len(validation.validation_info)
|
|
1574
|
-
console.print(f"[yellow]Available steps: 1 to {max_step}[/yellow]")
|
|
1575
|
-
|
|
1576
|
-
# Show step information
|
|
1577
|
-
steps_table = Table(title="Available Validation Steps", show_header=True)
|
|
1578
|
-
steps_table.add_column("Step", style="cyan")
|
|
1579
|
-
steps_table.add_column("Type", style="white")
|
|
1580
|
-
steps_table.add_column("Column", style="green")
|
|
1581
|
-
steps_table.add_column("Has Failures", style="yellow")
|
|
1582
|
-
|
|
1583
|
-
for i, step in enumerate(validation.validation_info, 1):
|
|
1584
|
-
has_failures = "Yes" if not step.all_passed else "No"
|
|
1585
|
-
steps_table.add_row(
|
|
1586
|
-
str(i),
|
|
1587
|
-
step.assertion_type,
|
|
1588
|
-
str(step.column) if step.column else "—",
|
|
1589
|
-
has_failures,
|
|
1590
|
-
)
|
|
1648
|
+
sys.exit(0)
|
|
1591
1649
|
|
|
1592
|
-
|
|
1650
|
+
# Validate required parameters for different check types
|
|
1651
|
+
# Check parameters for each check in the list using mapped parameters
|
|
1652
|
+
for i, check in enumerate(checks_list):
|
|
1653
|
+
# Get corresponding mapped parameters for this check
|
|
1654
|
+
column = mapped_columns[i] if i < len(mapped_columns) else None
|
|
1655
|
+
set_val = mapped_sets[i] if i < len(mapped_sets) else None
|
|
1656
|
+
value = mapped_values[i] if i < len(mapped_values) else None
|
|
1657
|
+
|
|
1658
|
+
if check == "col-vals-not-null" and not column:
|
|
1659
|
+
console.print(f"[red]Error:[/red] --column is required for {check} check")
|
|
1660
|
+
console.print(
|
|
1661
|
+
"Example: pb validate data.csv --check col-vals-not-null --column email"
|
|
1662
|
+
)
|
|
1593
1663
|
sys.exit(1)
|
|
1594
1664
|
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1665
|
+
if check == "col-exists" and not column:
|
|
1666
|
+
console.print(f"[red]Error:[/red] --column is required for {check} check")
|
|
1667
|
+
console.print("Example: pb validate data.csv --check col-exists --column price")
|
|
1668
|
+
sys.exit(1)
|
|
1598
1669
|
|
|
1670
|
+
if check == "col-vals-in-set" and not column:
|
|
1671
|
+
console.print(f"[red]Error:[/red] --column is required for {check} check")
|
|
1672
|
+
console.print(
|
|
1673
|
+
"Example: pb validate data.csv --check col-vals-in-set --column status --set 'active,inactive'"
|
|
1674
|
+
)
|
|
1675
|
+
sys.exit(1)
|
|
1599
1676
|
|
|
1600
|
-
|
|
1601
|
-
|
|
1677
|
+
if check == "col-vals-in-set" and not set_val:
|
|
1678
|
+
console.print(f"[red]Error:[/red] --set is required for {check} check")
|
|
1679
|
+
console.print(
|
|
1680
|
+
"Example: pb validate data.csv --check col-vals-in-set --column status --set 'active,inactive'"
|
|
1681
|
+
)
|
|
1682
|
+
sys.exit(1)
|
|
1602
1683
|
|
|
1603
|
-
|
|
1604
|
-
|
|
1684
|
+
if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"] and not column:
|
|
1685
|
+
console.print(f"[red]Error:[/red] --column is required for {check} check")
|
|
1686
|
+
console.print(
|
|
1687
|
+
f"Example: pb validate data.csv --check {check} --column score --value 50"
|
|
1688
|
+
)
|
|
1689
|
+
sys.exit(1)
|
|
1605
1690
|
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
elif value > 99.0 and value < 100.0:
|
|
1616
|
-
return ">99%" # More than 99%
|
|
1617
|
-
else:
|
|
1618
|
-
return f"{int(round(value))}%" # Round to nearest integer with % sign
|
|
1691
|
+
if (
|
|
1692
|
+
check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]
|
|
1693
|
+
and value is None
|
|
1694
|
+
):
|
|
1695
|
+
console.print(f"[red]Error:[/red] --value is required for {check} check")
|
|
1696
|
+
console.print(
|
|
1697
|
+
f"Example: pb validate data.csv --check {check} --column score --value 50"
|
|
1698
|
+
)
|
|
1699
|
+
sys.exit(1)
|
|
1619
1700
|
|
|
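The removed fragment just above (the `>99%` branch) is the tail of `_format_missing_percentage`, whose full body does not appear in this hunk. A hedged reconstruction, inferring the remaining branches from the missing-values table's footer legend later in this diff (green dot = no missing values, red dot = completely missing, `<1%` and `>99%` for the near-extremes), might look like:

    def _format_missing_percentage(value: float) -> str:
        # Sketch only: the last two branches are visible in the removed lines
        # above; the dot and "<1%" branches are inferred from the footer
        # legend, not confirmed by this diff.
        if value == 0.0:
            return "[green]●[/green]"  # no missing values
        elif value == 100.0:
            return "[red]●[/red]"  # completely missing
        elif 0.0 < value < 1.0:
            return "<1%"  # less than 1% missing
        elif 99.0 < value < 100.0:
            return ">99%"  # more than 99% missing
        else:
            return f"{int(round(value))}%"  # round to nearest integer with % sign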
+        with console.status("[bold green]Loading data..."):
+            # Load the data source using the centralized function
+            data = _load_data_source(data_source)
+
+            # Get all column names for error reporting
+            if hasattr(data, "columns"):
+                all_columns = list(data.columns)
+            elif hasattr(data, "schema"):
+                all_columns = list(data.schema.names)
+            else:
+                all_columns = []

-def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
-    """Convert a missing values GT table to Rich table with special formatting.
+            # Resolve any '#N' column references to actual column names
+            columns_list = _resolve_column_indices(columns_list, data)

-    Args:
-        gt_table: The GT table object for missing values
-        original_data: The original data source to extract column types
-    """
-    try:
-        # Extract the underlying data from the GT table
-        df = None
+            # Check for out-of-range #N columns and provide a helpful error
+            for col in columns_list:
+                if isinstance(col, str) and col.startswith("#"):
+                    try:
+                        idx = int(col[1:])
+                        if idx < 1 or idx > len(all_columns):
+                            console.print(
+                                f"[red]Error:[/red] There is no column {idx} (the column position "
+                                f"range is 1 to {len(all_columns)})"
+                            )
+                            sys.exit(1)
+                    except Exception:
+                        pass  # Let later validation handle other errors

-        if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
-            df = gt_table._tbl_data
-        elif hasattr(gt_table, "_data") and gt_table._data is not None:
-            df = gt_table._data
-        elif hasattr(gt_table, "data") and gt_table.data is not None:
-            df = gt_table.data
+            # Update mapped_columns to use resolved column names
+            mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
+                checks_list, columns_list, sets_list, values_list
+            )

-        if df is not None:
-            # Create a Rich table with horizontal lines
-            from rich.box import SIMPLE_HEAD
+        console.print(f"[green]✓[/green] Loaded data source: {data_source}")

-            rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
+        # Build a single validation object with chained checks
+        with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
+            # Initialize validation object
+            validation = pb.Validate(
+                data=data,
+                tbl_name=f"Data from {data_source}",
+                label=f"CLI Validation: {', '.join(checks_list)}",
+            )

-            # Get column names
-            columns = []
-            try:
-                if hasattr(df, "columns"):
-                    columns = list(df.columns)
-                elif hasattr(df, "schema"):
-                    columns = list(df.schema.names)
-            except Exception as e:
-                console.print(f"[red]Error getting columns:[/red] {e}")
-                columns = []
+            # Add each check to the validation chain
+            for i, check in enumerate(checks_list):
+                # Get corresponding mapped parameters for this check
+                column = mapped_columns[i] if i < len(mapped_columns) else None
+                set_val = mapped_sets[i] if i < len(mapped_sets) else None
+                value = mapped_values[i] if i < len(mapped_values) else None

-            if not columns:
-                columns = [f"Column {i + 1}" for i in range(10)]  # Fallback
+                if check == "rows-distinct":
+                    validation = validation.rows_distinct()
+                elif check == "col-vals-not-null":
+                    validation = validation.col_vals_not_null(columns=column)
+                elif check == "rows-complete":
+                    validation = validation.rows_complete()
+                elif check == "col-exists":
+                    validation = validation.col_exists(columns=column)
+                elif check == "col-vals-in-set":
+                    # Parse the comma-separated set values
+                    allowed_values = [v.strip() for v in set_val.split(",")]
+                    validation = validation.col_vals_in_set(columns=column, set=allowed_values)
+                elif check == "col-vals-gt":
+                    validation = validation.col_vals_gt(columns=column, value=value)
+                elif check == "col-vals-ge":
+                    validation = validation.col_vals_ge(columns=column, value=value)
+                elif check == "col-vals-lt":
+                    validation = validation.col_vals_lt(columns=column, value=value)
+                elif check == "col-vals-le":
+                    validation = validation.col_vals_le(columns=column, value=value)
+                else:
+                    console.print(f"[red]Error:[/red] Unknown check type: {check}")
+                    sys.exit(1)

-            # Get original data to extract column types
-            column_types = {}
-            if original_data is not None:
-                try:
-                    # Get column types from original data
-                    if hasattr(original_data, "columns"):
-                        original_columns = list(original_data.columns)
-                        column_types = _get_column_dtypes(original_data, original_columns)
-                except Exception as e:
-                    console.print(f"[red]Error getting column types:[/red] {e}")
-                    pass  # Use empty dict as fallback
+            # Execute all validations
+            validation = validation.interrogate()
+            all_passed = validation.all_passed()
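Chaining plus a single `interrogate()` is the whole execution model here: each `--check` adds one step to the same `Validate` object, and `all_passed()` then summarizes every step at once. A sketch of the object the loop builds for `--check rows-distinct --check col-vals-not-null --column email` (assuming a loaded table with an `email` column):

    validation = (
        pb.Validate(data=data, tbl_name="Data from data.csv", label="CLI Validation: rows-distinct, col-vals-not-null")
        .rows_distinct()
        .col_vals_not_null(columns="email")
        .interrogate()
    )
    all_passed = validation.all_passed()  # False if any step had failing rows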

-            # Add columns to Rich table with special formatting for missing values table
-            sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
+        # Display completion message
+        if len(checks_list) == 1:
+            if is_using_default_check:
+                console.print(
+                    f"[green]✓[/green] {checks_list[0]} validation completed [dim](default validation)[/dim]"
+                )
+            else:
+                console.print(f"[green]✓[/green] {checks_list[0]} validation completed")
+        else:
+            console.print(f"[green]✓[/green] {len(checks_list)} validations completed")
+
+        # Display results based on whether we have single or multiple checks
+        if len(checks_list) == 1:
+            # Single check - use current display format
+            _display_validation_result(
+                validation,
+                checks_list,
+                mapped_columns,
+                mapped_sets,
+                mapped_values,
+                data_source,
+                0,
+                1,
+                show_extract,
+                write_extract,
+                limit,
+            )
+        else:
+            # Multiple checks - use stacked display format
+            any_failed = False
+            for i in range(len(checks_list)):
+                console.print()  # Add spacing between results
+                _display_validation_result(
+                    validation,
+                    checks_list,
+                    mapped_columns,
+                    mapped_sets,
+                    mapped_values,
+                    data_source,
+                    i,
+                    len(checks_list),
+                    show_extract,
+                    write_extract,
+                    limit,
+                )

-            # Two separate columns: Column name (20 chars) and Data type (10 chars)
-            rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
-            rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
+                # Check if this validation failed
+                if hasattr(validation, "validation_info") and len(validation.validation_info) > i:
+                    step_info = validation.validation_info[i]
+                    if step_info.n_failed > 0:
+                        any_failed = True

-            # Sector columns: All same width, optimized for "100%" (4 chars + padding)
-            for sector in sector_columns:
-                rich_table.add_column(
-                    sector,
-                    style="cyan",
-                    justify="center",
-                    no_wrap=True,
-                    width=5,  # Fixed width optimized for percentage values
+            # Show tip about --show-extract if any failed and not already used
+            if any_failed and not show_extract:
+                console.print()
+                console.print(
+                    "[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
                )

-            # Convert data to rows with special formatting
-            rows = []
-            try:
-                if hasattr(df, "to_dicts"):
-                    data_dict = df.to_dicts()
-                elif hasattr(df, "to_dict"):
-                    data_dict = df.to_dict("records")
-                else:
-                    data_dict = []
+        # Add informational hints when using default validation (only for single check)
+        if len(checks_list) == 1 and is_using_default_check:
+            console.print()
+            console.print("[bold blue]ℹ️ Information:[/bold blue] Using default validation method")
+            console.print("To specify a different validation, use the --check option.")
+            console.print()
+            console.print("[bold magenta]Common validation options:[/bold magenta]")
+            console.print(
+                " • [bold cyan]--check rows-complete[/bold cyan] Check for rows with missing values"
+            )
+            console.print(
+                " • [bold cyan]--check col-vals-not-null[/bold cyan] Check for null values in a column [bright_black](requires --column)[/bright_black]"
+            )
+            console.print(
+                " • [bold cyan]--check col-exists[/bold cyan] Check if a column exists [bright_black](requires --column)[/bright_black]"
+            )
+            console.print()
+            console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
+            console.print(
+                f" [bright_blue]pb validate {data_source} --check rows-complete[/bright_blue]"
+            )
+            console.print(
+                f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
+            )

-            for i, row in enumerate(data_dict):
-                try:
-                    # Each row should have: [column_name, data_type, sector1, sector2, ...]
-                    column_name = str(row.get("columns", ""))
+        # Exit with appropriate code if requested
+        if exit_code and not all_passed:
+            console.print("[dim]Exiting with non-zero code due to validation failure[/dim]")
+            import sys

-                    # Truncate column name to 20 characters with ellipsis if needed
-                    if len(column_name) > 20:
-                        truncated_name = column_name[:17] + "…"
-                    else:
-                        truncated_name = column_name
+            sys.exit(1)

-                    # Get data type for this column
-                    if column_name in column_types:
-                        dtype = column_types[column_name]
-                        if len(dtype) > 10:
-                            truncated_dtype = dtype[:9] + "…"
-                        else:
-                            truncated_dtype = dtype
-                    else:
-                        truncated_dtype = "?"
+    except Exception as e:
+        console.print(f"[red]Error:[/red] {e}")
+        sys.exit(1)

-                    # Start building the row with column name and type
-                    formatted_row = [truncated_name, truncated_dtype]

-                    # Add sector values (formatted percentages)
-                    for sector in sector_columns:
-                        value = row.get(sector, 0.0)
-                        if isinstance(value, (int, float)):
-                            formatted_row.append(_format_missing_percentage(float(value)))
-                        else:
-                            formatted_row.append(str(value))
+@cli.command()
+def datasets():
+    """
+    List available built-in datasets.
+    """
+    from rich.box import SIMPLE_HEAD

-                    rows.append(formatted_row)
+    datasets_info = [
+        ("small_table", "13 rows × 8 columns", "Small demo dataset for testing"),
+        ("game_revenue", "2,000 rows × 11 columns", "Game development company revenue data"),
+        ("nycflights", "336,776 rows × 18 columns", "NYC airport flights data from 2013"),
+        ("global_sales", "50,000 rows × 20 columns", "Global sales data across regions"),
+    ]

-                except Exception as e:
-                    console.print(f"[red]Error processing row {i}:[/red] {e}")
-                    continue
+    table = Table(
+        title="Available Pointblank Datasets", show_header=True, header_style="bold magenta"
+    )

-            except Exception as e:
-                console.print(f"[red]Error extracting data:[/red] {e}")
-                rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
+    # Create the datasets table
+    table = Table(
+        title="Available Pointblank Datasets",
+        show_header=True,
+        header_style="bold magenta",
+        box=SIMPLE_HEAD,
+        title_style="bold cyan",
+        title_justify="left",
+    )

-            # Add rows to Rich table
-            for row in rows:
-                try:
-                    rich_table.add_row(*row)
-                except Exception as e:
-                    console.print(f"[red]Error adding row:[/red] {e}")
-                    break
+    table.add_column("Dataset Name", style="cyan", no_wrap=True)
+    table.add_column("Dimensions", style="green")
+    table.add_column("Description", style="white")

-            # Show the table with custom spanner header if we have sector columns
-            if sector_columns:
-                # Create a custom header line that shows the spanner
-                header_parts = []
-                header_parts.append(" " * 20)  # Space for Column header
-                header_parts.append(" " * 10)  # Space for Type header
+    for name, dims, desc in datasets_info:
+        table.add_row(name, dims, desc)

-                # Left-align "Row Sectors" with the first numbered column
-                row_sectors_text = "Row Sectors"
-                header_parts.append(row_sectors_text)
+    console.print(table)
+    console.print("\n[dim]Use these dataset names directly with any pb CLI command.[/dim]")
+    console.print("[dim]Example: pb preview small_table[/dim]")
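These names are the same ones `pb.load_dataset()` accepts (the CLI routes them through `_load_data_source()`), so any of them can stand in for a file path; for instance:

    import pointblank as pb

    tbl = pb.load_dataset("small_table")  # 13 rows × 8 columns, per the table above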
1914
|
+
console.print("[dim]Example: pb preview small_table[/dim]")
|
|
1758
1915
|
|
|
1759
|
-
# Print the custom spanner header
|
|
1760
|
-
console.print("[dim]" + " ".join(header_parts) + "[/dim]")
|
|
1761
1916
|
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1917
|
+
@cli.command()
|
|
1918
|
+
def requirements():
|
|
1919
|
+
"""
|
|
1920
|
+
Check installed dependencies and their availability.
|
|
1921
|
+
"""
|
|
1922
|
+
from rich.box import SIMPLE_HEAD
|
|
1766
1923
|
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1924
|
+
dependencies = [
|
|
1925
|
+
("polars", "Polars DataFrame support"),
|
|
1926
|
+
("pandas", "Pandas DataFrame support"),
|
|
1927
|
+
("ibis", "Ibis backend support (DuckDB, etc.)"),
|
|
1928
|
+
("duckdb", "DuckDB database support"),
|
|
1929
|
+
("pyarrow", "Parquet file support"),
|
|
1930
|
+
]
|
|
1770
1931
|
|
|
1771
|
-
|
|
1772
|
-
|
|
1932
|
+
# Create requirements table
|
|
1933
|
+
table = Table(
|
|
1934
|
+
title="Dependency Status",
|
|
1935
|
+
show_header=True,
|
|
1936
|
+
header_style="bold magenta",
|
|
1937
|
+
box=SIMPLE_HEAD,
|
|
1938
|
+
title_style="bold cyan",
|
|
1939
|
+
title_justify="left",
|
|
1940
|
+
)
|
|
1773
1941
|
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
"[dim]Symbols: [green]●[/green] = no missing values, "
|
|
1778
|
-
"[red]●[/red] = completely missing, "
|
|
1779
|
-
"<1% = less than 1% missing, "
|
|
1780
|
-
">99% = more than 99% missing[/dim]"
|
|
1781
|
-
)
|
|
1782
|
-
console.print(footer_text)
|
|
1942
|
+
table.add_column("Package", style="cyan", no_wrap=True)
|
|
1943
|
+
table.add_column("Status", style="white")
|
|
1944
|
+
table.add_column("Description", style="dim")
|
|
1783
1945
|
|
|
1946
|
+
for package, description in dependencies:
|
|
1947
|
+
if _is_lib_present(package):
|
|
1948
|
+
status = "[green]✓ Installed[/green]"
|
|
1784
1949
|
else:
|
|
1785
|
-
|
|
1786
|
-
_rich_print_gt_table(gt_table)
|
|
1950
|
+
status = "[red]✗ Not installed[/red]"
|
|
1787
1951
|
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1952
|
+
table.add_row(package, status, description)
|
|
1953
|
+
|
|
1954
|
+
console.print(table)
|
|
1955
|
+
console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
|
|
1792
1956
|
|
|
1793
1957
|
|
|
1794
1958
|
def _rich_print_scan_table(
|
|
@@ -1797,6 +1961,7 @@ def _rich_print_scan_table(
|
|
|
1797
1961
|
source_type: str,
|
|
1798
1962
|
table_type: str,
|
|
1799
1963
|
total_rows: int | None = None,
|
|
1964
|
+
total_columns: int | None = None,
|
|
1800
1965
|
) -> None:
|
|
1801
1966
|
"""
|
|
1802
1967
|
Display scan results as a Rich table in the terminal with statistical measures.
|
|
@@ -1807,6 +1972,7 @@ def _rich_print_scan_table(
|
|
|
1807
1972
|
source_type: Type of data source (e.g., "Pointblank dataset: small_table")
|
|
1808
1973
|
table_type: Type of table (e.g., "polars.LazyFrame")
|
|
1809
1974
|
total_rows: Total number of rows in the dataset
|
|
1975
|
+
total_columns: Total number of columns in the dataset
|
|
1810
1976
|
"""
|
|
1811
1977
|
try:
|
|
1812
1978
|
import re
|
|
@@ -1828,6 +1994,11 @@ def _rich_print_scan_table(
|
|
|
1828
1994
|
# Create a comprehensive title with data source, source type, and table type
|
|
1829
1995
|
title_text = f"Column Summary / {source_type} / {table_type}"
|
|
1830
1996
|
|
|
1997
|
+
# Add dimensions subtitle in gray if available
|
|
1998
|
+
if total_rows is not None and total_columns is not None:
|
|
1999
|
+
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
2000
|
+
|
|
2001
|
+
# Create the scan table
|
|
1831
2002
|
scan_table = Table(
|
|
1832
2003
|
title=title_text,
|
|
1833
2004
|
show_header=True,
|
|
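The `:,` format spec groups thousands, so for the nycflights dimensions the new subtitle renders as shown in this quick check:

    total_rows, total_columns = 336776, 18
    print(f"{total_rows:,} rows / {total_columns} columns")
    # 336,776 rows / 18 columns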
@@ -1990,6 +2161,7 @@ def _rich_print_scan_table(
                 return f"{num_val:.2f}"
             else:
                 # Very small numbers - use scientific notation
+
                 return f"{num_val:.1e}"

     except (ValueError, TypeError):
@@ -2002,83 +2174,260 @@ def _rich_print_scan_table(
         if len(str_val) > max_width:
             return str_val[: max_width - 1] + "…"

-        # General string truncation with ellipsis
-        if len(str_val) > max_width:
-            return str_val[: max_width - 1] + "…"
+        # General string truncation with ellipsis
+        if len(str_val) > max_width:
+            return str_val[: max_width - 1] + "…"
+
+        return str_val
+
+        # Populate table rows
+        num_rows = len(data_dict["colname"])
+        for i in range(num_rows):
+            row_data = []
+
+            # Column name and type from HTML content
+            colname_html = data_dict["colname"][i]
+            column_name, data_type = extract_column_info(colname_html)
+            row_data.append(column_name)
+            row_data.append(data_type)
+
+            # Missing values (NA)
+            missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
+            row_data.append(format_value(missing_val, is_missing=True, max_width=6))
+
+            # Unique values (UQ)
+            unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
+            row_data.append(format_value(unique_val, is_unique=True, max_width=8))
+
+            # Statistical columns
+            for stat_col in stat_columns:
+                stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
+                # Use appropriate width based on column type
+                if stat_col in ["q_1", "iqr"]:
+                    width = 8
+                elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
+                    width = 9
+                else:
+                    width = 8
+                row_data.append(format_value(stat_val, max_width=width))
+
+            scan_table.add_row(*row_data)
+
+        # Display the results
+        console.print()
+        console.print(scan_table)
+
+    except Exception as e:
+        # Fallback to simple message if table creation fails
+        console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
+        console.print(f"[red]Error displaying table: {str(e)}[/red]")
+
+
+def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
+    """Convert a missing values GT table to Rich table with special formatting.
+
+    Args:
+        gt_table: The GT table object for missing values
+        original_data: The original data source to extract column types
+    """
+    try:
+        # Extract the underlying data from the GT table
+        df = None
+
+        if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
+            df = gt_table._tbl_data
+        elif hasattr(gt_table, "_data") and gt_table._data is not None:
+            df = gt_table._data
+        elif hasattr(gt_table, "data") and gt_table.data is not None:
+            df = gt_table.data
+
+        if df is not None:
+            from rich.box import SIMPLE_HEAD
+
+            # Create the missing values table
+            rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
+
+            # Get column names
+            columns = []
+            try:
+                if hasattr(df, "columns"):
+                    columns = list(df.columns)
+                elif hasattr(df, "schema"):
+                    columns = list(df.schema.names)
+            except Exception as e:
+                console.print(f"[red]Error getting columns:[/red] {e}")
+                columns = []
+
+            if not columns:
+                columns = [f"Column {i + 1}" for i in range(10)]  # Fallback
+
+            # Get original data to extract column types
+            column_types = {}
+            if original_data is not None:
+                try:
+                    # Get column types from original data
+                    if hasattr(original_data, "columns"):
+                        original_columns = list(original_data.columns)
+                        column_types = _get_column_dtypes(original_data, original_columns)
+                except Exception as e:
+                    console.print(f"[red]Error getting column types:[/red] {e}")
+                    pass  # Use empty dict as fallback
+
+            # Add columns to Rich table with special formatting for missing values table
+            sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
+
+            # Two separate columns: Column name (20 chars) and Data type (10 chars)
+            rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
+            rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
+
+            # Sector columns: All same width, optimized for "100%" (4 chars + padding)
+            for sector in sector_columns:
+                rich_table.add_column(
+                    sector,
+                    style="cyan",
+                    justify="center",
+                    no_wrap=True,
+                    width=5,  # Fixed width optimized for percentage values
+                )
+
+            # Convert data to rows with special formatting
+            rows = []
+            try:
+                if hasattr(df, "to_dicts"):
+                    data_dict = df.to_dicts()
+                elif hasattr(df, "to_dict"):
+                    data_dict = df.to_dict("records")
+                else:
+                    data_dict = []
+
+                for i, row in enumerate(data_dict):
+                    try:
+                        # Each row should have: [column_name, data_type, sector1, sector2, ...]
+                        column_name = str(row.get("columns", ""))
+
+                        # Truncate column name to 20 characters with ellipsis if needed
+                        if len(column_name) > 20:
+                            truncated_name = column_name[:17] + "…"
+                        else:
+                            truncated_name = column_name
+
+                        # Get data type for this column
+                        if column_name in column_types:
+                            dtype = column_types[column_name]
+                            if len(dtype) > 10:
+                                truncated_dtype = dtype[:9] + "…"
+                            else:
+                                truncated_dtype = dtype
+                        else:
+                            truncated_dtype = "?"
+
+                        # Start building the row with column name and type
+                        formatted_row = [truncated_name, truncated_dtype]
+
+                        # Add sector values (formatted percentages)
+                        for sector in sector_columns:
+                            value = row.get(sector, 0.0)
+                            if isinstance(value, (int, float)):
+                                formatted_row.append(_format_missing_percentage(float(value)))
+                            else:
+                                formatted_row.append(str(value))
+
+                        rows.append(formatted_row)
+
+                    except Exception as e:
+                        console.print(f"[red]Error processing row {i}:[/red] {e}")
+                        continue
+
+            except Exception as e:
+                console.print(f"[red]Error extracting data:[/red] {e}")
+                rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
+
+            # Add rows to Rich table
+            for row in rows:
+                try:
+                    rich_table.add_row(*row)
+                except Exception as e:
+                    console.print(f"[red]Error adding row:[/red] {e}")
+                    break
+
+            # Show the table with custom spanner header if we have sector columns
+            if sector_columns:
+                # Create a custom header line that shows the spanner
+                header_parts = []
+                header_parts.append(" " * 20)  # Space for Column header
+                header_parts.append(" " * 10)  # Space for Type header
+
+                # Left-align "Row Sectors" with the first numbered column
+                row_sectors_text = "Row Sectors"
+                header_parts.append(row_sectors_text)

-        return str_val
+                # Print the custom spanner header
+                console.print("[dim]" + " ".join(header_parts) + "[/dim]")

-        # Populate table rows
-        num_rows = len(data_dict["colname"])
-        for i in range(num_rows):
-            row_data = []
+                # Add a horizontal rule below the spanner
+                rule_parts = []
+                rule_parts.append(" " * 20)  # Space for Column header
+                rule_parts.append(" " * 10)  # Space for Type header

-            # Column name and type from HTML content
-            colname_html = data_dict["colname"][i]
-            column_name, data_type = extract_column_info(colname_html)
-            row_data.append(column_name)
-            row_data.append(data_type)
+                # Use a fixed width horizontal rule for "Row Sectors"
+                horizontal_rule = "─" * 20
+                rule_parts.append(horizontal_rule)

-            # Missing values (NA)
-            missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
-            row_data.append(format_value(missing_val, is_missing=True, max_width=6))
+                # Print the horizontal rule
+                console.print("[dim]" + " ".join(rule_parts) + "[/dim]")

-            # Unique values (UQ)
-            unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
-            row_data.append(format_value(unique_val, is_unique=True, max_width=8))
+            # Print the Rich table (will handle terminal width automatically)
+            console.print(rich_table)
+            footer_text = (
+                "[dim]Symbols: [green]●[/green] = no missing values, "
+                "[red]●[/red] = completely missing, "
+                "<1% = less than 1% missing, "
+                ">99% = more than 99% missing[/dim]"
+            )
+            console.print(footer_text)

-            # Statistical columns
-            for stat_col in stat_columns:
-                stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
-                # Use appropriate width based on column type
-                if stat_col in ["q_1", "iqr"]:
-                    width = 8
-                elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
-                    width = 9
-                else:
-                    width = 8
-                row_data.append(format_value(stat_val, max_width=width))
+        else:
+            # Fallback to regular table display
+            _rich_print_gt_table(gt_table)

-            scan_table.add_row(*row_data)
+    except Exception as e:
+        console.print(f"[red]Error rendering missing values table:[/red] {e}")
+        # Fallback to regular table display
+        _rich_print_gt_table(gt_table)

-        # Display the results
-        console.print()
-        console.print(scan_table)  # Add informational footer about the scan scope
-        try:
-            if total_rows is not None:
-                # Full table scan
-                footer_text = f"[dim]Scan from all {total_rows:,} rows in the table.[/dim]"
-
-                # Create a simple footer
-                footer_table = Table(
-                    show_header=False,
-                    show_lines=False,
-                    box=None,
-                    padding=(0, 0),
-                )
-                footer_table.add_column("", style="dim", width=80)
-                footer_table.add_row(footer_text)
-                console.print(footer_table)

-
-
-
+def _map_parameters_to_checks(
+    checks_list: list[str], columns_list: list[str], sets_list: list[str], values_list: list[float]
+) -> tuple[list[str], list[str], list[float]]:
+    """
+    Map parameters to checks intelligently, handling flexible parameter ordering.

-        except Exception as e:
-            # Fallback to simple message if table creation fails
-            console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
-            console.print(f"[red]Error displaying table: {str(e)}[/red]")
+    This function distributes the provided parameters across checks based on what each check needs.
+    For checks that don't need certain parameters, None/empty values are assigned.

+    Args:
+        checks_list: List of validation check types
+        columns_list: List of column names provided by user
+        sets_list: List of set values provided by user
+        values_list: List of numeric values provided by user

-
-
-
-    "
-
-
-
+    Returns:
+        Tuple of (mapped_columns, mapped_sets, mapped_values) where each list
+        has the same length as checks_list
+    """
+    mapped_columns = []
+    mapped_sets = []
+    mapped_values = []
+
+    # Keep track of which parameters we've used
+    column_index = 0
+    set_index = 0
+    value_index = 0
+
+    for check in checks_list:
+        # Determine what parameters this check needs
+        needs_column = check in [
            "col-vals-not-null",
-            "rows-complete",
            "col-exists",
            "col-vals-in-set",
            "col-vals-gt",
@@ -2086,652 +2435,1210 @@ def _rich_print_scan_table(
            "col-vals-lt",
            "col-vals-le",
        ]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
-
-    )
-
-
+        needs_set = check == "col-vals-in-set"
+        needs_value = check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]
+
+        # Assign column parameter if needed
+        if needs_column:
+            if column_index < len(columns_list):
+                mapped_columns.append(columns_list[column_index])
+                column_index += 1
+            else:
+                mapped_columns.append(None)  # Will cause validation error later
+        else:
+            mapped_columns.append(None)
+
+        # Assign set parameter if needed
+        if needs_set:
+            if set_index < len(sets_list):
+                mapped_sets.append(sets_list[set_index])
+                set_index += 1
+            else:
+                mapped_sets.append(None)  # Will cause validation error later
+        else:
+            mapped_sets.append(None)
+
+        # Assign value parameter if needed
+        if needs_value:
+            if value_index < len(values_list):
+                mapped_values.append(values_list[value_index])
+                value_index += 1
+            else:
+                mapped_values.append(None)  # Will cause validation error later
+        else:
+            mapped_values.append(None)
+
+    return mapped_columns, mapped_sets, mapped_values
+
+
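Because each parameter list is consumed in order, and only by the checks that declare a need for it, a mixed command line maps deterministically. A small worked example of the function above:

    checks = ["rows-distinct", "col-vals-gt", "col-vals-in-set"]
    cols, sets_, vals = _map_parameters_to_checks(
        checks, ["age", "status"], ["active,inactive"], [18.0]
    )
    # rows-distinct consumes nothing; col-vals-gt takes the first --column and
    # the first --value; col-vals-in-set takes the next --column and first --set.
    assert cols == [None, "age", "status"]
    assert sets_ == [None, None, "active,inactive"]
    assert vals == [None, 18.0, None]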
2473
|
+
|
|
2474
|
+
def _resolve_column_indices(columns_list, data):
|
|
2475
|
+
"""
|
|
2476
|
+
Replace any '#N' entries in columns_list with the actual column name from data (1-based).
|
|
2477
|
+
"""
|
|
2478
|
+
# Get column names from the data
|
|
2479
|
+
if hasattr(data, "columns"):
|
|
2480
|
+
all_columns = list(data.columns)
|
|
2481
|
+
elif hasattr(data, "schema"):
|
|
2482
|
+
all_columns = list(data.schema.names)
|
|
2483
|
+
else:
|
|
2484
|
+
return columns_list # Can't resolve, return as-is
|
|
2485
|
+
|
|
2486
|
+
resolved = []
|
|
2487
|
+
for col in columns_list:
|
|
2488
|
+
if isinstance(col, str) and col.startswith("#"):
|
|
2489
|
+
try:
|
|
2490
|
+
idx = int(col[1:]) - 1 # 1-based to 0-based
|
|
2491
|
+
if 0 <= idx < len(all_columns):
|
|
2492
|
+
resolved.append(all_columns[idx])
|
|
2493
|
+
else:
|
|
2494
|
+
resolved.append(col) # Out of range, keep as-is
|
|
2495
|
+
except Exception:
|
|
2496
|
+
resolved.append(col) # Not a valid number, keep as-is
|
|
2497
|
+
else:
|
|
2498
|
+
resolved.append(col)
|
|
2499
|
+
return resolved
|
|
2500
|
+
|
|
2501
|
+
|
|
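The `#N` references are 1-based column positions, resolved against anything exposing `.columns` or `.schema.names`. A quick example against a small Polars frame:

    import polars as pl

    df = pl.DataFrame({"id": [1], "email": ["a@b.co"], "score": [10]})
    resolved = _resolve_column_indices(["#2", "score", "#9"], df)
    # "#2" -> second column; plain names pass through; out-of-range "#9" stays as-is
    assert resolved == ["email", "score", "#9"]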
2502
|
+
def _display_validation_result(
|
|
2503
|
+
validation: Any,
|
|
2504
|
+
checks_list: list[str],
|
|
2505
|
+
columns_list: list[str],
|
|
2506
|
+
sets_list: list[str],
|
|
2507
|
+
values_list: list[float],
|
|
2508
|
+
data_source: str,
|
|
2509
|
+
step_index: int,
|
|
2510
|
+
total_checks: int,
|
|
2511
|
+
show_extract: bool,
|
|
2512
|
+
write_extract: str | None,
|
|
2513
|
+
limit: int,
|
|
2514
|
+
) -> None:
|
|
2515
|
+
"""Display a single validation result with proper formatting for single or multiple checks."""
|
|
2516
|
+
from rich.box import SIMPLE_HEAD
|
|
2517
|
+
|
|
2518
|
+
# Get parameters for this specific check
|
|
2519
|
+
check = checks_list[step_index]
|
|
2520
|
+
column = columns_list[step_index] if step_index < len(columns_list) else None
|
|
2521
|
+
set_val = sets_list[step_index] if step_index < len(sets_list) else None
|
|
2522
|
+
value = values_list[step_index] if step_index < len(values_list) else None
|
|
2523
|
+
|
|
2524
|
+
# Get validation step info
|
|
2525
|
+
step_info = None
|
|
2526
|
+
if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
|
|
2527
|
+
step_info = validation.validation_info[step_index]
|
|
2528
|
+
|
|
2529
|
+
# Create friendly title for table
|
|
2530
|
+
if total_checks == 1:
|
|
2531
|
+
# Single check - use original title format
|
|
2532
|
+
if check == "rows-distinct":
|
|
2533
|
+
table_title = "Validation Result: Rows Distinct"
|
|
2534
|
+
elif check == "col-vals-not-null":
|
|
2535
|
+
table_title = "Validation Result: Column Values Not Null"
|
|
2536
|
+
elif check == "rows-complete":
|
|
2537
|
+
table_title = "Validation Result: Rows Complete"
|
|
2538
|
+
elif check == "col-exists":
|
|
2539
|
+
table_title = "Validation Result: Column Exists"
|
|
2540
|
+
elif check == "col-vals-in-set":
|
|
2541
|
+
table_title = "Validation Result: Column Values In Set"
|
|
2542
|
+
elif check == "col-vals-gt":
|
|
2543
|
+
table_title = "Validation Result: Column Values Greater Than"
|
|
2544
|
+
elif check == "col-vals-ge":
|
|
2545
|
+
table_title = "Validation Result: Column Values Greater Than Or Equal"
|
|
2546
|
+
elif check == "col-vals-lt":
|
|
2547
|
+
table_title = "Validation Result: Column Values Less Than"
|
|
2548
|
+
elif check == "col-vals-le":
|
|
2549
|
+
table_title = "Validation Result: Column Values Less Than Or Equal"
|
|
2550
|
+
else:
|
|
2551
|
+
table_title = f"Validation Result: {check.replace('-', ' ').title()}"
|
|
2552
|
+
else:
|
|
2553
|
+
# Multiple checks - add numbering
|
|
2554
|
+
if check == "rows-distinct":
|
|
2555
|
+
base_title = "Rows Distinct"
|
|
2556
|
+
elif check == "col-vals-not-null":
|
|
2557
|
+
base_title = "Column Values Not Null"
|
|
2558
|
+
elif check == "rows-complete":
|
|
2559
|
+
base_title = "Rows Complete"
|
|
2560
|
+
elif check == "col-exists":
|
|
2561
|
+
base_title = "Column Exists"
|
|
2562
|
+
elif check == "col-vals-in-set":
|
|
2563
|
+
base_title = "Column Values In Set"
|
|
2564
|
+
elif check == "col-vals-gt":
|
|
2565
|
+
base_title = "Column Values Greater Than"
|
|
2566
|
+
elif check == "col-vals-ge":
|
|
2567
|
+
base_title = "Column Values Greater Than Or Equal"
|
|
2568
|
+
elif check == "col-vals-lt":
|
|
2569
|
+
base_title = "Column Values Less Than"
|
|
2570
|
+
elif check == "col-vals-le":
|
|
2571
|
+
base_title = "Column Values Less Than Or Equal"
|
|
2572
|
+
else:
|
|
2573
|
+
base_title = check.replace("-", " ").title()
|
|
2574
|
+
|
|
2575
|
+
table_title = f"Validation Result ({step_index + 1} of {total_checks}): {base_title}"
|
|
2576
|
+
|
|
2577
|
+
# Create the validation results table
|
|
2578
|
+
result_table = Table(
|
|
2579
|
+
title=table_title,
|
|
2580
|
+
show_header=True,
|
|
2581
|
+
header_style="bold magenta",
|
|
2582
|
+
box=SIMPLE_HEAD,
|
|
2583
|
+
title_style="bold cyan",
|
|
2584
|
+
title_justify="left",
|
|
2585
|
+
)
|
|
2586
|
+
result_table.add_column("Property", style="cyan", no_wrap=True)
|
|
2587
|
+
result_table.add_column("Value", style="white")
|
|
2588
|
+
|
|
2589 +     # Add basic info
2590 +     result_table.add_row("Data Source", data_source)
2591 +     result_table.add_row("Check Type", check)
2592 +
2593 +     # Add column info for column-specific checks
2594 +     if check in [
2595 +         "col-vals-not-null",
2596 +         "col-exists",
2597 +         "col-vals-in-set",
2598 +         "col-vals-gt",
2599 +         "col-vals-ge",
2600 +         "col-vals-lt",
2601 +         "col-vals-le",
2602 +     ]:
2603 +         result_table.add_row("Column", column)
2604 +
2605 +     # Add set info for col-vals-in-set check
2606 +     if check == "col-vals-in-set" and set_val:
2607 +         allowed_values = [v.strip() for v in set_val.split(",")]
2608 +         result_table.add_row("Allowed Values", ", ".join(allowed_values))
2609 +
2610 +     # Add value info for range checks
2611 +     if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"] and value is not None:
2612 +         if check == "col-vals-gt":
2613 +             operator = ">"
2614 +         elif check == "col-vals-ge":
2615 +             operator = ">="
2616 +         elif check == "col-vals-lt":
2617 +             operator = "<"
2618 +         elif check == "col-vals-le":
2619 +             operator = "<="
2620 +         result_table.add_row("Threshold", f"{operator} {value}")
2621 +
2622 +     # Get validation details
2623 +     if step_info:
2624 +         result_table.add_row("Total Rows Tested", f"{step_info.n:,}")
2625 +         result_table.add_row("Passing Rows", f"{step_info.n_passed:,}")
2626 +         result_table.add_row("Failing Rows", f"{step_info.n_failed:,}")
2627 +
2628 +         # Check if this step passed
2629 +         step_passed = step_info.n_failed == 0
2630 +
2631 +         # Overall result with color coding
2632 +         if step_passed:
2633 +             result_table.add_row("Result", "[green]✓ PASSED[/green]")
2634 +             if check == "rows-distinct":
2635 +                 result_table.add_row("Duplicate Rows", "[green]None found[/green]")
2636 +             elif check == "col-vals-not-null":
2637 +                 result_table.add_row("Null Values", "[green]None found[/green]")
2638 +             elif check == "rows-complete":
2639 +                 result_table.add_row("Incomplete Rows", "[green]None found[/green]")
2640 +             elif check == "col-exists":
2641 +                 result_table.add_row("Column Status", "[green]Column exists[/green]")
2642 +             elif check == "col-vals-in-set":
2643 +                 result_table.add_row("Values Status", "[green]All values in allowed set[/green]")
2644 +             elif check == "col-vals-gt":
2645 +                 result_table.add_row("Values Status", f"[green]All values > {value}[/green]")
2646 +             elif check == "col-vals-ge":
2647 +                 result_table.add_row("Values Status", f"[green]All values >= {value}[/green]")
2648 +             elif check == "col-vals-lt":
2649 +                 result_table.add_row("Values Status", f"[green]All values < {value}[/green]")
2650 +             elif check == "col-vals-le":
2651 +                 result_table.add_row("Values Status", f"[green]All values <= {value}[/green]")
2652 +         else:
2653 +             result_table.add_row("Result", "[red]✗ FAILED[/red]")
2654 +             if check == "rows-distinct":
2655 +                 result_table.add_row("Duplicate Rows", f"[red]{step_info.n_failed:,} found[/red]")
2656 +             elif check == "col-vals-not-null":
2657 +                 result_table.add_row("Null Values", f"[red]{step_info.n_failed:,} found[/red]")
2658 +             elif check == "rows-complete":
2659 +                 result_table.add_row("Incomplete Rows", f"[red]{step_info.n_failed:,} found[/red]")
2660 +             elif check == "col-exists":
2661 +                 result_table.add_row("Column Status", "[red]Column does not exist[/red]")
2662 +             elif check == "col-vals-in-set":
2663 +                 result_table.add_row("Invalid Values", f"[red]{step_info.n_failed:,} found[/red]")
2664 +             elif check == "col-vals-gt":
2665 +                 result_table.add_row(
2666 +                     "Invalid Values", f"[red]{step_info.n_failed:,} values <= {value}[/red]"
2667 +                 )
2668 +             elif check == "col-vals-ge":
2669 +                 result_table.add_row(
2670 +                     "Invalid Values", f"[red]{step_info.n_failed:,} values < {value}[/red]"
2671 +                 )
2672 +             elif check == "col-vals-lt":
2673 +                 result_table.add_row(
2674 +                     "Invalid Values", f"[red]{step_info.n_failed:,} values >= {value}[/red]"
2675 +                 )
2676 +             elif check == "col-vals-le":
2677 +                 result_table.add_row(
2678 +                     "Invalid Values", f"[red]{step_info.n_failed:,} values > {value}[/red]"
2679 +                 )
2680 +
2681 +     console.print()
2682 +     console.print(result_table)
2683 +
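
The operator selection above is a four-way if/elif on the check name. An equivalent table-driven form (a sketch for illustration, not code from this release) would be:

    OPERATOR_FOR_CHECK = {
        "col-vals-gt": ">",
        "col-vals-ge": ">=",
        "col-vals-lt": "<",
        "col-vals-le": "<=",
    }

    # Same behavior as the if/elif chain in the hunk above
    operator = OPERATOR_FOR_CHECK.get(check)
    if operator is not None and value is not None:
        result_table.add_row("Threshold", f"{operator} {value}")
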
2684 +     # Show extract and summary for single check only, or if this is a failed step in multiple checks
2685 +     if total_checks == 1:
2686 +         # For single check, show extract and summary as before
2687 +         _show_extract_and_summary(
2688 +             validation,
2689 +             check,
2690 +             column,
2691 +             set_val,
2692 +             value,
2693 +             data_source,
2694 +             step_index,
2695 +             step_info,
2696 +             show_extract,
2697 +             write_extract,
2698 +             limit,
2699 +         )
2700 +     else:
2701 +         # For multiple checks, show summary panel and handle extract if needed
2702 +         if step_info:
2703 +             step_passed = step_info.n_failed == 0
2704 +             if step_passed:
2705 +                 # Create success message for this step
2706 +                 if check == "rows-distinct":
2707 +                     success_message = f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
2708 +                 elif check == "col-vals-not-null":
2709 +                     success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
2710 +                 elif check == "rows-complete":
2711 +                     success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
2712 +                 elif check == "col-exists":
2713 +                     success_message = f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
2714 +                 elif check == "col-vals-in-set":
2715 +                     success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
2716 +                 elif check == "col-vals-gt":
2717 +                     success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
2718 +                 elif check == "col-vals-ge":
2719 +                     success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
2720 +                 elif check == "col-vals-lt":
2721 +                     success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
2722 +                 elif check == "col-vals-le":
2723 +                     success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
2724 +                 else:
2725 +                     success_message = f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
2726 +
2727 +                 console.print(
2728 +                     Panel(
2729 +                         success_message,
2730 +                         border_style="green",
2731 +                     )
2732 +                 )
2733 +             else:
2734 +                 # Create failure message for this step (without tip)
2735 +                 if check == "rows-distinct":
2736 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
2737 +                 elif check == "col-vals-not-null":
2738 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
2739 +                 elif check == "rows-complete":
2740 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
2741 +                 elif check == "col-exists":
2742 +                     failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
2743 +                 elif check == "col-vals-in-set":
2744 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
2745 +                 elif check == "col-vals-gt":
2746 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
2747 +                 elif check == "col-vals-ge":
2748 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
2749 +                 elif check == "col-vals-lt":
2750 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
2751 +                 elif check == "col-vals-le":
2752 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
2753 +                 else:
2754 +                     failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
2755 +
2756 +                 console.print(
2757 +                     Panel(
2758 +                         failure_message,
2759 +                         border_style="red",
2760 +                     )
2761 +                 )
2762 +
2763 +         # For multiple checks, show extract if requested and this step failed
2764 +         if (show_extract or write_extract) and not step_passed:
2765 +             _show_extract_for_multi_check(
2766 +                 validation,
2767 +                 check,
2768 +                 column,
2769 +                 set_val,
2770 +                 value,
2771 +                 data_source,
2772 +                 step_index,
2773 +                 step_info,
2774 +                 show_extract,
2775 +                 write_extract,
2776 +                 limit,
2777 +             )
2778 +
2779 +
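
The success/failure branches above wrap their messages in Rich panels. A minimal self-contained reproduction of that output pattern, assuming only that the `rich` package is installed:

    from rich.console import Console
    from rich.panel import Panel

    console = Console()
    # Green-bordered panel, as in the PASSED branch above
    console.print(Panel("[green]✓ Validation PASSED[/green]", border_style="green"))
    # Red-bordered panel, as in the FAILED branch above
    console.print(Panel("[red]✗ Validation FAILED[/red]", border_style="red"))
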
2780 + def _show_extract_for_multi_check(
2781 +     validation: Any,
2782 +     check: str,
2783 +     column: str | None,
2784 +     set_val: str | None,
2785 +     value: float | None,
2111 2786     data_source: str,
2787 +     step_index: int,
2788 +     step_info: Any,
2789 +     show_extract: bool,
2790 +     write_extract: str | None,
2791 +     limit: int,
2792 + ) -> None:
2793 +     """Show extract for a single validation step in multiple checks scenario."""
2794 +     # Dynamic message based on check type
2795 +     if check == "rows-distinct":
2796 +         extract_message = "[yellow]Extract of failing rows (duplicates):[/yellow]"
2797 +         row_type = "duplicate rows"
2798 +     elif check == "rows-complete":
2799 +         extract_message = "[yellow]Extract of failing rows (incomplete rows):[/yellow]"
2800 +         row_type = "incomplete rows"
2801 +     elif check == "col-exists":
2802 +         extract_message = f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
2803 +         row_type = "missing column"
2804 +     elif check == "col-vals-not-null":
2805 +         extract_message = f"[yellow]Extract of failing rows (null values in '{column}'):[/yellow]"
2806 +         row_type = "rows with null values"
2807 +     elif check == "col-vals-in-set":
2808 +         extract_message = (
2809 +             f"[yellow]Extract of failing rows (invalid values in '{column}'):[/yellow]"
2810 +         )
2811 +         row_type = "rows with invalid values"
2812 +     elif check == "col-vals-gt":
2813 +         extract_message = (
2814 +             f"[yellow]Extract of failing rows (values in '{column}' <= {value}):[/yellow]"
2815 +         )
2816 +         row_type = f"rows with values <= {value}"
2817 +     elif check == "col-vals-ge":
2818 +         extract_message = (
2819 +             f"[yellow]Extract of failing rows (values in '{column}' < {value}):[/yellow]"
2820 +         )
2821 +         row_type = f"rows with values < {value}"
2822 +     elif check == "col-vals-lt":
2823 +         extract_message = (
2824 +             f"[yellow]Extract of failing rows (values in '{column}' >= {value}):[/yellow]"
2825 +         )
2826 +         row_type = f"rows with values >= {value}"
2827 +     elif check == "col-vals-le":
2828 +         extract_message = (
2829 +             f"[yellow]Extract of failing rows (values in '{column}' > {value}):[/yellow]"
2830 +         )
2831 +         row_type = f"rows with values > {value}"
2832 +     else:
2833 +         extract_message = "[yellow]Extract of failing rows:[/yellow]"
2834 +         row_type = "failing rows"
2835 +
2836 +     if show_extract:
2837 +         console.print()
2838 +         console.print(extract_message)
2839 +
2840 +     # Special handling for col-exists check - no rows to show when column doesn't exist
2841 +     if check == "col-exists":
2842 +         if show_extract:
2843 +             console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
2844 +             console.print(
2845 +                 "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
2846 +             )
2847 +         if write_extract:
2848 +             console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
2849 +     else:
2850 +         try:
2851 +             # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
2852 +             failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
2853 +
2854 +             if failing_rows is not None and len(failing_rows) > 0:
2855 +                 if show_extract:
2856 +                     # Limit the number of rows shown
2857 +                     if len(failing_rows) > limit:
2858 +                         display_rows = failing_rows.head(limit)
2859 +                         console.print(
2860 +                             f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
2861 +                         )
2862 +                     else:
2863 +                         display_rows = failing_rows
2864 +                         console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")
2865 +
2866 +                     # Create a preview table using pointblank's preview function
2867 +                     import pointblank as pb
2868 +
2869 +                     preview_table = pb.preview(
2870 +                         data=display_rows,
2871 +                         n_head=min(limit, len(display_rows)),
2872 +                         n_tail=0,
2873 +                         limit=limit,
2874 +                         show_row_numbers=True,
2875 +                     )
2876 +
2877 +                     # Display using our Rich table function
2878 +                     _rich_print_gt_table(preview_table, show_summary=False)
2879 +
2880 +                 if write_extract:
2881 +                     try:
2882 +                         from pathlib import Path
2883 +
2884 +                         folder_name = write_extract
2885 +
2886 +                         # Create the output folder
2887 +                         output_folder = Path(folder_name)
2888 +                         output_folder.mkdir(parents=True, exist_ok=True)
2889 +
2890 +                         # Create safe filename from check type
2891 +                         safe_check_type = check.replace("-", "_")
2892 +                         filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
2893 +                         filepath = output_folder / filename
2894 +
2895 +                         # Limit the output if needed
2896 +                         write_rows = failing_rows
2897 +                         if len(failing_rows) > limit:
2898 +                             write_rows = failing_rows.head(limit)
2899 +
2900 +                         # Save to CSV
2901 +                         if hasattr(write_rows, "write_csv"):
2902 +                             # Polars
2903 +                             write_rows.write_csv(str(filepath))
2904 +                         elif hasattr(write_rows, "to_csv"):
2905 +                             # Pandas
2906 +                             write_rows.to_csv(str(filepath), index=False)
2907 +                         else:
2908 +                             # Try converting to pandas as fallback
2909 +                             import pandas as pd
2910 +
2911 +                             pd_data = pd.DataFrame(write_rows)
2912 +                             pd_data.to_csv(str(filepath), index=False)
2913 +
2914 +                         rows_saved = len(write_rows) if hasattr(write_rows, "__len__") else limit
2915 +                         console.print(
2916 +                             f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
2917 +                         )
2918 +                         console.print(f"[dim]  - {filename}: {rows_saved} rows[/dim]")
2919 +                     except Exception as e:
2920 +                         console.print(f"[yellow]Warning: Could not save failing rows: {e}[/yellow]")
2921 +             else:
2922 +                 if show_extract:
2923 +                     console.print("[yellow]No failing rows could be extracted[/yellow]")
2924 +                 if write_extract:
2925 +                     console.print("[yellow]No failing rows could be extracted to save[/yellow]")
2926 +         except Exception as e:
2927 +             if show_extract:
2928 +                 console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
2929 +             if write_extract:
2930 +                 console.print(f"[yellow]Could not extract failing rows to save: {e}[/yellow]")
2931 +
2932 +
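
`_show_extract_for_multi_check()` leans on `Validate.get_data_extracts()`, which the code treats as 1-indexed per step (hence `step_index + 1`). A minimal sketch of that call outside the CLI, using the built-in `small_table` dataset (the rule itself is a made-up example):

    import pointblank as pb

    validation = (
        pb.Validate(data=pb.load_dataset("small_table"))
        .col_vals_gt(columns="d", value=1000)  # hypothetical rule; adjust to your data
        .interrogate()
    )

    # Extract step indices are 1-based, matching the i=step_index + 1 calls above
    failing_rows = validation.get_data_extracts(i=1, frame=True)
    if failing_rows is not None and len(failing_rows) > 0:
        print(failing_rows.head(5))
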
2933 + def _show_extract_and_summary(
2934 +     validation: Any,
2112 2935     check: str,
2113 2936     column: str | None,
2114 -
2937 +     set_val: str | None,
2115 2938     value: float | None,
2939 +     data_source: str,
2940 +     step_index: int,
2941 +     step_info: Any,
2116 2942     show_extract: bool,
2943 +     write_extract: str | None,
2117 2944     limit: int,
2118 -
2119 - )
2120 -
2121 - Perform simple, single-step validations directly from the command line.
2945 + ) -> None:
2946 +     """Show extract and summary for a validation step (used for single checks)."""
2947 +     step_passed = step_info.n_failed == 0 if step_info else True
2122 2948
2123 -
2124 -
2949 +     # Show extract if requested and validation failed
2950 +     if (show_extract or write_extract) and not step_passed:
2951 +         console.print()
2125 2952
2126 -
2953 +         # Dynamic message based on check type
2954 +         if check == "rows-distinct":
2955 +             extract_message = "[yellow]Extract of failing rows (duplicates):[/yellow]"
2956 +             row_type = "duplicate rows"
2957 +         elif check == "rows-complete":
2958 +             extract_message = "[yellow]Extract of failing rows (incomplete rows):[/yellow]"
2959 +             row_type = "incomplete rows"
2960 +         elif check == "col-exists":
2961 +             extract_message = f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
2962 +             row_type = "missing column"
2963 +         elif check == "col-vals-not-null":
2964 +             extract_message = (
2965 +                 f"[yellow]Extract of failing rows (null values in '{column}'):[/yellow]"
2966 +             )
2967 +             row_type = "rows with null values"
2968 +         elif check == "col-vals-in-set":
2969 +             extract_message = (
2970 +                 f"[yellow]Extract of failing rows (invalid values in '{column}'):[/yellow]"
2971 +             )
2972 +             row_type = "rows with invalid values"
2973 +         elif check == "col-vals-gt":
2974 +             extract_message = (
2975 +                 f"[yellow]Extract of failing rows (values in '{column}' <= {value}):[/yellow]"
2976 +             )
2977 +             row_type = f"rows with values <= {value}"
2978 +         elif check == "col-vals-ge":
2979 +             extract_message = (
2980 +                 f"[yellow]Extract of failing rows (values in '{column}' < {value}):[/yellow]"
2981 +             )
2982 +             row_type = f"rows with values < {value}"
2983 +         elif check == "col-vals-lt":
2984 +             extract_message = (
2985 +                 f"[yellow]Extract of failing rows (values in '{column}' >= {value}):[/yellow]"
2986 +             )
2987 +             row_type = f"rows with values >= {value}"
2988 +         elif check == "col-vals-le":
2989 +             extract_message = (
2990 +                 f"[yellow]Extract of failing rows (values in '{column}' > {value}):[/yellow]"
2991 +             )
2992 +             row_type = f"rows with values > {value}"
2993 +         else:
2994 +             extract_message = "[yellow]Extract of failing rows:[/yellow]"
2995 +             row_type = "failing rows"
2127 2996
2128 -
2129 -
2130 - - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
2131 - - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
2132 - - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
2997 +         if show_extract:
2998 +             console.print(extract_message)
2133 2999
2134 -
3000 +         # Special handling for col-exists check - no rows to show when column doesn't exist
3001 +         if check == "col-exists" and not step_passed:
3002 +             if show_extract:
3003 +                 console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
3004 +                 console.print(
3005 +                     "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
3006 +                 )
3007 +             if write_extract:
3008 +                 console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
3009 +         else:
3010 +             try:
3011 +                 # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
3012 +                 failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
2135 3013
2136-2145 -
3014 +                 if failing_rows is not None and len(failing_rows) > 0:
3015 +                     if show_extract:
3016 +                         # Limit the number of rows shown
3017 +                         if len(failing_rows) > limit:
3018 +                             display_rows = failing_rows.head(limit)
3019 +                             console.print(
3020 +                                 f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
3021 +                             )
3022 +                         else:
3023 +                             display_rows = failing_rows
3024 +                             console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")
2146 3025
2147 -
3026 +                         # Create a preview table using pointblank's preview function
3027 +                         import pointblank as pb
2148 3028
2149-2155 -
2156 - pb validate-simple data.csv --check col-vals-gt --column score --value 50
2157 - pb validate-simple data.csv --check col-vals-in-set --column status --set "active,inactive,pending"
2158 - """
2159 - try:
2160 - # Validate required parameters for different check types
2161 - if check == "col-vals-not-null" and not column:
2162 - console.print(f"[red]Error:[/red] --column is required for {check} check")
2163 - console.print(
2164 - "Example: pb validate-simple data.csv --check col-vals-not-null --column email"
2165 - )
2166 - sys.exit(1)
2167 - sys.exit(1)
3029 +                         preview_table = pb.preview(
3030 +                             data=display_rows,
3031 +                             n_head=min(limit, len(display_rows)),
3032 +                             n_tail=0,
3033 +                             limit=limit,
3034 +                             show_row_numbers=True,
3035 +                         )
2168 3036
2169-2172 -
3037 +                         # Display using our Rich table function
3038 +                         _rich_print_gt_table(preview_table, show_summary=False)
3039 +
3040 +                     if write_extract:
3041 +                         try:
3042 +                             from pathlib import Path
3043 +
3044 +                             folder_name = write_extract
3045 +
3046 +                             # Create the output folder
3047 +                             output_folder = Path(folder_name)
3048 +                             output_folder.mkdir(parents=True, exist_ok=True)
3049 +
3050 +                             # Create safe filename from check type
3051 +                             safe_check_type = check.replace("-", "_")
3052 +                             filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
3053 +                             filepath = output_folder / filename
3054 +
3055 +                             # Limit the output if needed
3056 +                             write_rows = failing_rows
3057 +                             if len(failing_rows) > limit:
3058 +                                 write_rows = failing_rows.head(limit)
3059 +
3060 +                             # Save to CSV
3061 +                             if hasattr(write_rows, "write_csv"):
3062 +                                 # Polars
3063 +                                 write_rows.write_csv(str(filepath))
3064 +                             elif hasattr(write_rows, "to_csv"):
3065 +                                 # Pandas
3066 +                                 write_rows.to_csv(str(filepath), index=False)
3067 +                             else:
3068 +                                 # Try converting to pandas as fallback
3069 +                                 import pandas as pd
2173 3070
2174 -
2175 -
2176 - console.print(
2177 - "Example: pb validate-simple data.csv --check col-vals-in-set --column status --set 'active,inactive'"
2178 - )
2179 - sys.exit(1)
3071 +                             pd_data = pd.DataFrame(write_rows)
3072 +                             pd_data.to_csv(str(filepath), index=False)
2180 3073
2181-2186 -
3074 +                             rows_saved = (
3075 +                                 len(write_rows) if hasattr(write_rows, "__len__") else limit
3076 +                             )
3077 +                             console.print(
3078 +                                 f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
3079 +                             )
3080 +                             console.print(f"[dim]  - {filename}: {rows_saved} rows[/dim]")
3081 +                         except Exception as e:
3082 +                             console.print(
3083 +                                 f"[yellow]Warning: Could not save failing rows: {e}[/yellow]"
3084 +                             )
3085 +                 else:
3086 +                     if show_extract:
3087 +                         console.print("[yellow]No failing rows could be extracted[/yellow]")
3088 +                     if write_extract:
3089 +                         console.print("[yellow]No failing rows could be extracted to save[/yellow]")
3090 +             except Exception as e:
3091 +                 if show_extract:
3092 +                     console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
3093 +                 if write_extract:
3094 +                     console.print(f"[yellow]Could not extract failing rows to save: {e}[/yellow]")
2187 3095
2188-2191 -
3096 +     # Summary message
3097 +     console.print()
3098 +     if step_passed:
3099 +         if check == "rows-distinct":
3100 +             success_message = (
3101 +                 f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
2192 3102             )
2193-2198 -
3103 +         elif check == "col-vals-not-null":
3104 +             success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
3105 +         elif check == "rows-complete":
3106 +             success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
3107 +         elif check == "col-exists":
3108 +             success_message = (
3109 +                 f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
2199 3110             )
2200-2205 -
3111 +         elif check == "col-vals-in-set":
3112 +             success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
3113 +         elif check == "col-vals-gt":
3114 +             success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
3115 +         elif check == "col-vals-ge":
3116 +             success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
3117 +         elif check == "col-vals-lt":
3118 +             success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
3119 +         elif check == "col-vals-le":
3120 +             success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
3121 +         else:
3122 +             success_message = (
3123 +                 f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
2206 3124             )
2207 - sys.exit(1)
2208 3125
2209-2214 -
3126 +         console.print(Panel(success_message, border_style="green"))
3127 +     else:
3128 +         if step_info:
3129 +             if check == "rows-distinct":
3130 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
3131 +             elif check == "col-vals-not-null":
3132 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
3133 +             elif check == "rows-complete":
3134 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
3135 +             elif check == "col-exists":
3136 +                 failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
3137 +             elif check == "col-vals-in-set":
3138 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
3139 +             elif check == "col-vals-gt":
3140 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
3141 +             elif check == "col-vals-ge":
3142 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
3143 +             elif check == "col-vals-lt":
3144 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
3145 +             elif check == "col-vals-le":
3146 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
3147 +             else:
3148 +                 failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
2215 3149
2216-2218 -
2219 - "Example: pb validate-simple data.csv --check col-vals-lt --column age --value 65"
2220 - )
2221 - sys.exit(1)
3150 +             # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
3151 +             if not show_extract and check != "col-exists":
3152 +                 failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
2222 3153
2223-2228 -
3154 +             console.print(Panel(failure_message, border_style="red"))
3155 +         else:
3156 +             if check == "rows-distinct":
3157 +                 failure_message = (
3158 +                     f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
3159 +                 )
3160 +             elif check == "rows-complete":
3161 +                 failure_message = (
3162 +                     f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
3163 +                 )
3164 +             else:
3165 +                 failure_message = (
3166 +                     f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
3167 +                 )
2229 3168
2230-2232 -
2233 - "Example: pb validate-simple data.csv --check col-vals-le --column score --value 100"
2234 - )
2235 - sys.exit(1)
3169 +             # Add hint about --show-extract if not already used
3170 +             if not show_extract:
3171 +                 failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
2236 3172
2237 -
2238 - console.print(f"[red]Error:[/red] --value is required for {check} check")
2239 - console.print(
2240 - "Example: pb validate-simple data.csv --check col-vals-le --column score --value 100"
2241 - )
2242 - sys.exit(1)
3173 +             console.print(Panel(failure_message, border_style="red"))
2243 3174
2244 - with console.status("[bold green]Loading data..."):
2245 - # Try to load as a pointblank dataset first
2246 - if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
2247 - data = pb.load_dataset(data_source)
2248 - console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
2249 - else:
2250 - # Assume it's a file path or connection string
2251 - data = data_source
2252 - console.print(f"[green]✓[/green] Loaded data source: {data_source}")
2253 3175
2254-2258 -
2259 - pb.Validate(
2260 - data=data,
2261 - tbl_name=f"Data from {data_source}",
2262 - label=f"CLI Simple Validation: {check}",
2263 - )
2264 - .rows_distinct()
2265 - .interrogate()
2266 - )
3176 + @cli.command()
3177 + @click.argument("output_file", type=click.Path())
3178 + def make_template(output_file: str):
3179 +     """
3180 +     Create a validation script template.
2267 3181
2268 -
2269 -
3182 +     Creates a sample Python script with examples showing how to use Pointblank
3183 +     for data validation. Edit the template to add your own data loading and
3184 +     validation rules, then run it with 'pb run'.
2270 3185
2271 -
2272 - f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2273 - )
2274 - elif check == "col-vals-not-null":
2275 - # Create validation for not null values in specified column
2276 - validation = (
2277 - pb.Validate(
2278 - data=data,
2279 - tbl_name=f"Data from {data_source}",
2280 - label=f"CLI Simple Validation: {check} on column '{column}'",
2281 - )
2282 - .col_vals_not_null(columns=column)
2283 - .interrogate()
2284 - )
3186 +     OUTPUT_FILE is the path where the template script will be created.
2285 3187
2286 -
2287 - all_passed = validation.all_passed()
3188 +     Examples:
2288 3189
2289-2294 -
2295 - pb.Validate(
2296 - data=data,
2297 - tbl_name=f"Data from {data_source}",
2298 - label=f"CLI Simple Validation: {check}",
2299 - )
2300 - .rows_complete()
2301 - .interrogate()
2302 - )
3190 +     \b
3191 +     pb make-template my_validation.py
3192 +     pb make-template validation_template.py
3193 +     """
3194 +     example_script = '''"""
3195 + Example Pointblank validation script.
2303 3196
2304 -
2305 -
3197 + This script demonstrates how to create validation rules for your data.
3198 + Modify the data loading and validation rules below to match your requirements.
3199 + """
2306 3200
2307 -
2308 - f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2309 - )
2310 - elif check == "col-exists":
2311 - # Create validation for column existence
2312 - validation = (
2313 - pb.Validate(
2314 - data=data,
2315 - tbl_name=f"Data from {data_source}",
2316 - label=f"CLI Simple Validation: {check} for column '{column}'",
2317 - )
2318 - .col_exists(columns=column)
2319 - .interrogate()
2320 - )
3201 + import pointblank as pb
2321 3202
2322 -
2323 -
3203 + # Load your data (replace this with your actual data source)
3204 + # You can load from various sources:
3205 + # data = pb.load_dataset("small_table")  # Built-in dataset
3206 + # data = pd.read_csv("your_data.csv")  # CSV file
3207 + # data = pl.read_parquet("data.parquet")  # Parquet file
3208 + # data = pb.load_data("database://connection")  # Database
2324 3209
2325 -
2326 - f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2327 - )
2328 - elif check == "col-vals-in-set":
2329 - # Parse the comma-separated set values
2330 - allowed_values = [value.strip() for value in set.split(",")]
2331 -
2332 - # Create validation for values in set
2333 - validation = (
2334 - pb.Validate(
2335 - data=data,
2336 - tbl_name=f"Data from {data_source}",
2337 - label=f"CLI Simple Validation: {check} for column '{column}'",
2338 - )
2339 - .col_vals_in_set(columns=column, set=allowed_values)
2340 - .interrogate()
2341 - )
3210 + data = pb.load_dataset("small_table")  # Example with built-in dataset
2342 3211
2343 -
2344 -
3212 + # Create a validation object
3213 + validation = (
3214 +     pb.Validate(
3215 +         data=data,
3216 +         tbl_name="Example Data",
3217 +         label="Validation Example",
3218 +         thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
3219 +     )
3220 +     # Add your validation rules here
3221 +     # Example rules (modify these based on your data structure):
2345 3222
2346-2359 -
3223 +     # Check that specific columns exist
3224 +     # .col_exists(["column1", "column2"])
3225 +
3226 +     # Check for null values
3227 +     # .col_vals_not_null(columns="important_column")
3228 +
3229 +     # Check value ranges
3230 +     # .col_vals_gt(columns="amount", value=0)
3231 +     # .col_vals_between(columns="score", left=0, right=100)
3232 +
3233 +     # Check string patterns
3234 +     # .col_vals_regex(columns="email", pattern=r"^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$")
3235 +
3236 +     # Check unique values
3237 +     # .col_vals_unique(columns="id")
3238 +
3239 +     # Finalize the validation
3240 +     .interrogate()
3241 + )
3242 +
3243 + # The validation object will be automatically used by the CLI
3244 + # You can also access results programmatically:
3245 + # print(f"All passed: {validation.all_passed()}")
3246 + # print(f"Failed steps: {validation.n_failed()}")
3247 + '''
3248 +
3249 +     Path(output_file).write_text(example_script)
3250 +     console.print(f"[green]✓[/green] Validation script template created: {output_file}")
3251 +     console.print("\nEdit the template to add your data loading and validation rules, then run:")
3252 +     console.print(f"[cyan]pb run {output_file}[/cyan]")
3253 +     console.print(
3254 +         f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Override data source[/dim]"
3255 +     )
3256 +
3257 +
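
The template written by `make_template` boils down to one contract, visible in the `run` command below: the script must leave an interrogated Validate object in its module namespace, ideally under the name `validation`. The smallest script satisfying that contract (a sketch, not the generated template itself) is:

    import pointblank as pb

    validation = (
        pb.Validate(data=pb.load_dataset("small_table"), tbl_name="small_table")
        .col_vals_not_null(columns="a")
        .interrogate()
    )
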
3258 + @cli.command()
3259 + @click.argument("validation_script", type=click.Path(exists=True))
3260 + @click.option("--data", type=str, help="Optional data source to override script's data loading")
3261 + @click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
3262 + @click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
3263 + @click.option(
3264 +     "--show-extract", is_flag=True, help="Show extract of failing rows if validation fails"
3265 + )
3266 + @click.option(
3267 +     "--write-extract",
3268 +     type=str,
3269 +     help="Save failing rows to folders (one CSV per step). Provide base name for folder.",
3270 + )
3271 + @click.option(
3272 +     "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
3273 + )
3274 + @click.option(
3275 +     "--fail-on",
3276 +     type=click.Choice(["critical", "error", "warning", "any"], case_sensitive=False),
3277 +     help="Exit with non-zero code when validation reaches this threshold level",
3278 + )
3279 + def run(
3280 +     validation_script: str,
3281 +     data: str | None,
3282 +     output_html: str | None,
3283 +     output_json: str | None,
3284 +     show_extract: bool,
3285 +     write_extract: str | None,
3286 +     limit: int,
3287 +     fail_on: str | None,
3288 + ):
3289 +     """
3290 +     Run a Pointblank validation script.
2360 3291
2361 -
2362 -
3292 +     VALIDATION_SCRIPT should be a Python file that defines validation logic.
3293 +     The script should load its own data and create validation objects.
2363 3294
2364 -
2365 -
2366 - )
2367 - elif check == "col-vals-ge":
2368 - # Create validation for values greater than or equal to threshold
2369 - validation = (
2370 - pb.Validate(
2371 - data=data,
2372 - tbl_name=f"Data from {data_source}",
2373 - label=f"CLI Simple Validation: {check} for column '{column}' >= {value}",
2374 - )
2375 - .col_vals_ge(columns=column, value=value)
2376 - .interrogate()
2377 - )
3295 +     If --data is provided, it will be available as a 'cli_data' variable in the script,
3296 +     allowing you to optionally override your script's data loading.
2378 3297
2379 -
2380 - all_passed = validation.all_passed()
3298 +     DATA can be:
2381 3299
2382-2387 -
2388 - pb.Validate(
2389 - data=data,
2390 - tbl_name=f"Data from {data_source}",
2391 - label=f"CLI Simple Validation: {check} for column '{column}' < {value}",
2392 - )
2393 - .col_vals_lt(columns=column, value=value)
2394 - .interrogate()
2395 - )
3300 +     \b
3301 +     - CSV file path (e.g., data.csv)
3302 +     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
3303 +     - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
3304 +     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
3305 +     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
2396 3306
2397 -
2398 - all_passed = validation.all_passed()
3307 +     Examples:
2399 3308
2400-2412 -
2413 - )
3309 +     \b
3310 +     pb run validation_script.py
3311 +     pb run validation_script.py --data data.csv
3312 +     pb run validation_script.py --data small_table --output-html report.html
3313 +     pb run validation_script.py --show-extract --fail-on error
3314 +     pb run validation_script.py --write-extract extracts_folder --fail-on critical
3315 +     """
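
Since the command injects `cli_data` into every script it executes (see the namespace construction below), a script can honor a `--data` override with a one-line guard. A sketch; note that the `cli_data` name only exists when the file is run via `pb run`:

    import pointblank as pb

    # cli_data is provided by `pb run`; it is None unless --data was passed
    data = cli_data if cli_data is not None else pb.load_dataset("small_table")  # noqa: F821

    validation = pb.Validate(data=data).rows_distinct().interrogate()
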
3316 +     try:
3317 +         # Load optional data override if provided
3318 +         cli_data = None
3319 +         if data:
3320 +             with console.status(f"[bold green]Loading data from {data}..."):
3321 +                 cli_data = _load_data_source(data)
3322 +             console.print(f"[green]✓[/green] Loaded data override: {data}")
2414 3323
2415 -
2416 -
3324 +         # Execute the validation script
3325 +         with console.status("[bold green]Running validation script..."):
3326 +             # Read and execute the validation script
3327 +             script_content = Path(validation_script).read_text()
2417 3328
2418-2421 -
2422 - #
2423 -
3329 +             # Create a namespace with pointblank and optional CLI data
3330 +             namespace = {
3331 +                 "pb": pb,
3332 +                 "pointblank": pb,
3333 +                 "cli_data": cli_data,  # Available if --data was provided
3334 +                 "__name__": "__main__",
3335 +                 "__file__": str(Path(validation_script).resolve()),
3336 +             }
3337 +
3338 +             # Execute the script
3339 +             try:
3340 +                 exec(script_content, namespace)
3341 +             except Exception as e:
3342 +                 console.print(f"[red]Error executing validation script:[/red] {e}")
2424 3343                 sys.exit(1)
2425 3344
2426 -
2427 -
3345 +         # Look for validation objects in the namespace
3346 +         validations = []
2428 3347
2429-2431 -
2432 - elif check == "col-vals-not-null":
2433 - table_title = "Validation Result: Column Values Not Null"
2434 - elif check == "rows-complete":
2435 - table_title = "Validation Result: Rows Complete"
2436 - elif check == "col-exists":
2437 - table_title = "Validation Result: Column Exists"
2438 - elif check == "col-vals-in-set":
2439 - table_title = "Validation Result: Column Values In Set"
2440 - elif check == "col-vals-gt":
2441 - table_title = "Validation Result: Column Values Greater Than"
2442 - elif check == "col-vals-ge":
2443 - table_title = "Validation Result: Column Values Greater Than Or Equal"
2444 - elif check == "col-vals-lt":
2445 - table_title = "Validation Result: Column Values Less Than"
2446 - elif check == "col-vals-le":
2447 - table_title = "Validation Result: Column Values Less Than Or Equal"
2448 - else:
2449 - table_title = f"Validation Result: {check.replace('-', ' ').title()}"
3348 +         # Look for the 'validation' variable specifically first
3349 +         if "validation" in namespace:
3350 +             validations.append(namespace["validation"])
2450 3351
2451-2460 -
3352 +         # Also look for any other validation objects
3353 +         for key, value in namespace.items():
3354 +             if (
3355 +                 key != "validation"
3356 +                 and hasattr(value, "interrogate")
3357 +                 and hasattr(value, "validation_info")
3358 +             ):
3359 +                 validations.append(value)
3360 +             # Also check if it's a Validate object that has been interrogated
3361 +             elif key != "validation" and str(type(value)).find("Validate") != -1:
3362 +                 validations.append(value)
2461 3363
2462-2464 -
3364 +         if not validations:
3365 +             raise ValueError(
3366 +                 "No validation objects found in script. "
3367 +                 "Script should create Validate objects and call .interrogate() on them."
3368 +             )
2465 3369
2466 -
2467 - if check in [
2468 - "col-vals-not-null",
2469 - "col-exists",
2470 - "col-vals-in-set",
2471 - "col-vals-gt",
2472 - "col-vals-ge",
2473 - "col-vals-lt",
2474 - "col-vals-le",
2475 - ]:
2476 - result_table.add_row("Column", column)
2477 -
2478 - # Add set info for col-vals-in-set check
2479 - if check == "col-vals-in-set":
2480 - allowed_values = [value.strip() for value in set.split(",")]
2481 - result_table.add_row("Allowed Values", ", ".join(allowed_values))
2482 -
2483 - # Add value info for range checks
2484 - if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]:
2485 - if check == "col-vals-gt":
2486 - operator = ">"
2487 - elif check == "col-vals-ge":
2488 - operator = ">="
2489 - elif check == "col-vals-lt":
2490 - operator = "<"
2491 - elif check == "col-vals-le":
2492 - operator = "<="
2493 - result_table.add_row("Threshold", f"{operator} {value}")
2494 -
2495 - # Get validation details
2496 - if hasattr(validation, "validation_info") and validation.validation_info:
2497 - step_info = validation.validation_info[0]  # Should only be one step
2498 - result_table.add_row("Total Rows Tested", f"{step_info.n:,}")
2499 - result_table.add_row("Passing Rows", f"{step_info.n_passed:,}")
2500 - result_table.add_row("Failing Rows", f"{step_info.n_failed:,}")
2501 -
2502 - # Overall result with color coding
2503 - if all_passed:
2504 - result_table.add_row("Result", "[green]✓ PASSED[/green]")
2505 - if check == "rows-distinct":
2506 - result_table.add_row("Duplicate Rows", "[green]None found[/green]")
2507 - elif check == "col-vals-not-null":
2508 - result_table.add_row("Null Values", "[green]None found[/green]")
2509 - elif check == "rows-complete":
2510 - result_table.add_row("Incomplete Rows", "[green]None found[/green]")
2511 - elif check == "col-exists":
2512 - result_table.add_row("Column Status", "[green]Column exists[/green]")
2513 - elif check == "col-vals-in-set":
2514 - result_table.add_row(
2515 - "Values Status", "[green]All values in allowed set[/green]"
2516 - )
2517 - elif check == "col-vals-gt":
2518 - result_table.add_row("Values Status", f"[green]All values > {value}[/green]")
2519 - elif check == "col-vals-ge":
2520 - result_table.add_row("Values Status", f"[green]All values >= {value}[/green]")
2521 - else:
2522 - result_table.add_row("Result", "[red]✗ FAILED[/red]")
2523 - if check == "rows-distinct":
2524 - result_table.add_row(
2525 - "Duplicate Rows", f"[red]{step_info.n_failed:,} found[/red]"
2526 - )
2527 - elif check == "col-vals-not-null":
2528 - result_table.add_row("Null Values", f"[red]{step_info.n_failed:,} found[/red]")
2529 - elif check == "rows-complete":
2530 - result_table.add_row(
2531 - "Incomplete Rows", f"[red]{step_info.n_failed:,} found[/red]"
2532 - )
2533 - elif check == "col-exists":
2534 - result_table.add_row("Column Status", "[red]Column does not exist[/red]")
2535 - elif check == "col-vals-in-set":
2536 - result_table.add_row(
2537 - "Invalid Values", f"[red]{step_info.n_failed:,} found[/red]"
2538 - )
2539 - elif check == "col-vals-gt":
2540 - result_table.add_row(
2541 - "Invalid Values", f"[red]{step_info.n_failed:,} values <= {value}[/red]"
2542 - )
2543 - elif check == "col-vals-ge":
2544 - result_table.add_row(
2545 - "Invalid Values", f"[red]{step_info.n_failed:,} values < {value}[/red]"
2546 - )
3370 +         console.print(f"[green]✓[/green] Found {len(validations)} validation object(s)")
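
The discovery logic above can be exercised on its own: executing source into a dict namespace and then inspecting that dict is plain Python. A compact, self-contained illustration of the same pattern:

    import pointblank as pb

    script = 'validation = pb.Validate(data=pb.load_dataset("small_table")).interrogate()'
    namespace = {"pb": pb, "__name__": "__main__"}
    exec(script, namespace)

    # The same lookup `run` performs first
    assert "validation" in namespace
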
2547 3371
2548 -
2549 -
3372 +         # Process each validation
3373 +         overall_failed = False
3374 +         overall_critical = False
3375 +         overall_error = False
3376 +         overall_warning = False
2550 3377
2551-2553 -
3378 +         for i, validation in enumerate(validations, 1):
3379 +             if len(validations) > 1:
3380 +                 console.print(f"\n[bold cyan]Validation {i}:[/bold cyan]")
2554 3381
2555 - #
2556 -
2557 - extract_message = "[yellow]Preview of failing rows (duplicates):[/yellow]"
2558 - row_type = "duplicate rows"
2559 - elif check == "rows-complete":
2560 - extract_message = "[yellow]Preview of failing rows (incomplete rows):[/yellow]"
2561 - row_type = "incomplete rows"
2562 - elif check == "col-exists":
2563 - extract_message = (
2564 - f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
2565 - )
2566 - row_type = "missing column"
2567 - elif check == "col-vals-in-set":
2568 - extract_message = (
2569 - f"[yellow]Preview of failing rows (invalid values in '{column}'):[/yellow]"
2570 - )
2571 - row_type = "rows with invalid values"
2572 - elif check == "col-vals-gt":
2573 - extract_message = (
2574 - f"[yellow]Preview of failing rows (values in '{column}' <= {value}):[/yellow]"
2575 - )
2576 - row_type = f"rows with values <= {value}"
2577 - elif check == "col-vals-ge":
2578 - extract_message = (
2579 - f"[yellow]Preview of failing rows (values in '{column}' < {value}):[/yellow]"
2580 - )
2581 - row_type = f"rows with values < {value}"
2582 - else:
2583 - extract_message = "[yellow]Preview of failing rows:[/yellow]"
2584 - row_type = "failing rows"
3382 +             # Display summary
3383 +             _display_validation_summary(validation)
2585 3384
2586 -
3385 +             # Check failure status
3386 +             validation_failed = False
3387 +             has_critical = False
3388 +             has_error = False
3389 +             has_warning = False
2587 3390
2588-2597 -
3391 +             if hasattr(validation, "validation_info") and validation.validation_info:
3392 +                 for step_info in validation.validation_info:
3393 +                     if step_info.critical:
3394 +                         has_critical = True
3395 +                         overall_critical = True
3396 +                     if step_info.error:
3397 +                         has_error = True
3398 +                         overall_error = True
3399 +                     if step_info.warning:
3400 +                         has_warning = True
3401 +                         overall_warning = True
3402 +                     if step_info.n_failed > 0:
3403 +                         validation_failed = True
3404 +                         overall_failed = True
3405 +
3406 +             # Handle extract functionality for failed validations
3407 +             failed_steps = []
3408 +             if (
3409 +                 validation_failed
3410 +                 and hasattr(validation, "validation_info")
3411 +                 and validation.validation_info
3412 +             ):
3413 +                 for j, step_info in enumerate(validation.validation_info, 1):
3414 +                     if step_info.n_failed > 0:
3415 +                         failed_steps.append((j, step_info))
3416 +
3417 +             if validation_failed and failed_steps and (show_extract or write_extract):
3418 +                 console.print()
3419 +
3420 +                 if show_extract:
3421 +                     extract_title = "Extract of failing rows from validation steps"
3422 +                     if len(validations) > 1:
3423 +                         extract_title += f" (Validation {i})"
3424 +                     console.print(f"[yellow]{extract_title}:[/yellow]")
3425 +
3426 +                     for step_num, step_info in failed_steps:
3427 +                         try:
3428 +                             failing_rows = validation.get_data_extracts(i=step_num, frame=True)
3429 +
3430 +                             if failing_rows is not None and len(failing_rows) > 0:
3431 +                                 console.print(
3432 +                                     f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3433 +                                 )
2598 3434
2599-2602 -
3435 +                                 # Limit the number of rows shown
3436 +                                 if len(failing_rows) > limit:
3437 +                                     display_rows = failing_rows.head(limit)
3438 +                                     console.print(
3439 +                                         f"[dim]Showing first {limit} of {len(failing_rows)} failing rows[/dim]"
3440 +                                     )
3441 +                                 else:
3442 +                                     display_rows = failing_rows
3443 +                                     console.print(
3444 +                                         f"[dim]Showing all {len(failing_rows)} failing rows[/dim]"
3445 +                                     )
3446 +
3447 +                                 # Create a preview table using pointblank's preview function
3448 +                                 preview_table = pb.preview(
3449 +                                     data=display_rows,
3450 +                                     n_head=min(limit, len(display_rows)),
3451 +                                     n_tail=0,
3452 +                                     limit=limit,
3453 +                                     show_row_numbers=True,
3454 +                                 )
3455 +
3456 +                                 # Display using our Rich table function
3457 +                                 _rich_print_gt_table(preview_table, show_summary=False)
3458 +                             else:
3459 +                                 console.print(
3460 +                                     f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3461 +                                 )
3462 +                                 console.print("[yellow]No failing rows could be extracted[/yellow]")
3463 +                         except Exception as e:
2603 3464                             console.print(
2604 - f"[
3465 +                                 f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3466 +                             )
3467 +                             console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
3468 +
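
The per-step `critical`/`error`/`warning` flags gathered above feed the `--fail-on` option. The exit policy itself sits outside this hunk, but a plausible reading of the flag semantics (an assumption about intent, not the shipped implementation) is a severity cascade:

    def should_exit_nonzero(fail_on, critical, error, warning, failed):
        # Hypothetical helper: each level also triggers on anything more severe.
        if fail_on is None:
            return False
        level = fail_on.lower()
        if level == "critical":
            return critical
        if level == "error":
            return critical or error
        if level == "warning":
            return critical or error or warning
        return failed  # "any"
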
3469 +                 if write_extract:
3470 +                     try:
3471 +                         folder_name = write_extract
3472 +
3473 +                         # Add validation number if multiple validations
3474 +                         if len(validations) > 1:
3475 +                             folder_name = f"{folder_name}_validation_{i}"
3476 +
3477 +                         # Create the output folder
3478 +                         output_folder = Path(folder_name)
3479 +                         output_folder.mkdir(parents=True, exist_ok=True)
3480 +
3481 +                         saved_files = []
3482 +
3483 +                         # Save each failing step to its own CSV file
3484 +                         for step_num, step_info in failed_steps:
3485 +                             try:
3486 +                                 failing_rows = validation.get_data_extracts(i=step_num, frame=True)
3487 +                                 if failing_rows is not None and len(failing_rows) > 0:
3488 +                                     # Create safe filename from assertion type
3489 +                                     safe_assertion_type = (
3490 +                                         step_info.assertion_type.replace(" ", "_")
3491 +                                         .replace("/", "_")
3492 +                                         .replace("\\", "_")
3493 +                                         .replace(":", "_")
3494 +                                         .replace("<", "_")
3495 +                                         .replace(">", "_")
3496 +                                         .replace("|", "_")
3497 +                                         .replace("?", "_")
3498 +                                         .replace("*", "_")
3499 +                                         .replace('"', "_")
3500 +                                     )
3501 +
3502 +                                     filename = f"step_{step_num:02d}_{safe_assertion_type}.csv"
3503 +                                     filepath = output_folder / filename
3504 +
3505 +                                     # Limit the output if needed
3506 +                                     save_rows = failing_rows
3507 +                                     if hasattr(failing_rows, "head") and len(failing_rows) > limit:
3508 +                                         save_rows = failing_rows.head(limit)
3509 +
3510 +                                     # Save to CSV
3511 +                                     if hasattr(save_rows, "write_csv"):
3512 +                                         # Polars
3513 +                                         save_rows.write_csv(str(filepath))
3514 +                                     elif hasattr(save_rows, "to_csv"):
3515 +                                         # Pandas
3516 +                                         save_rows.to_csv(str(filepath), index=False)
3517 +                                     else:
3518 +                                         # Try converting to pandas as fallback
3519 +                                         import pandas as pd
3520 +
3521 +                                         pd_data = pd.DataFrame(save_rows)
3522 +                                         pd_data.to_csv(str(filepath), index=False)
3523 +
3524 +                                     saved_files.append((filename, len(failing_rows)))
3525 +
3526 +                             except Exception as e:
3527 +                                 console.print(
3528 +                                     f"[yellow]Warning: Could not save failing rows from step {step_num}: {e}[/yellow]"
3529 +                                 )
3530 +
3531 +                         if saved_files:
3532 +                             console.print(
3533 +                                 f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
2605 3534                             )
3535 +                             for filename, row_count in saved_files:
3536 +                                 console.print(f"[dim]  - {filename}: {row_count} rows[/dim]")
2606 3537                         else:
2607 -
2608 -
3538 +                             console.print(
3539 +                                 "[yellow]No failing rows could be extracted to save[/yellow]"
3540 +                             )
2609 3541
2610-2612 -
2613 - n_head=min(limit, len(display_rows)),
2614 - n_tail=0,
2615 - limit=limit,
2616 - show_row_numbers=True,
3542 +                     except Exception as e:
3543 +                         console.print(
3544 +                             f"[yellow]Warning: Could not save failing rows to CSV: {e}[/yellow]"
2617 3545                         )
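
The Polars-vs-pandas save logic appears three times in this file (twice in the helpers above, once here). Factored out, the dispatch is just duck typing on the frame's write method; a sketch of such a helper, which is not present in the release:

    from pathlib import Path

    def _write_rows_csv(rows, filepath: Path) -> None:
        if hasattr(rows, "write_csv"):      # Polars frame
            rows.write_csv(str(filepath))
        elif hasattr(rows, "to_csv"):       # pandas frame
            rows.to_csv(str(filepath), index=False)
        else:                               # fall back to pandas coercion
            import pandas as pd

            pd.DataFrame(rows).to_csv(str(filepath), index=False)
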
2618
3546
|
|
|
-
-
-
-
-
-
+        # Save HTML and JSON outputs (combine multiple validations if needed)
+        if output_html:
+            try:
+                if len(validations) == 1:
+                    # Single validation - save directly
+                    html_content = validations[0]._repr_html_()
+                    Path(output_html).write_text(html_content, encoding="utf-8")
+                else:
+                    # Multiple validations - combine them
+                    html_parts = []
+                    html_parts.append("<html><body>")
+                    html_parts.append("<h1>Pointblank Validation Report</h1>")

-
-
-
-            if check == "rows-distinct":
-                success_message = (
-                    f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
-                )
-            elif check == "col-vals-not-null":
-                success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
-            elif check == "rows-complete":
-                success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
-            elif check == "col-exists":
-                success_message = (
-                    f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
-                )
-            elif check == "col-vals-in-set":
-                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
-            elif check == "col-vals-gt":
-                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
-            elif check == "col-vals-ge":
-                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
-            elif check == "col-vals-lt":
-                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
-            elif check == "col-vals-le":
-                success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
-            else:
-                success_message = (
-                    f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
-                )
+                    for i, validation in enumerate(validations, 1):
+                        html_parts.append(f"<h2>Validation {i}</h2>")
+                        html_parts.append(validation._repr_html_())

-
-
-
-                    border_style="green",
-                )
-            )
-        else:
-            if hasattr(validation, "validation_info") and validation.validation_info:
-                step_info = validation.validation_info[0]
+                    html_parts.append("</body></html>")
+                    html_content = "\n".join(html_parts)
+                    Path(output_html).write_text(html_content, encoding="utf-8")

-
-
-
-
-
-
-
-
-
-
-            elif check == "col-vals-gt":
-                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
-            elif check == "col-vals-ge":
-                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
-            elif check == "col-vals-lt":
-                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
-            elif check == "col-vals-le":
-                failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
+                console.print(f"[green]✓[/green] HTML report saved to: {output_html}")
+            except Exception as e:
+                console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
+
|
+
if output_json:
|
|
3573
|
+
try:
|
|
3574
|
+
if len(validations) == 1:
|
|
3575
|
+
# Single validation - save directly
|
|
3576
|
+
json_report = validations[0].get_json_report()
|
|
3577
|
+
Path(output_json).write_text(json_report, encoding="utf-8")
|
|
2684
3578
|
else:
|
|
2685
|
-
|
|
3579
|
+
# Multiple validations - combine them
|
|
3580
|
+
import json
|
|
2686
3581
|
|
|
2687
|
-
|
|
2688
|
-
if not show_extract and check != "col-exists":
|
|
2689
|
-
failure_message += (
|
|
2690
|
-
"\n[dim]💡 Tip: Use --show-extract to see the failing rows[/dim]"
|
|
2691
|
-
)
|
|
3582
|
+
combined_report = {"validations": []}
|
|
2692
3583
|
|
|
2693
|
-
|
|
2694
|
-
|
|
2695
|
-
|
|
2696
|
-
|
|
2697
|
-
)
|
|
2698
|
-
)
|
|
2699
|
-
else:
|
|
2700
|
-
if check == "rows-distinct":
|
|
2701
|
-
failure_message = (
|
|
2702
|
-
f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
|
|
2703
|
-
)
|
|
2704
|
-
elif check == "rows-complete":
|
|
2705
|
-
failure_message = (
|
|
2706
|
-
f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
|
|
2707
|
-
)
|
|
2708
|
-
else:
|
|
2709
|
-
failure_message = (
|
|
2710
|
-
f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
|
|
2711
|
-
)
|
|
3584
|
+
for i, validation in enumerate(validations, 1):
|
|
3585
|
+
validation_json = json.loads(validation.get_json_report())
|
|
3586
|
+
validation_json["validation_id"] = i
|
|
3587
|
+
combined_report["validations"].append(validation_json)
|
|
2712
3588
|
|
|
2713
|
-
|
|
2714
|
-
|
|
2715
|
-
failure_message += (
|
|
2716
|
-
"\n[dim]💡 Tip: Use --show-extract to see the failing rows[/dim]"
|
|
3589
|
+
Path(output_json).write_text(
|
|
3590
|
+
json.dumps(combined_report, indent=2), encoding="utf-8"
|
|
2717
3591
|
)
|
|
2718
3592
|
|
|
2719
|
-
console.print(
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
border_style="red",
|
|
2723
|
-
)
|
|
2724
|
-
)
|
|
3593
|
+
console.print(f"[green]✓[/green] JSON summary saved to: {output_json}")
|
|
3594
|
+
except Exception as e:
|
|
3595
|
+
console.print(f"[yellow]Warning: Could not save JSON report: {e}[/yellow]")
|
|
2725
3596
|
|
|
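The JSON path mirrors the HTML one: each report comes back from `get_json_report()` as a JSON string, is parsed, tagged with a 1-based `validation_id`, and nested under a top-level `"validations"` key. A minimal sketch of that combining logic, with `write_combined_json` as an illustrative name:

import json
from pathlib import Path


def write_combined_json(validations, output_json: str) -> None:
    if len(validations) == 1:
        # One report: its JSON string can be written as-is.
        Path(output_json).write_text(validations[0].get_json_report(), encoding="utf-8")
        return
    # Several reports: parse, tag, and nest them under one document.
    combined = {"validations": []}
    for i, validation in enumerate(validations, 1):
        report = json.loads(validation.get_json_report())
        report["validation_id"] = i
        combined["validations"].append(report)
    Path(output_json).write_text(json.dumps(combined, indent=2), encoding="utf-8")

Round-tripping through json.loads/json.dumps (rather than concatenating the strings) keeps the combined file a single valid JSON document that downstream tooling can parse.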
-        #
-        if
-
-
+        # Check if we should fail based on threshold
+        if fail_on:
+            should_exit = False
+            exit_reason = ""
+
+            if fail_on.lower() == "critical" and overall_critical:
+                should_exit = True
+                exit_reason = "critical validation failures"
+            elif fail_on.lower() == "error" and (overall_critical or overall_error):
+                should_exit = True
+                exit_reason = "error or critical validation failures"
+            elif fail_on.lower() == "warning" and (
+                overall_critical or overall_error or overall_warning
+            ):
+                should_exit = True
+                exit_reason = "warning, error, or critical validation failures"
+            elif fail_on.lower() == "any" and overall_failed:
+                should_exit = True
+                exit_reason = "validation failures"
+
+            if should_exit:
+                console.print(f"[red]Exiting with error due to {exit_reason}[/red]")
+                sys.exit(1)

     except Exception as e:
         console.print(f"[red]Error:[/red] {e}")
         sys.exit(1)


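The `--fail-on` branches form a strict severity ladder: "critical" exits only on critical failures, "error" also covers critical, "warning" covers all three, and "any" covers any failure at all. The elif chain can therefore be read as a numeric threshold comparison; the sketch below is an equivalent reformulation under that reading, not the CLI's code, and the boolean parameters stand in for the `overall_*` flags in the diff.

import sys

# Higher rank = more severe; "any" accepts every failure.
_LEVELS = {"critical": 3, "error": 2, "warning": 1, "any": 0}


def should_fail(fail_on: str, critical: bool, error: bool, warning: bool, failed: bool) -> bool:
    # Rank the worst observed outcome; -1 means nothing failed.
    worst = 3 if critical else 2 if error else 1 if warning else 0 if failed else -1
    return worst >= _LEVELS[fail_on.lower()]


print(should_fail("error", critical=False, error=True, warning=False, failed=True))   # True
print(should_fail("critical", critical=False, error=True, warning=False, failed=True))  # False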
-
-
+def _format_missing_percentage(value: float) -> str:
+    """Format missing value percentages for display.
+
+    Args:
+        value: The percentage value (0-100)
+
+    Returns:
+        Formatted string with proper percentage display
+    """
+    if value == 0.0:
+        return "[green]●[/green]"  # Large green circle for no missing values
+    elif value == 100.0:
+        return "[red]●[/red]"  # Large red circle for completely missing values
+    elif value < 1.0 and value > 0:
+        return "<1%"  # Less than 1%
+    elif value > 99.0 and value < 100.0:
+        return ">99%"  # More than 99%
+    else:
+        return f"{int(round(value))}%"  # Round to nearest integer with % sign
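The helper pins the edge cases so a column that is almost, but not entirely, complete is never rounded to a misleading 0% or 100%: exact extremes get colored circles, near-extremes get "<1%" / ">99%", and everything else rounds to a whole percent. Illustrative boundary checks, assuming the function as defined in the diff above:

# Expected outputs at the boundary values (illustrative calls only).
assert _format_missing_percentage(0.0) == "[green]●[/green]"
assert _format_missing_percentage(100.0) == "[red]●[/red]"
assert _format_missing_percentage(0.4) == "<1%"
assert _format_missing_percentage(99.5) == ">99%"
assert _format_missing_percentage(42.4) == "42%"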