pointblank 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
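The diff below adds a single new module, pointblank/cli.py, which implements a `pb` command-line interface with commands for listing datasets (`datasets`), checking dependencies (`requirements`), previewing and profiling tables (`preview`, `info`, `scan`, `missing`), and running validation scripts (`validate-example`, `validate`, `extract`). For orientation only (this sketch is not part of the diff), the new command group can be exercised in-process with click's standard test runner; `pb preview small_table` is an invocation the module's own help text suggests:

    # Minimal sketch, assuming pointblank 0.11.0 is installed.
    # `cli` is the click group added in the diff below; "preview" and
    # "small_table" are a command and demo dataset it defines.
    from click.testing import CliRunner
    from pointblank.cli import cli

    runner = CliRunner()
    result = runner.invoke(cli, ["preview", "small_table"])
    print(result.output)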
pointblank/cli.py ADDED
@@ -0,0 +1,2737 @@
+ from __future__ import annotations
+
+ import sys
+ from pathlib import Path
+ from typing import Any
+
+ import click
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.table import Table
+
+ import pointblank as pb
+ from pointblank._utils import _get_tbl_type, _is_lib_present
+
+ console = Console()
+
+
+ def _format_cell_value(
+     value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
+ ) -> str:
+     """Format a cell value for Rich table display, highlighting None/NA values in red.
+
+     Args:
+         value: The raw cell value from the dataframe
+         is_row_number: Whether this is a row number column value
+         max_width: Maximum character width for text truncation
+         num_columns: Number of columns in the table (affects truncation aggressiveness)
+
+     Returns:
+         Formatted string with Rich markup for None/NA values or row numbers
+     """
+     # Special formatting for row numbers: never truncate them
+     if is_row_number:
+         return f"[dim]{value}[/dim]"
+
+     # Check for actual None/null values (not string representations)
+     if value is None:
+         return "[red]None[/red]"
+
+     # Check for pandas/numpy specific NA values
+     try:
+         import numpy as np
+         import pandas as pd
+
+         # Check for pandas NA
+         if pd.isna(value):
+             # If it's specifically numpy.nan, show as NaN
+             if isinstance(value, float) and np.isnan(value):
+                 return "[red]NaN[/red]"
+             # If it's pandas NA, show as NA
+             elif str(type(value)).find("pandas") != -1:
+                 return "[red]NA[/red]"
+             # Generic NA for other pandas missing values
+             else:
+                 return "[red]NA[/red]"
+
+     except (ImportError, TypeError, ValueError):  # pragma: no cover
+         # If pandas/numpy not available, value not compatible, or ambiguous array
+         pass
+
+     # Check for empty strings (but only actual empty strings, not whitespace)
+     if isinstance(value, str) and value == "":
+         return "[red][/red]"  # Empty string shown as red empty space
+
+     # Convert to string and apply intelligent truncation
+     str_value = str(value)
+
+     # Adjust max_width based on number of columns to prevent overly wide tables
+     if num_columns > 15:
+         adjusted_max_width = min(max_width, 30)  # Be more aggressive with many columns
+     elif num_columns > 10:
+         adjusted_max_width = min(max_width, 40)
+     else:
+         adjusted_max_width = max_width
+
+     # Apply truncation if the string is too long
+     if len(str_value) > adjusted_max_width:
+         # For very long text, truncate more aggressively
+         if len(str_value) > adjusted_max_width * 2:
+             # For extremely long text, use a shorter truncation
+             truncated = str_value[: adjusted_max_width // 2] + "…"
+         else:
+             # For moderately long text, use a more generous truncation
+             truncated = str_value[: adjusted_max_width - 1] + "…"
+
+         return truncated
+
+     return str_value
+
+
+ def _get_column_dtypes(df: Any, columns: list[str]) -> dict[str, str]:
+     """Extract data types for columns and format them in a compact way.
+
+     Args:
+         df: The dataframe object
+         columns: List of column names
+
+     Returns:
+         Dictionary mapping column names to formatted data type strings
+     """
+     dtypes_dict = {}
+
+     try:
+         if hasattr(df, "dtypes"):
+             # Polars/Pandas style
+             if hasattr(df.dtypes, "to_dict"):
+                 # Polars DataFrame dtypes
+                 raw_dtypes = df.dtypes.to_dict() if hasattr(df.dtypes, "to_dict") else {}
+                 for col in columns:
+                     if col in raw_dtypes:
+                         dtype_str = str(raw_dtypes[col])
+                         # Convert to compact format similar to Polars glimpse()
+                         dtypes_dict[col] = _format_dtype_compact(dtype_str)
+                     else:
+                         dtypes_dict[col] = "?"
+             else:
+                 # Pandas DataFrame dtypes (Series-like)
+                 for i, col in enumerate(columns):
+                     if i < len(df.dtypes):
+                         dtype_str = str(
+                             df.dtypes.iloc[i] if hasattr(df.dtypes, "iloc") else df.dtypes[i]
+                         )
+                         dtypes_dict[col] = _format_dtype_compact(dtype_str)
+                     else:
+                         dtypes_dict[col] = "?"
+         elif hasattr(df, "schema"):
+             # Other schema-based systems (e.g., Ibis)
+             schema = df.schema
+             if hasattr(schema, "to_dict"):  # pragma: no cover
+                 raw_dtypes = schema.to_dict()
+                 for col in columns:
+                     if col in raw_dtypes:
+                         dtypes_dict[col] = _format_dtype_compact(str(raw_dtypes[col]))
+                     else:  # pragma: no cover
+                         dtypes_dict[col] = "?"
+             else:  # pragma: no cover
+                 for col in columns:
+                     try:
+                         dtype_str = str(getattr(schema, col, "Unknown"))
+                         dtypes_dict[col] = _format_dtype_compact(dtype_str)
+                     except Exception:  # pragma: no cover
+                         dtypes_dict[col] = "?"
+         else:
+             # Fallback: no type information available
+             for col in columns:
+                 dtypes_dict[col] = "?"
+
+     except Exception:  # pragma: no cover
+         # If any error occurs, fall back to unknown types
+         for col in columns:
+             dtypes_dict[col] = "?"
+
+     return dtypes_dict
+
+
+ def _format_dtype_compact(dtype_str: str) -> str:
+     """Format a data type string to a compact representation.
+
+     Args:
+         dtype_str: The raw data type string
+
+     Returns:
+         Compact formatted data type string
+     """
+     # Remove common prefixes and make compact
+     dtype_str = dtype_str.lower()
+
+     # Polars types
+     if "utf8" in dtype_str or "string" in dtype_str:
+         return "str"
+     elif "int64" in dtype_str:
+         return "i64"
+     elif "int32" in dtype_str:
+         return "i32"
+     elif "float64" in dtype_str:
+         return "f64"
+     elif "float32" in dtype_str:
+         return "f32"
+     elif "boolean" in dtype_str or "bool" in dtype_str:
+         return "bool"
+     elif "datetime" in dtype_str:
+         return "datetime"
+     elif "date" in dtype_str and "datetime" not in dtype_str:
+         return "date"
+     elif "time" in dtype_str:
+         return "time"
+
+     # Pandas types
+     elif "object" in dtype_str:
+         return "obj"
+     elif "category" in dtype_str:
+         return "cat"
+
+     # Generic fallbacks
+     elif "int" in dtype_str:
+         return "int"
+     elif "float" in dtype_str:
+         return "float"
+     elif "str" in dtype_str:
+         return "str"
+
+     # Unknown or complex types - truncate if too long
+     elif len(dtype_str) > 8:
+         return dtype_str[:8] + "…"
+     else:
+         return dtype_str
+
+
+ def _rich_print_gt_table(gt_table: Any, preview_info: dict | None = None) -> None:
+     """Convert a GT table to Rich table and display it in the terminal.
+
+     Args:
+         gt_table: The GT table object to display
+         preview_info: Optional dict with preview context info:
+             - total_rows: Total rows in the dataset
+             - head_rows: Number of head rows shown
+             - tail_rows: Number of tail rows shown
+             - is_complete: Whether the entire dataset is shown
+     """
+     try:
+         # Try to extract the underlying data from the GT table
+         df = None
+
+         # Great Tables stores the original data in different places depending on how it was created
+         # Let's try multiple approaches to get the data
+         if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
+             df = gt_table._tbl_data
+         elif (
+             hasattr(gt_table, "_body")
+             and hasattr(gt_table._body, "body")
+             and gt_table._body.body is not None
+         ):
+             df = gt_table._body.body
+         elif hasattr(gt_table, "_data") and gt_table._data is not None:
+             df = gt_table._data
+         elif hasattr(gt_table, "data") and gt_table.data is not None:
+             df = gt_table.data
+
+         if df is not None:
+             # Create a Rich table with horizontal lines
+             from rich.box import SIMPLE_HEAD
+
+             # Create enhanced title if preview_info contains metadata
+             table_title = None
+             if preview_info and "source_type" in preview_info and "table_type" in preview_info:
+                 source_type = preview_info["source_type"]
+                 table_type = preview_info["table_type"]
+                 table_title = f"Data Preview / {source_type} / {table_type}"
+
+             rich_table = Table(
+                 title=table_title,
+                 show_header=True,
+                 header_style="bold magenta",
+                 box=SIMPLE_HEAD,
+                 title_style="bold cyan",
+                 title_justify="left",
+             )
+
+             # Get column names
+             columns = []
+             if hasattr(df, "columns"):
+                 columns = list(df.columns)
+             elif hasattr(df, "schema"):  # pragma: no cover
+                 columns = list(df.schema.names)
+             elif hasattr(df, "column_names"):  # pragma: no cover
+                 columns = list(df.column_names)
+
+             if not columns:  # pragma: no cover
+                 # Fallback: try to determine columns from first row
+                 try:
+                     if hasattr(df, "to_dicts") and len(df) > 0:
+                         first_dict = df.to_dicts()[0]
+                         columns = list(first_dict.keys())
+                     elif hasattr(df, "to_dict") and len(df) > 0:
+                         first_dict = df.to_dict("records")[0]
+                         columns = list(first_dict.keys())
+                 except Exception:  # pragma: no cover
+                     columns = [f"Column {i + 1}" for i in range(10)]  # Default fallback
+
+             # Add columns to Rich table
+             # Handle wide tables by limiting columns displayed
+             max_terminal_cols = 15  # Reasonable limit for terminal display
+
+             # Get terminal width to adjust column behavior
+             try:
+                 terminal_width = console.size.width
+                 # Estimate max column width based on terminal size and number of columns
+                 if len(columns) <= 5:
+                     max_col_width = min(60, terminal_width // 4)
+                 elif len(columns) <= 10:
+                     max_col_width = min(40, terminal_width // 6)
+                 else:
+                     max_col_width = min(30, terminal_width // 8)
+             except Exception:  # pragma: no cover
+                 # Fallback if we can't get terminal width
+                 max_col_width = 40 if len(columns) <= 10 else 25
+
+             if len(columns) > max_terminal_cols:
+                 # For wide tables, show first few, middle indicator, and last few columns
+                 first_cols = 7
+                 last_cols = 7
+
+                 display_columns = columns[:first_cols] + ["...more..."] + columns[-last_cols:]
+
+                 console.print(
+                     f"\n[yellow]⚠ Table has {len(columns)} columns. Showing first {first_cols} and last {last_cols} columns.[/yellow]"
+                 )
+                 console.print("[dim]Use --columns to specify which columns to display.[/dim]")
+                 console.print(
+                     f"[dim]Full column list: {', '.join(columns[:5])}...{', '.join(columns[-5:])}[/dim]\n"
+                 )
+             else:
+                 display_columns = columns
+
+             # Get data types for columns
+             dtypes_dict = _get_column_dtypes(df, columns)
+
+             # Calculate row number column width if needed
+             row_num_width = 6  # Default width
+             if "_row_num_" in columns:
+                 try:
+                     # Get the maximum row number to calculate appropriate width
+                     if hasattr(df, "to_dicts"):
+                         data_dict = df.to_dicts()
+                         if data_dict:
+                             row_nums = [row.get("_row_num_", 0) for row in data_dict]
+                             max_row_num = max(row_nums) if row_nums else 0
+                             row_num_width = max(len(str(max_row_num)) + 1, 6)  # +1 for padding
+                     elif hasattr(df, "to_dict"):
+                         data_dict = df.to_dict("records")
+                         if data_dict:
+                             row_nums = [row.get("_row_num_", 0) for row in data_dict]
+                             max_row_num = max(row_nums) if row_nums else 0
+                             row_num_width = max(len(str(max_row_num)) + 1, 6)  # +1 for padding
+                 except Exception:  # pragma: no cover
+                     # If we can't determine max row number, use default
+                     row_num_width = 8  # Slightly larger default for safety
+
+             for i, col in enumerate(display_columns):
+                 if col == "...more...":
+                     # Add a special indicator column
+                     rich_table.add_column("···", style="dim", width=3, no_wrap=True)
+                 else:
+                     # Handle row number column specially
+                     if col == "_row_num_":
+                         # Row numbers get no header, right alignment, and dim gray style
+                         # Use dynamic width to prevent truncation
+                         rich_table.add_column(
+                             "", style="dim", justify="right", no_wrap=True, width=row_num_width
+                         )
+                     else:
+                         display_col = str(col)
+
+                         # Get data type for this column (if available)
+                         if col in dtypes_dict:
+                             dtype_display = f"<{dtypes_dict[col]}>"
+                             # Create header with column name and data type
+                             header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
+                         else:
+                             header_text = display_col
+
+                         rich_table.add_column(
+                             header_text,
+                             style="cyan",
+                             no_wrap=False,
+                             overflow="ellipsis",
+                             max_width=max_col_width,
+                         )
+
+             # Convert data to list of rows
+             rows = []
+             try:
+                 if hasattr(df, "to_dicts"):
+                     # Polars interface
+                     data_dict = df.to_dicts()
+                     if len(columns) > max_terminal_cols:
+                         # For wide tables, extract only the displayed columns
+                         display_data_columns = (
+                             columns[:7] + columns[-7:]
+                         )  # Skip the "...more..." placeholder
+                         rows = [
+                             [
+                                 _format_cell_value(
+                                     row.get(col, ""),
+                                     is_row_number=(col == "_row_num_"),
+                                     max_width=max_col_width,
+                                     num_columns=len(columns),
+                                 )
+                                 for col in display_data_columns
+                             ]
+                             for row in data_dict
+                         ]
+                         # Add the "..." column in the middle
+                         for i, row in enumerate(rows):
+                             rows[i] = row[:7] + ["···"] + row[7:]
+                     else:
+                         rows = [
+                             [
+                                 _format_cell_value(
+                                     row.get(col, ""),
+                                     is_row_number=(col == "_row_num_"),
+                                     max_width=max_col_width,
+                                     num_columns=len(columns),
+                                 )
+                                 for col in columns
+                             ]
+                             for row in data_dict
+                         ]
+                 elif hasattr(df, "to_dict"):
+                     # Pandas-like interface
+                     data_dict = df.to_dict("records")
+                     if len(columns) > max_terminal_cols:
+                         # For wide tables, extract only the displayed columns
+                         display_data_columns = columns[:7] + columns[-7:]
+                         rows = [
+                             [
+                                 _format_cell_value(
+                                     row.get(col, ""),
+                                     is_row_number=(col == "_row_num_"),
+                                     max_width=max_col_width,
+                                     num_columns=len(columns),
+                                 )
+                                 for col in display_data_columns
+                             ]
+                             for row in data_dict
+                         ]
+                         # Add the "..." column in the middle
+                         for i, row in enumerate(rows):
+                             rows[i] = row[:7] + ["···"] + row[7:]
+                     else:
+                         rows = [
+                             [
+                                 _format_cell_value(
+                                     row.get(col, ""),
+                                     is_row_number=(col == "_row_num_"),
+                                     max_width=max_col_width,
+                                     num_columns=len(columns),
+                                 )
+                                 for col in columns
+                             ]
+                             for row in data_dict
+                         ]
+                 elif hasattr(df, "iter_rows"):
+                     # Polars lazy frame
+                     rows = [
+                         [
+                             _format_cell_value(
+                                 val,
+                                 is_row_number=(i == 0 and columns[0] == "_row_num_"),
+                                 max_width=max_col_width,
+                                 num_columns=len(columns),
+                             )
+                             for i, val in enumerate(row)
+                         ]
+                         for row in df.iter_rows()
+                     ]
+                 elif hasattr(df, "__iter__"):
+                     # Try to iterate directly
+                     rows = [
+                         [
+                             _format_cell_value(
+                                 val,
+                                 is_row_number=(i == 0 and columns[0] == "_row_num_"),
+                                 max_width=max_col_width,
+                                 num_columns=len(columns),
+                             )
+                             for i, val in enumerate(row)
+                         ]
+                         for row in df
+                     ]
+                 else:
+                     rows = [["Could not extract data from this format"]]  # pragma: no cover
+             except Exception as e:
+                 rows = [[f"Error extracting data: {e}"]]  # pragma: no cover
+
+             # Add rows to Rich table with separator between head and tail
+             max_rows = 50  # Reasonable limit for terminal display
+
+             # Get preview info to determine head/tail separation
+             head_rows_count = 0
+             tail_rows_count = 0
+             total_dataset_rows = 0
+
+             if preview_info:
+                 head_rows_count = preview_info.get("head_rows", 0)
+                 tail_rows_count = preview_info.get("tail_rows", 0)
+                 total_dataset_rows = preview_info.get("total_rows", len(rows))
+                 is_complete = preview_info.get("is_complete", False)
+             else:
+                 # Fallback: assume all rows are shown
+                 is_complete = True
+
+             # Add rows with optional separator
+             for i, row in enumerate(rows[:max_rows]):
+                 try:
+                     # Add separator between head and tail rows
+                     if (
+                         not is_complete
+                         and head_rows_count > 0
+                         and tail_rows_count > 0
+                         and i == head_rows_count
+                     ):
+                         # Add a visual separator row with dashes
+                         separator_row = [
+                             "─" * 3 if col != "_row_num_" else "⋮"
+                             for col in (
+                                 display_columns if "display_columns" in locals() else columns
+                             )
+                         ]
+                         rich_table.add_row(*separator_row, style="dim")
+
+                     rich_table.add_row(*row)
+                 except Exception as e:  # pragma: no cover
+                     # If there's an issue with row data, show error
+                     rich_table.add_row(*[f"Error: {e}" for _ in columns])  # pragma: no cover
+                     break  # pragma: no cover
+
+             # Show the table
+             console.print()
+             console.print(rich_table)
+
+             # Show summary info
+             total_rows = len(rows)
+
+             # Use preview info if available, otherwise fall back to old logic
+             if preview_info:
+                 total_dataset_rows = preview_info.get("total_rows", total_rows)
+                 head_rows = preview_info.get("head_rows", 0)
+                 tail_rows = preview_info.get("tail_rows", 0)
+                 is_complete = preview_info.get("is_complete", False)
+
+                 if is_complete:
+                     console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
+                 elif head_rows > 0 and tail_rows > 0:
+                     console.print(
+                         f"\n[dim]Showing first {head_rows} and last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
+                     )
+                 elif head_rows > 0:
+                     console.print(
+                         f"\n[dim]Showing first {head_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
+                     )
+                 elif tail_rows > 0:
+                     console.print(
+                         f"\n[dim]Showing last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
+                     )
+                 else:
+                     # Fallback for other cases
+                     console.print(
+                         f"\n[dim]Showing {total_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
+                     )
+             else:
+                 # Original logic as fallback
+                 max_rows = 50  # This should match the limit used above
+                 if total_rows > max_rows:
+                     console.print(
+                         f"\n[dim]Showing first {max_rows} of {total_rows} rows. Use --output-html to see all data.[/dim]"
+                     )
+                 else:
+                     console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
+
+         else:
+             # If we can't extract data, show the success message
+             console.print(
+                 Panel(
+                     "[green]✓[/green] Table rendered successfully. "
+                     "Use --output-html to save the full interactive report.",
+                     title="Table Preview",
+                     border_style="green",
+                 )
+             )
+
+     except Exception as e:  # pragma: no cover
+         console.print(f"[red]Error rendering table:[/red] {e}")
+         console.print(
+             f"[dim]GT table type: {type(gt_table) if 'gt_table' in locals() else 'undefined'}[/dim]"
+         )
+
+         # Fallback: show the success message
+         console.print(
+             Panel(
+                 "[green]✓[/green] Table rendered successfully. "
+                 "Use --output-html to save the full interactive report.",
+                 title="Table Preview",
+                 border_style="green",
+             )
+         )
+
+
+ def _display_validation_summary(validation: Any) -> None:
+     """Display a validation summary in a Rich table format."""
+     try:
+         # Try to get the summary from the validation report
+         if hasattr(validation, "validation_info") and validation.validation_info is not None:
+             # Use the validation_info to create a summary
+             info = validation.validation_info
+             n_steps = len(info)
+             n_passed = sum(1 for step in info if step.all_passed)
+             n_failed = n_steps - n_passed
+
+             # Calculate severity counts
+             n_warning = sum(1 for step in info if step.warning)
+             n_error = sum(1 for step in info if step.error)
+             n_critical = sum(1 for step in info if step.critical)
+
+             all_passed = n_failed == 0
+
+             # Determine highest severity
+             if n_critical > 0:
+                 highest_severity = "critical"
+             elif n_error > 0:
+                 highest_severity = "error"
+             elif n_warning > 0:
+                 highest_severity = "warning"
+             elif n_failed > 0:
+                 highest_severity = "some failing"
+             else:
+                 highest_severity = "all passed"
+
+             # Create a summary table
+             table = Table(title="Validation Summary", show_header=True, header_style="bold magenta")
+             table.add_column("Metric", style="cyan", no_wrap=True)
+             table.add_column("Value", style="green")
+
+             # Add summary statistics
+             table.add_row("Total Steps", str(n_steps))
+             table.add_row("Passing Steps", str(n_passed))
+             table.add_row("Failing Steps", str(n_failed))
+             table.add_row("Warning Steps", str(n_warning))
+             table.add_row("Error Steps", str(n_error))
+             table.add_row("Critical Steps", str(n_critical))
+             table.add_row("All Passed", str(all_passed))
+             table.add_row("Highest Severity", highest_severity)
+
+             console.print(table)
+
+             # Display step details
+             if n_steps > 0:
+                 steps_table = Table(
+                     title="Validation Steps", show_header=True, header_style="bold cyan"
+                 )
+                 steps_table.add_column("Step", style="dim")
+                 steps_table.add_column("Type", style="white")
+                 steps_table.add_column("Column", style="cyan")
+                 steps_table.add_column("Status", style="white")
+                 steps_table.add_column("Passed/Total", style="green")
+
+                 for step in info:
+                     status_icon = "✓" if step.all_passed else "✗"
+                     status_color = "green" if step.all_passed else "red"
+
+                     severity = ""
+                     if step.critical:
+                         severity = " [red](CRITICAL)[/red]"
+                     elif step.error:
+                         severity = " [red](ERROR)[/red]"
+                     elif step.warning:
+                         severity = " [yellow](WARNING)[/yellow]"
+
+                     steps_table.add_row(
+                         str(step.i),
+                         step.assertion_type,
+                         str(step.column) if step.column else "—",
+                         f"[{status_color}]{status_icon}[/{status_color}]{severity}",
+                         f"{step.n_passed}/{step.n}",
+                     )
+
+                 console.print(steps_table)
+
+             # Display status with appropriate color
+             if highest_severity == "all passed":
+                 console.print(
+                     Panel("[green]✓ All validations passed![/green]", border_style="green")
+                 )
+             elif highest_severity == "some failing":
+                 console.print(
+                     Panel("[yellow]⚠ Some validations failed[/yellow]", border_style="yellow")
+                 )
+             elif highest_severity in ["warning", "error", "critical"]:
+                 color = "yellow" if highest_severity == "warning" else "red"
+                 console.print(
+                     Panel(
+                         f"[{color}]✗ Validation failed with {highest_severity} severity[/{color}]",
+                         border_style=color,
+                     )
+                 )
+         else:
+             console.print("[yellow]Validation object does not contain validation results.[/yellow]")
+
+     except Exception as e:  # pragma: no cover
+         console.print(f"[red]Error displaying validation summary:[/red] {e}")
+         import traceback  # pragma: no cover
+
+         console.print(f"[dim]{traceback.format_exc()}[/dim]")  # pragma: no cover
+
+
+ @click.group()
+ @click.version_option(version=pb.__version__, prog_name="pb")
+ def cli():
+     """
+     Pointblank CLI - Data validation and quality tools for data engineers.
+
+     Use this CLI to validate data, preview tables, and generate reports
+     directly from the command line.
+     """
+     pass
+
+
+ @cli.command()
+ def datasets():
+     """
+     List available built-in datasets.
+     """
+     datasets_info = [
+         ("small_table", "13 rows × 8 columns", "Small demo dataset for testing"),
+         ("game_revenue", "2,000 rows × 11 columns", "Game development company revenue data"),
+         ("nycflights", "336,776 rows × 18 columns", "NYC airport flights data from 2013"),
+         ("global_sales", "50,000 rows × 20 columns", "Global sales data across regions"),
+     ]
+
+     table = Table(
+         title="Available Pointblank Datasets", show_header=True, header_style="bold magenta"
+     )
+     table.add_column("Dataset Name", style="cyan", no_wrap=True)
+     table.add_column("Dimensions", style="green")
+     table.add_column("Description", style="white")
+
+     for name, dims, desc in datasets_info:
+         table.add_row(name, dims, desc)
+
+     console.print(table)
+     console.print("\n[dim]Use these dataset names directly with any pb CLI command.[/dim]")
+     console.print("[dim]Example: pb preview small_table[/dim]")
+
+
+ @cli.command()
+ def requirements():
+     """
+     Check installed dependencies and their availability.
+     """
+     dependencies = [
+         ("polars", "Polars DataFrame support"),
+         ("pandas", "Pandas DataFrame support"),
+         ("ibis", "Ibis backend support (DuckDB, etc.)"),
+         ("duckdb", "DuckDB database support"),
+         ("pyarrow", "Parquet file support"),
+     ]
+
+     table = Table(title="Dependency Status", show_header=True, header_style="bold magenta")
+     table.add_column("Package", style="cyan", no_wrap=True)
+     table.add_column("Status", style="white")
+     table.add_column("Description", style="dim")
+
+     for package, description in dependencies:
+         if _is_lib_present(package):
+             status = "[green]✓ Installed[/green]"
+         else:
+             status = "[red]✗ Not installed[/red]"
+
+         table.add_row(package, status, description)
+
+     console.print(table)
+     console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
+
+
+ @cli.command()
+ @click.argument("data_source", type=str)
+ @click.option("--columns", "-c", help="Comma-separated list of columns to display")
+ @click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
+ @click.option("--col-first", type=int, help="Show first N columns")
+ @click.option("--col-last", type=int, help="Show last N columns")
+ @click.option("--head", "-h", default=5, help="Number of rows from the top (default: 5)")
+ @click.option("--tail", "-t", default=5, help="Number of rows from the bottom (default: 5)")
+ @click.option("--limit", "-l", default=50, help="Maximum total rows to display (default: 50)")
+ @click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
+ @click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
+ @click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
+ @click.option("--no-header", is_flag=True, help="Hide table header")
+ @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
+ def preview(
+     data_source: str,
+     columns: str | None,
+     col_range: str | None,
+     col_first: int | None,
+     col_last: int | None,
+     head: int,
+     tail: int,
+     limit: int,
+     no_row_numbers: bool,
+     max_col_width: int,
+     min_table_width: int,
+     no_header: bool,
+     output_html: str | None,
+ ):
+     """
+     Preview a data table showing head and tail rows.
+
+     DATA_SOURCE can be:
+
+     \b
+     - CSV file path (e.g., data.csv)
+     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
+     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
+
+     COLUMN SELECTION OPTIONS:
+
+     For tables with many columns, use these options to control which columns are displayed:
+
+     \b
+     - --columns: Specify exact columns (e.g., --columns "name,age,email")
+     - --col-range: Select column range (e.g., --col-range "1:10", --col-range "5:", --col-range ":15")
+     - --col-first: Show first N columns (e.g., --col-first 5)
+     - --col-last: Show last N columns (e.g., --col-last 3)
+
+     Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
+     """
+     try:
+         with console.status("[bold green]Loading data..."):
+             # Try to load as a pointblank dataset first
+             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+                 data = pb.load_dataset(data_source)
+                 console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
+             else:
+                 # Assume it's a file path or connection string
+                 data = data_source
+                 console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+
+         # Parse columns if provided
+         columns_list = None
+         if columns:
+             columns_list = [col.strip() for col in columns.split(",")]
+
+             # If data has _row_num_ and it's not explicitly included, add it at the beginning
+             try:
+                 from pointblank.validate import (
+                     _process_connection_string,
+                     _process_csv_input,
+                     _process_parquet_input,
+                 )
+
+                 # Process the data source to get actual data object to check for _row_num_
+                 processed_data = data
+                 if isinstance(data, str):
+                     processed_data = _process_connection_string(data)
+                     processed_data = _process_csv_input(processed_data)
+                     processed_data = _process_parquet_input(processed_data)
+
+                 # Get column names from the processed data
+                 all_columns = []
+                 if hasattr(processed_data, "columns"):
+                     all_columns = list(processed_data.columns)
+                 elif hasattr(processed_data, "schema"):
+                     all_columns = list(processed_data.schema.names)
+
+                 # If _row_num_ exists in data but not in user selection, add it at beginning
+                 if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
+                     columns_list = ["_row_num_"] + columns_list
+             except Exception:  # pragma: no cover
+                 # If we can't process the data, just use the user's column list as-is
+                 pass
+         elif col_range or col_first or col_last:
+             # Need to get column names to apply range/first/last selection
+             # Load the data to get column names
+             from pointblank.validate import (
+                 _process_connection_string,
+                 _process_csv_input,
+                 _process_parquet_input,
+             )
+
+             # Process the data source to get actual data object
+             processed_data = data
+             if isinstance(data, str):
+                 processed_data = _process_connection_string(data)
+                 processed_data = _process_csv_input(processed_data)
+                 processed_data = _process_parquet_input(processed_data)
+
+             # Get column names from the processed data
+             all_columns = []
+             if hasattr(processed_data, "columns"):
+                 all_columns = list(processed_data.columns)
+             elif hasattr(processed_data, "schema"):
+                 all_columns = list(processed_data.schema.names)
+             else:
+                 console.print(
+                     "[yellow]Warning: Could not determine column names for range selection[/yellow]"
+                 )
+
+             if all_columns:
+                 # Check if _row_num_ exists and preserve it
+                 has_row_num = "_row_num_" in all_columns
+
+                 if col_range:
+                     # Parse range like "1:10", "5:", ":15"
+                     if ":" in col_range:
+                         parts = col_range.split(":")
+                         start_idx = int(parts[0]) - 1 if parts[0] else 0  # Convert to 0-based
+                         end_idx = int(parts[1]) if parts[1] else len(all_columns)
+
+                         # Filter out _row_num_ from the range selection, we'll add it back later
+                         columns_for_range = [col for col in all_columns if col != "_row_num_"]
+                         selected_columns = columns_for_range[start_idx:end_idx]
+
+                         # Always include _row_num_ at the beginning if it exists
+                         if has_row_num:
+                             columns_list = ["_row_num_"] + selected_columns
+                         else:
+                             columns_list = selected_columns
+                     else:
+                         console.print(
+                             "[yellow]Warning: Invalid range format. Use 'start:end' format[/yellow]"
+                         )
+                 elif col_first:
+                     # Filter out _row_num_ from the first N selection, we'll add it back later
+                     columns_for_first = [col for col in all_columns if col != "_row_num_"]
+                     selected_columns = columns_for_first[:col_first]
+
+                     # Always include _row_num_ at the beginning if it exists
+                     if has_row_num:
+                         columns_list = ["_row_num_"] + selected_columns
+                     else:
+                         columns_list = selected_columns
+                 elif col_last:
+                     # Filter out _row_num_ from the last N selection, we'll add it back later
+                     columns_for_last = [col for col in all_columns if col != "_row_num_"]
+                     selected_columns = columns_for_last[-col_last:]
+
+                     # Always include _row_num_ at the beginning if it exists
+                     if has_row_num:
+                         columns_list = ["_row_num_"] + selected_columns
+                     else:
+                         columns_list = selected_columns
+
+         # Generate preview
+         with console.status("[bold green]Generating preview..."):
+             # Get total dataset size before preview and gather metadata
+             try:
+                 # Process the data to get the actual data object for row count and metadata
+                 from pointblank.validate import (
+                     _process_connection_string,
+                     _process_csv_input,
+                     _process_parquet_input,
+                 )
+
+                 processed_data = data
+                 if isinstance(data, str):
+                     processed_data = _process_connection_string(data)
+                     processed_data = _process_csv_input(processed_data)
+                     processed_data = _process_parquet_input(processed_data)
+
+                 total_dataset_rows = pb.get_row_count(processed_data)
+
+                 # Determine source type and table type for enhanced preview title
+                 if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+                     source_type = f"Pointblank dataset: {data_source}"
+                 else:
+                     source_type = f"External source: {data_source}"
+
+                 table_type = _get_tbl_type(processed_data)
+             except Exception:
+                 # If we can't get metadata, set defaults
+                 total_dataset_rows = None
+                 source_type = f"Data source: {data_source}"
+                 table_type = "unknown"
+
+             gt_table = pb.preview(
+                 data=data,
+                 columns_subset=columns_list,
+                 n_head=head,
+                 n_tail=tail,
+                 limit=limit,
+                 show_row_numbers=not no_row_numbers,
+                 max_col_width=max_col_width,
+                 min_tbl_width=min_table_width,
+                 incl_header=not no_header,
+             )
+
+         if output_html:
+             # Save HTML to file
+             html_content = gt_table.as_raw_html()
+             Path(output_html).write_text(html_content, encoding="utf-8")
+             console.print(f"[green]✓[/green] HTML saved to: {output_html}")
+         else:
+             # Display in terminal with preview context info
+             preview_info = None
+             if total_dataset_rows is not None:
+                 # Determine if we're showing the complete dataset
+                 expected_rows = min(head + tail, limit, total_dataset_rows)
+                 is_complete = total_dataset_rows <= expected_rows
+
+                 preview_info = {
+                     "total_rows": total_dataset_rows,
+                     "head_rows": head,
+                     "tail_rows": tail,
+                     "is_complete": is_complete,
+                     "source_type": source_type,
+                     "table_type": table_type,
+                 }
+
+             _rich_print_gt_table(gt_table, preview_info)
+
+     except Exception as e:  # pragma: no cover
+         console.print(f"[red]Error:[/red] {e}")
+         sys.exit(1)  # pragma: no cover
+
+
+ @cli.command()
+ @click.argument("data_source", type=str)
+ def info(data_source: str):
+     """
+     Display information about a data source.
+
+     Shows table type, dimensions, column names, and data types.
+     """
+     try:
+         with console.status("[bold green]Loading data..."):
+             # Try to load as a pointblank dataset first
+             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+                 data = pb.load_dataset(data_source)
+                 source_type = f"Pointblank dataset: {data_source}"
+                 console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
+             else:
+                 # Assume it's a file path or connection string
+                 data = data_source
+                 source_type = f"External source: {data_source}"
+
+                 # Process the data to get actual table object for inspection
+                 from pointblank.validate import (
+                     _process_connection_string,
+                     _process_csv_input,
+                     _process_parquet_input,
+                 )
+
+                 data = _process_connection_string(data)
+                 data = _process_csv_input(data)
+                 data = _process_parquet_input(data)
+                 console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+
+         # Get table information
+         tbl_type = _get_tbl_type(data)
+         row_count = pb.get_row_count(data)
+         col_count = pb.get_column_count(data)
+
+         # Import the box style for consistent styling with scan table
+         from rich.box import SIMPLE_HEAD
+
+         # Create info table with same styling as scan table
+         info_table = Table(
+             title="Data Source Information",
+             show_header=True,
+             header_style="bold magenta",
+             box=SIMPLE_HEAD,
+             title_style="bold cyan",
+             title_justify="left",
+         )
+         info_table.add_column("Property", style="cyan", no_wrap=True)
+         info_table.add_column("Value", style="green")
+
+         info_table.add_row("Source", source_type)
+         info_table.add_row("Table Type", tbl_type)
+         info_table.add_row("Rows", f"{row_count:,}")
+         info_table.add_row("Columns", f"{col_count:,}")
+
+         console.print()
+         console.print(info_table)
+
+     except Exception as e:
+         console.print(f"[red]Error:[/red] {e}")
+         sys.exit(1)
+
+
+ @cli.command()
+ @click.argument("data_source", type=str)
+ @click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
+ @click.option("--columns", "-c", help="Comma-separated list of columns to scan")
+ def scan(
+     data_source: str,
+     output_html: str | None,
+     columns: str | None,
+ ):
+     """
+     Generate a data scan profile report.
+
+     Produces a comprehensive data profile including:
+
+     \b
+     - Column types and distributions
+     - Missing value patterns
+     - Basic statistics
+     - Data quality indicators
+
+     DATA_SOURCE can be:
+
+     \b
+     - CSV file path (e.g., data.csv)
+     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
+     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
+     """
+     try:
+         import time
+
+         start_time = time.time()
+
+         with console.status("[bold green]Loading data..."):
+             # Try to load as a pointblank dataset first
+             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+                 data = pb.load_dataset(data_source)
+                 console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
+             else:
+                 # Assume it's a file path or connection string
+                 data = data_source
+                 console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+
+         # Parse columns if provided
+         columns_list = None
+         if columns:
+             columns_list = [col.strip() for col in columns.split(",")]
+
+         # Generate data scan
+         with console.status("[bold green]Generating data scan..."):
+             # Use col_summary_tbl for comprehensive column scanning
+             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+                 # For pointblank datasets, data is already the loaded dataframe
+                 scan_result = pb.col_summary_tbl(data=data)
+                 source_type = f"Pointblank dataset: {data_source}"
+                 table_type = _get_tbl_type(data)
+                 # Get row count for footer
+                 try:
+                     total_rows = pb.get_row_count(data)
+                 except Exception:
+                     total_rows = None
+             else:
+                 # For file paths and connection strings, load the data first
+                 from pointblank.validate import (
+                     _process_connection_string,
+                     _process_csv_input,
+                     _process_parquet_input,
+                 )
+
+                 processed_data = _process_connection_string(data)
+                 processed_data = _process_csv_input(processed_data)
+                 processed_data = _process_parquet_input(processed_data)
+                 scan_result = pb.col_summary_tbl(data=processed_data)
+                 source_type = f"External source: {data_source}"
+                 table_type = _get_tbl_type(processed_data)
+                 # Get row count for footer
+                 try:
+                     total_rows = pb.get_row_count(processed_data)
+                 except Exception:
+                     total_rows = None
+
+         scan_time = time.time() - start_time
+
+         if output_html:
+             # Save HTML to file
+             try:
+                 html_content = scan_result.as_raw_html()
+                 Path(output_html).write_text(html_content, encoding="utf-8")
+                 console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")
+             except Exception as e:
+                 console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
+         else:
+             # Display rich scan table in terminal
+             console.print(f"[green]✓[/green] Data scan completed in {scan_time:.2f}s")
+             console.print("Use --output-html to save the full interactive scan report.")
+
+             # Display detailed column summary using rich formatting
+             try:
+                 _rich_print_scan_table(
+                     scan_result, data_source, source_type, table_type, total_rows
+                 )
+
+             except Exception as e:
+                 console.print(f"[yellow]Could not display scan summary: {e}[/yellow]")
+
+     except Exception as e:
+         console.print(f"[red]Error:[/red] {e}")
+         sys.exit(1)
+
+
+ @cli.command()
+ @click.argument("data_source", type=str)
+ @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
+ def missing(data_source: str, output_html: str | None):
+     """
+     Generate a missing values report for a data table.
+
+     DATA_SOURCE can be:
+
+     \b
+     - CSV file path (e.g., data.csv)
+     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
+     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
+     """
+     try:
+         with console.status("[bold green]Loading data..."):
+             # Try to load as a pointblank dataset first
+             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+                 data = pb.load_dataset(data_source)
+                 console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
+             else:
+                 # Assume it's a file path or connection string
+                 data = data_source
+                 console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+
+         # Generate missing values table
+         with console.status("[bold green]Analyzing missing values..."):
+             gt_table = pb.missing_vals_tbl(data)
+
+         # Get original data for column types
+         original_data = data
+         if isinstance(data, str):
+             # Process the data to get the actual data object
+             from pointblank.validate import (
+                 _process_connection_string,
+                 _process_csv_input,
+                 _process_parquet_input,
+             )
+
+             try:
+                 original_data = _process_connection_string(data)
+                 original_data = _process_csv_input(original_data)
+                 original_data = _process_parquet_input(original_data)
+             except Exception:  # pragma: no cover
+                 pass  # Use the string data as fallback
+
+         if output_html:
+             # Save HTML to file
+             html_content = gt_table.as_raw_html()
+             Path(output_html).write_text(html_content, encoding="utf-8")
+             console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
+         else:
+             # Display in terminal with special missing values formatting
+             _rich_print_missing_table(gt_table, original_data)
+
+     except Exception as e:
+         console.print(f"[red]Error:[/red] {e}")
+         sys.exit(1)
+
+
+ @cli.command()
+ @click.argument("output_file", type=click.Path())
+ def validate_example(output_file: str):
+     """
+     Generate an example validation script.
+
+     Creates a sample Python script showing how to use Pointblank for validation.
+     """
+     example_script = '''"""
+ Example Pointblank validation script.
+
+ This script demonstrates how to create validation rules for your data.
+ Modify the validation rules below to match your data requirements.
+ """
+
+ import pointblank as pb
+
+ # Create a validation object
+ # The 'data' variable is automatically provided by the CLI
+ validation = (
+     pb.Validate(
+         data=data,
+         tbl_name="Example Data",
+         label="CLI Validation Example",
+         thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
+     )
+     # Add your validation rules here
+     # Example rules (modify these based on your data structure):
+
+     # Check that specific columns exist
+     # .col_exists(["column1", "column2"])
+
+     # Check for null values
+     # .col_vals_not_null(columns="important_column")
+
+     # Check value ranges
+     # .col_vals_gt(columns="amount", value=0)
+     # .col_vals_between(columns="score", left=0, right=100)
+
+     # Check string patterns
+     # .col_vals_regex(columns="email", pattern=r"^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$")
+
+     # Check unique values
+     # .col_vals_unique(columns="id")
+
+     # Finalize the validation
+     .interrogate()
+ )
+
+ # The validation object will be automatically used by the CLI
+ '''
+
+     Path(output_file).write_text(example_script)
+     console.print(f"[green]✓[/green] Example validation script created: {output_file}")
+     console.print("\nEdit the script to add your validation rules, then run:")
+     console.print(f"[cyan]pb validate your_data.csv {output_file}[/cyan]")
+
+
+ @cli.command()
+ @click.argument("data_source", type=str)
+ @click.argument("validation_script", type=click.Path(exists=True))
+ @click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
+ @click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
+ @click.option("--fail-on-error", is_flag=True, help="Exit with non-zero code if validation fails")
+ def validate(
+     data_source: str,
+     validation_script: str,
+     output_html: str | None,
+     output_json: str | None,
+     fail_on_error: bool,
+ ):
+     """
+     Run validation using a Python validation script.
+
+     DATA_SOURCE can be:
+
+     \b
+     - CSV file path (e.g., data.csv)
+     - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
+     - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
+     - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
+
+     VALIDATION_SCRIPT should be a Python file that defines validation rules.
+     See 'pb validate-example' for a sample script.
+     """
+     try:
+         with console.status("[bold green]Loading data..."):
+             # Try to load as a pointblank dataset first
+             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+                 data = pb.load_dataset(data_source)
+                 console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
+             else:
+                 # Assume it's a file path or connection string
+                 data = data_source
+                 console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+
+         # Execute the validation script
+         with console.status("[bold green]Running validation..."):
+             # Read and execute the validation script
+             script_content = Path(validation_script).read_text()
+
+             # Create a namespace with pointblank and the data
+             namespace = {
+                 "pb": pb,
+                 "pointblank": pb,
+                 "data": data,
+                 "__name__": "__main__",
+             }
+
+             # Execute the script
+             try:
+                 exec(script_content, namespace)
+             except Exception as e:
+                 console.print(f"[red]Error executing validation script:[/red] {e}")
+                 sys.exit(1)
+
+             # Look for a validation object in the namespace
+             validation = None
+
+             # Try to find the 'validation' variable specifically first
+             if "validation" in namespace:
+                 validation = namespace["validation"]
+             else:
+                 # Look for any validation object in the namespace
+                 for key, value in namespace.items():
+                     if hasattr(value, "interrogate") and hasattr(value, "validation_info"):
+                         validation = value
+                         break
+                     # Also check if it's a Validate object that has been interrogated
+                     elif str(type(value)).find("Validate") != -1:
+                         validation = value
+                         break
+
+             if validation is None:
+                 raise ValueError(
+                     "No validation object found in script. "
+                     "Script should create a Validate object and assign it to a variable named 'validation'."
+                 )
+
+         console.print("[green]✓[/green] Validation completed")
+
+         # Display summary
+         _display_validation_summary(validation)
+
+         # Save outputs
+         if output_html:
+             try:
+                 # Get HTML representation
+                 html_content = validation._repr_html_()
+                 Path(output_html).write_text(html_content, encoding="utf-8")
+                 console.print(f"[green]✓[/green] HTML report saved to: {output_html}")
+             except Exception as e:
+                 console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
+
+         if output_json:
+             try:
+                 # Get JSON report
+                 json_report = validation.get_json_report()
+                 Path(output_json).write_text(json_report, encoding="utf-8")
+                 console.print(f"[green]✓[/green] JSON summary saved to: {output_json}")
+             except Exception as e:
+                 console.print(f"[yellow]Warning: Could not save JSON report: {e}[/yellow]")
+
+         # Check if we should fail on error
+         if fail_on_error:
+             try:
+                 if (
+                     hasattr(validation, "validation_info")
+                     and validation.validation_info is not None
+                 ):
+                     info = validation.validation_info
+                     n_critical = sum(1 for step in info if step.critical)
+                     n_error = sum(1 for step in info if step.error)
+
+                     if n_critical > 0 or n_error > 0:
+                         severity = "critical" if n_critical > 0 else "error"
+                         console.print(
+                             f"[red]Exiting with error due to {severity} validation failures[/red]"
+                         )
+                         sys.exit(1)
+             except Exception as e:
+                 console.print(
+                     f"[yellow]Warning: Could not check validation status for fail-on-error: {e}[/yellow]"
+                 )
+
+     except Exception as e:
+         console.print(f"[red]Error:[/red] {e}")
+         sys.exit(1)
+
+
+ @cli.command()
+ @click.argument("data_source", type=str)
+ @click.argument("validation_script", type=click.Path(exists=True))
+ @click.argument("step_number", type=int)
+ @click.option(
+     "--limit", "-l", default=100, help="Maximum number of failing rows to show (default: 100)"
+ )
+ @click.option("--output-csv", type=click.Path(), help="Save failing rows to CSV file")
+ @click.option("--output-html", type=click.Path(), help="Save failing rows table to HTML file")
+ def extract(
+     data_source: str,
+     validation_script: str,
+     step_number: int,
+     limit: int,
+     output_csv: str | None,
+     output_html: str | None,
+ ):
+     """
+     Extract failing rows from a specific validation step.
+
+     This command runs a validation and extracts the rows that failed
+     a specific validation step, which is useful for debugging data quality issues.
+
+     DATA_SOURCE: Same as validate command
+     VALIDATION_SCRIPT: Path to validation script
+     STEP_NUMBER: The step number to extract failing rows from (1-based)
+     """
+     try:
+         with console.status("[bold green]Loading data..."):
+             # Try to load as a pointblank dataset first
+             if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
+                 data = pb.load_dataset(data_source)
+                 console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
+             else:
+                 # Assume it's a file path or connection string
+                 data = data_source
+                 console.print(f"[green]✓[/green] Loaded data source: {data_source}")
+
+         # Execute the validation script
+         with console.status("[bold green]Running validation..."):
+             # Read and execute the validation script
+             script_content = Path(validation_script).read_text()
+
+             # Create a namespace with pointblank and the data
+             namespace = {
+                 "pb": pb,
+                 "pointblank": pb,
+                 "data": data,
+                 "__name__": "__main__",
+             }
+
+             # Execute the script
+             try:
+                 exec(script_content, namespace)
+             except Exception as e:
+                 console.print(f"[red]Error executing validation script:[/red] {e}")
+                 sys.exit(1)
+
+             # Look for a validation object in the namespace
+             validation = None
+             if "validation" in namespace:
+                 validation = namespace["validation"]
+             else:
+                 # Look for any validation object in the namespace
+                 for key, value in namespace.items():
+                     if hasattr(value, "interrogate") and hasattr(value, "validation_info"):
+                         validation = value
+                         break
+                     elif str(type(value)).find("Validate") != -1:
+                         validation = value
+                         break
+
+             if validation is None:
+                 raise ValueError(
+                     "No validation object found in script. "
+                     "Script should create a Validate object and assign it to a variable named 'validation'."
+                 )
+
+         console.print("[green]✓[/green] Validation completed")
+
+         # Extract failing rows from the specified step
+         with console.status(f"[bold green]Extracting failing rows from step {step_number}..."):
+             try:
+                 # Get the data extracts for the specific step
+                 step_extract = validation.get_data_extracts(i=step_number, frame=True)
+
+                 if step_extract is None or len(step_extract) == 0:
+                     console.print(f"[yellow]No failing rows found for step {step_number}[/yellow]")
+                     return
+
+                 # Limit the results
+                 if len(step_extract) > limit:
+                     step_extract = step_extract.head(limit)
+                     console.print(f"[yellow]Limited to first {limit} failing rows[/yellow]")
+
+                 console.print(f"[green]✓[/green] Extracted {len(step_extract)} failing rows")
+
+                 # Save outputs
+                 if output_csv:
+                     if hasattr(step_extract, "write_csv"):
+                         step_extract.write_csv(output_csv)
+                     else:
+                         step_extract.to_csv(output_csv, index=False)
+                     console.print(f"[green]✓[/green] Failing rows saved to CSV: {output_csv}")
+
+                 if output_html:
+                     # Create a preview of the failing rows
+                     preview_table = pb.preview(
+                         step_extract, n_head=min(10, len(step_extract)), n_tail=0
+                     )
+                     html_content = preview_table._repr_html_()
+                     Path(output_html).write_text(html_content, encoding="utf-8")
+                     console.print(
+                         f"[green]✓[/green] Failing rows table saved to HTML: {output_html}"
+                     )
+
+                 if not output_csv and not output_html:
+                     # Display basic info about the failing rows
+                     info_table = Table(
+                         title=f"Failing Rows - Step {step_number}",
+                         show_header=True,
+                         header_style="bold red",
+                     )
+                     info_table.add_column("Property", style="cyan")
+                     info_table.add_column("Value", style="white")
+
+                     info_table.add_row("Total Failing Rows", f"{len(step_extract):,}")
+                     info_table.add_row(
+                         "Columns",
+                         f"{len(step_extract.columns) if hasattr(step_extract, 'columns') else 'N/A'}",
+                     )
+
+                     console.print(info_table)
+                     console.print(
+                         "\n[dim]Use --output-csv or --output-html to save the failing rows.[/dim]"
+                     )
+
+             except Exception as e:
+                 console.print(f"[red]Error extracting failing rows:[/red] {e}")
+                 # Try to provide helpful information
+                 if hasattr(validation, "validation_info") and validation.validation_info:
+                     max_step = len(validation.validation_info)
+                     console.print(f"[yellow]Available steps: 1 to {max_step}[/yellow]")
+
+                     # Show step information
+                     steps_table = Table(title="Available Validation Steps", show_header=True)
+                     steps_table.add_column("Step", style="cyan")
+                     steps_table.add_column("Type", style="white")
+                     steps_table.add_column("Column", style="green")
+                     steps_table.add_column("Has Failures", style="yellow")
+
+                     for i, step in enumerate(validation.validation_info, 1):
+                         has_failures = "Yes" if not step.all_passed else "No"
+                         steps_table.add_row(
+                             str(i),
+                             step.assertion_type,
+                             str(step.column) if step.column else "—",
+                             has_failures,
+                         )
+
+                     console.print(steps_table)
+                 sys.exit(1)
+
+     except Exception as e:
+         console.print(f"[red]Error:[/red] {e}")
+         sys.exit(1)
+
+
1600
+ def _format_missing_percentage(value: float) -> str:
1601
+ """Format missing value percentages for display.
1602
+
1603
+ Args:
1604
+ value: The percentage value (0-100)
1605
+
1606
+ Returns:
1607
+ Formatted string with proper percentage display
1608
+ """
1609
+ if value == 0.0:
1610
+ return "[green]●[/green]" # Large green circle for no missing values
1611
+ elif value == 100.0:
1612
+ return "[red]●[/red]" # Large red circle for completely missing values
1613
+ elif value < 1.0 and value > 0:
1614
+ return "<1%" # Less than 1%
1615
+ elif value > 99.0 and value < 100.0:
1616
+ return ">99%" # More than 99%
1617
+ else:
1618
+ return f"{int(round(value))}%" # Round to nearest integer with % sign
1619
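+ # Illustrative mappings produced by the branches above:
+ # 0.0 -> green ●, 0.4 -> "<1%", 42.3 -> "42%", 99.5 -> ">99%", 100.0 -> red ●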
+
+
+ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
+ """Convert a missing values GT table to Rich table with special formatting.
+
+ Args:
+ gt_table: The GT table object for missing values
+ original_data: The original data source to extract column types
+ """
+ try:
+ # Extract the underlying data from the GT table
+ df = None
+
+ if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
+ df = gt_table._tbl_data
+ elif hasattr(gt_table, "_data") and gt_table._data is not None:
+ df = gt_table._data
+ elif hasattr(gt_table, "data") and gt_table.data is not None:
+ df = gt_table.data
+
+ if df is not None:
+ # Create a Rich table with horizontal lines
+ from rich.box import SIMPLE_HEAD
+
+ rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
+
+ # Get column names
+ columns = []
+ try:
+ if hasattr(df, "columns"):
+ columns = list(df.columns)
+ elif hasattr(df, "schema"):
+ columns = list(df.schema.names)
+ except Exception as e:
+ console.print(f"[red]Error getting columns:[/red] {e}")
+ columns = []
+
+ if not columns:
+ columns = [f"Column {i + 1}" for i in range(10)] # Fallback
+
+ # Get original data to extract column types
+ column_types = {}
+ if original_data is not None:
+ try:
+ # Get column types from original data
+ if hasattr(original_data, "columns"):
+ original_columns = list(original_data.columns)
+ column_types = _get_column_dtypes(original_data, original_columns)
+ except Exception as e:
+ console.print(f"[red]Error getting column types:[/red] {e}")
+ # Fall back to the empty dict created above
+
+ # Add columns to Rich table with special formatting for missing values table
+ sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
+
+ # Two separate columns: Column name (20 chars) and Data type (10 chars)
+ rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
+ rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
+
+ # Sector columns: All same width, optimized for "100%" (4 chars + padding)
+ for sector in sector_columns:
+ rich_table.add_column(
+ sector,
+ style="cyan",
+ justify="center",
+ no_wrap=True,
+ width=5, # Fixed width optimized for percentage values
+ )
+
+ # Convert data to rows with special formatting
+ rows = []
+ try:
+ if hasattr(df, "to_dicts"):
+ data_dict = df.to_dicts()
+ elif hasattr(df, "to_dict"):
+ data_dict = df.to_dict("records")
+ else:
+ data_dict = []
+
+ for i, row in enumerate(data_dict):
+ try:
+ # Each row should have: [column_name, data_type, sector1, sector2, ...]
+ column_name = str(row.get("columns", ""))
+
+ # Truncate column name to 20 characters with ellipsis if needed
+ if len(column_name) > 20:
+ truncated_name = column_name[:17] + "…"
+ else:
+ truncated_name = column_name
+
+ # Get data type for this column
+ if column_name in column_types:
+ dtype = column_types[column_name]
+ if len(dtype) > 10:
+ truncated_dtype = dtype[:9] + "…"
+ else:
+ truncated_dtype = dtype
+ else:
+ truncated_dtype = "?"
+
+ # Start building the row with column name and type
+ formatted_row = [truncated_name, truncated_dtype]
+
+ # Add sector values (formatted percentages)
+ for sector in sector_columns:
+ value = row.get(sector, 0.0)
+ if isinstance(value, (int, float)):
+ formatted_row.append(_format_missing_percentage(float(value)))
+ else:
+ formatted_row.append(str(value))
+
+ rows.append(formatted_row)
+
+ except Exception as e:
+ console.print(f"[red]Error processing row {i}:[/red] {e}")
+ continue
+
+ except Exception as e:
+ console.print(f"[red]Error extracting data:[/red] {e}")
+ rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
+
+ # Add rows to Rich table
+ for row in rows:
+ try:
+ rich_table.add_row(*row)
+ except Exception as e:
+ console.print(f"[red]Error adding row:[/red] {e}")
+ break
+
+ # Show the table with custom spanner header if we have sector columns
+ if sector_columns:
+ # Create a custom header line that shows the spanner
+ header_parts = []
+ header_parts.append(" " * 20) # Space for Column header
+ header_parts.append(" " * 10) # Space for Type header
+
+ # Left-align "Row Sectors" with the first numbered column
+ row_sectors_text = "Row Sectors"
+ header_parts.append(row_sectors_text)
+
+ # Print the custom spanner header
+ console.print("[dim]" + " ".join(header_parts) + "[/dim]")
+
+ # Add a horizontal rule below the spanner
+ rule_parts = []
+ rule_parts.append(" " * 20) # Space for Column header
+ rule_parts.append(" " * 10) # Space for Type header
+
+ # Use a fixed width horizontal rule for "Row Sectors"
+ horizontal_rule = "─" * 20
+ rule_parts.append(horizontal_rule)
+
+ # Print the horizontal rule
+ console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
+
+ # Print the Rich table (will handle terminal width automatically)
+ console.print(rich_table)
+ footer_text = (
+ "[dim]Symbols: [green]●[/green] = no missing values, "
+ "[red]●[/red] = completely missing, "
+ "<1% = less than 1% missing, "
+ ">99% = more than 99% missing[/dim]"
+ )
+ console.print(footer_text)
+
+ else:
+ # Fallback to regular table display
+ _rich_print_gt_table(gt_table)
+
+ except Exception as e:
+ console.print(f"[red]Error rendering missing values table:[/red] {e}")
+ # Fallback to regular table display
+ _rich_print_gt_table(gt_table)
+
+
+ def _rich_print_scan_table(
+ scan_result: Any,
+ data_source: str,
+ source_type: str,
+ table_type: str,
+ total_rows: int | None = None,
+ ) -> None:
+ """
+ Display scan results as a Rich table in the terminal with statistical measures.
+
+ Args:
+ scan_result: The GT object from col_summary_tbl()
+ data_source: Name of the data source being scanned
+ source_type: Type of data source (e.g., "Pointblank dataset: small_table")
+ table_type: Type of table (e.g., "polars.LazyFrame")
+ total_rows: Total number of rows in the dataset
+ """
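+ # Hypothetical call for illustration (argument names and counts are made up):
+ # _rich_print_scan_table(
+ # pb.col_summary_tbl(data=df), "data.csv", "CSV file", "polars.DataFrame",
+ # total_rows=1_000,
+ # )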
+ try:
+ import re
+
+ import narwhals as nw
+ from rich.box import SIMPLE_HEAD
+
+ # Extract the underlying DataFrame from the GT object
+ # The GT object has a _tbl_data attribute that contains the DataFrame
+ gt_data = scan_result._tbl_data
+
+ # Convert to Narwhals DataFrame for consistent handling
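+ # (narwhals provides a backend-agnostic frame API over pandas, polars, etc.,
+ # so the conversion below keeps the rest of this function backend-neutral)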
+ nw_data = nw.from_native(gt_data)
+
+ # Convert to dictionary for easier access
+ data_dict = nw_data.to_dict(as_series=False)
+
+ # Create main scan table with missing data table styling
+ # Create a comprehensive title with data source, source type, and table type
+ title_text = f"Column Summary / {source_type} / {table_type}"
+
+ scan_table = Table(
+ title=title_text,
+ show_header=True,
+ header_style="bold magenta",
+ box=SIMPLE_HEAD,
+ title_style="bold cyan",
+ title_justify="left",
+ )
+
+ # Add columns with specific styling and appropriate widths
+ scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
+ scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
+ scan_table.add_column(
+ "NA", style="red", width=6, justify="right"
+ ) # Adjusted for better formatting
+ scan_table.add_column(
+ "UQ", style="green", width=8, justify="right"
+ ) # Adjusted for boolean values
+
+ # Add statistical columns if they exist with appropriate widths
+ stat_columns = []
+ column_mapping = {
+ "mean": ("Mean", "blue", 9),
+ "std": ("SD", "blue", 9),
+ "min": ("Min", "yellow", 9),
+ "median": ("Med", "yellow", 9),
+ "max": ("Max", "yellow", 9),
+ "q_1": ("Q₁", "magenta", 8),
+ "q_3": ("Q₃", "magenta", 9),
+ "iqr": ("IQR", "magenta", 8),
+ }
+
+ for col_key, (display_name, color, width) in column_mapping.items():
+ if col_key in data_dict:
+ scan_table.add_column(display_name, style=color, width=width, justify="right")
+ stat_columns.append(col_key)
+
+ # Helper function to extract column name and type from HTML
+ def extract_column_info(html_content: str) -> tuple[str, str]:
+ """Extract column name and type from HTML formatted content."""
+ # Extract column name from first div
+ name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
+ column_name = name_match.group(1) if name_match else "Unknown"
+
+ # Extract data type from second div (with gray color)
+ type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
+ if type_match:
+ data_type = type_match.group(1)
+ # Convert to compact format using the existing function
+ compact_type = _format_dtype_compact(data_type)
+ data_type = compact_type
+ else:
+ data_type = "unknown"
+
+ return column_name, data_type
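+ # Illustrative input/output (the real HTML comes from col_summary_tbl(), and
+ # the compact form depends on _format_dtype_compact):
+ # '<div>price</div><div style="color: gray;">Float64</div>'
+ # -> ("price", _format_dtype_compact("Float64"))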
+
+ # Helper function to format values with improved number formatting
+ def format_value(
+ value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
+ ) -> str:
+ """Format values for display with smart number formatting and HTML cleanup."""
+ if value is None or (isinstance(value, str) and value.strip() == ""):
+ return "[dim]—[/dim]"
+
+ # Handle missing values indicator
+ if is_missing and str(value) == "0":
+ return "[green]●[/green]" # No missing values
+
+ # Clean up HTML formatting from the raw data
+ str_val = str(value)
+
+ # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
+ if "<br>" in str_val:
+ str_val = str_val.split("<br>")[0].strip()
+ # For unique values, we want just the integer part
+ if is_unique:
+ try:
+ # Try to extract just the integer part for unique counts
+ num_val = float(str_val)
+ return str(int(num_val))
+ except (ValueError, TypeError):
+ pass
+
+ # Now handle HTML content (especially from boolean unique values)
+ if "<" in str_val and ">" in str_val:
+ # Remove HTML tags completely for cleaner display
+ str_val = re.sub(r"<[^>]+>", "", str_val).strip()
+ # Clean up extra whitespace
+ str_val = re.sub(r"\s+", " ", str_val).strip()
+
+ # Handle values like "2<.01" - extract the first number
+ if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
+ # Extract number before the < symbol
+ before_lt = str_val.split("<")[0].strip()
+ if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
+ str_val = before_lt
+
+ # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
+ if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
+ # Extract T and F values
+ t_match = re.search(r"T(\d+\.\d+)", str_val)
+ f_match = re.search(r"F(\d+\.\d+)", str_val)
+ if t_match and f_match:
+ t_val = float(t_match.group(1))
+ f_val = float(f_match.group(1))
+ # Show as "T0.62F0.38" but truncated if needed
+ formatted = f"T{t_val:.2f}F{f_val:.2f}"
+ if len(formatted) > max_width:
+ # Truncate to fit, showing dominant value
+ if t_val > f_val:
+ return f"T{t_val:.1f}"
+ else:
+ return f"F{f_val:.1f}"
+ return formatted
+
+ # Try to parse as a number for better formatting
+ try:
+ # Try to convert to float first
+ num_val = float(str_val)
+
+ # Handle special cases
+ if num_val == 0:
+ return "0"
+ elif abs(num_val) == int(abs(num_val)) and abs(num_val) < 10000:
+ # Simple integers under 10000
+ return str(int(num_val))
+ elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
+ # Likely dates in YYYYMMDD format - format as date-like
+ int_val = int(num_val)
+ if 19000101 <= int_val <= 29991231: # Reasonable date range
+ str_date = str(int_val)
+ if len(str_date) == 8:
+ return (
+ f"{str_date[:4]}-{str_date[4:6]}-{str_date[6:]}"[: max_width - 1]
+ + "…"
+ )
+ # Otherwise treat as large number
+ return f"{num_val / 1000000:.1f}M"
+ elif abs(num_val) >= 1000000:
+ # Large numbers - use scientific notation or M/k notation
+ if abs(num_val) >= 1000000000:
+ return f"{num_val:.1e}"
+ else:
+ return f"{num_val / 1000000:.1f}M"
+ elif abs(num_val) >= 10000:
+ # Numbers >= 10k - use compact notation
+ return f"{num_val / 1000:.1f}k"
+ elif abs(num_val) >= 100:
+ # Numbers 100-9999 - show with minimal decimals
+ return f"{num_val:.1f}"
+ elif abs(num_val) >= 10:
+ # Numbers 10-99 - show with one decimal
+ return f"{num_val:.1f}"
+ elif abs(num_val) >= 1:
+ # Numbers 1-9 - show with two decimals
+ return f"{num_val:.2f}"
+ elif abs(num_val) >= 0.01:
+ # Small numbers - show with appropriate precision
+ return f"{num_val:.2f}"
+ else:
+ # Very small numbers - use scientific notation
+ return f"{num_val:.1e}"
+
+ except (ValueError, TypeError):
+ # Not a number, handle as string
+ pass
+
+ # Handle date/datetime strings - show abbreviated format
+ if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
+ # Likely a date/datetime, show abbreviated
+ if len(str_val) > max_width:
+ return str_val[: max_width - 1] + "…"
+
+ # General string truncation with ellipsis
+ if len(str_val) > max_width:
+ return str_val[: max_width - 1] + "…"
+
+ return str_val
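+ # Illustrative outcomes with the default max_width=8 (traced from the
+ # branches above):
+ # 12345.6 -> "12.3k", 2500000 -> "2.5M", 0.0043 -> "4.3e-03",
+ # 20230115 -> "2023-01…" (YYYYMMDD heuristic), "T0.62F0.38" -> "T0.6"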
+
+ # Populate table rows
+ num_rows = len(data_dict["colname"])
+ for i in range(num_rows):
+ row_data = []
+
+ # Column name and type from HTML content
+ colname_html = data_dict["colname"][i]
+ column_name, data_type = extract_column_info(colname_html)
+ row_data.append(column_name)
+ row_data.append(data_type)
+
+ # Missing values (NA)
+ missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
+ row_data.append(format_value(missing_val, is_missing=True, max_width=6))
+
+ # Unique values (UQ)
+ unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
+ row_data.append(format_value(unique_val, is_unique=True, max_width=8))
+
+ # Statistical columns
+ for stat_col in stat_columns:
+ stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
+ # Use appropriate width based on column type
+ if stat_col in ["q_1", "iqr"]:
+ width = 8
+ elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
+ width = 9
+ else:
+ width = 8
+ row_data.append(format_value(stat_val, max_width=width))
+
+ scan_table.add_row(*row_data)
+
+ # Display the results
+ console.print()
+ console.print(scan_table)
+
+ # Add informational footer about the scan scope
+ try:
+ if total_rows is not None:
+ # Full table scan
+ footer_text = f"[dim]Scan from all {total_rows:,} rows in the table.[/dim]"
+
+ # Create a simple footer
+ footer_table = Table(
+ show_header=False,
+ show_lines=False,
+ box=None,
+ padding=(0, 0),
+ )
+ footer_table.add_column("", style="dim", width=80)
+ footer_table.add_row(footer_text)
+ console.print(footer_table)
+
+ except Exception:
+ # If we can't determine the scan scope, don't show a footer
+ pass
+
+ except Exception as e:
+ # Fallback to simple message if table creation fails
+ console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
+ console.print(f"[red]Error displaying table: {str(e)}[/red]")
+
+
+ @cli.command(name="validate-simple")
2074
+ @click.argument("data_source", type=str)
2075
+ @click.option(
2076
+ "--check",
2077
+ type=click.Choice(
2078
+ [
2079
+ "rows-distinct",
2080
+ "col-vals-not-null",
2081
+ "rows-complete",
2082
+ "col-exists",
2083
+ "col-vals-in-set",
2084
+ "col-vals-gt",
2085
+ "col-vals-ge",
2086
+ "col-vals-lt",
2087
+ "col-vals-le",
2088
+ ]
2089
+ ),
2090
+ default="rows-distinct",
2091
+ help="Type of validation check to perform",
2092
+ )
2093
+ @click.option(
2094
+ "--column",
2095
+ help="Column name to validate (required for col-vals-not-null, col-exists, col-vals-in-set, col-vals-gt, col-vals-ge, col-vals-lt, and col-vals-le checks)",
2096
+ )
2097
+ @click.option("--set", help="Comma-separated allowed values (required for col-vals-in-set check)")
2098
+ @click.option(
2099
+ "--value",
2100
+ type=float,
2101
+ help="Numeric value for comparison (required for col-vals-gt, col-vals-ge, col-vals-lt, and col-vals-le checks)",
2102
+ )
2103
+ @click.option(
2104
+ "--show-extract", is_flag=True, help="Show preview of failing rows if validation fails"
2105
+ )
2106
+ @click.option(
2107
+ "--limit", "-l", default=10, help="Maximum number of failing rows to show (default: 10)"
2108
+ )
2109
+ @click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
2110
+ def validate_simple(
2111
+ data_source: str,
2112
+ check: str,
2113
+ column: str | None,
2114
+ set: str | None,
2115
+ value: float | None,
2116
+ show_extract: bool,
2117
+ limit: int,
2118
+ exit_code: bool,
2119
+ ):
2120
+ """
2121
+ Perform simple, single-step validations directly from the command line.
2122
+
2123
+ This command provides a quick way to perform common data validation checks
2124
+ without needing to write a validation script.
2125
+
2126
+ DATA_SOURCE can be:
2127
+
2128
+ \b
2129
+ - CSV file path (e.g., data.csv)
2130
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
2131
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
2132
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
2133
+
2134
+ AVAILABLE CHECKS:
2135
+
2136
+ \b
2137
+ - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
2138
+ - rows-complete: Check if all rows are complete (no missing values in any column)
2139
+ - col-exists: Check if a specific column exists in the dataset (requires --column)
2140
+ - col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
2141
+ - col-vals-gt: Check if all values in a column are greater than a threshold (requires --column and --value)
2142
+ - col-vals-ge: Check if all values in a column are greater than or equal to a threshold (requires --column and --value)
2143
+ - col-vals-lt: Check if all values in a column are less than a threshold (requires --column and --value)
2144
+ - col-vals-le: Check if all values in a column are less than or equal to a threshold (requires --column and --value)
2145
+ - col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
2146
+
2147
+ Examples:
2148
+
2149
+ \b
2150
+ pb validate-simple data.csv --check rows-distinct
2151
+ pb validate-simple data.csv --check rows-distinct --show-extract
2152
+ pb validate-simple data.csv --check rows-distinct --exit-code
2153
+ pb validate-simple data.csv --check rows-complete
2154
+ pb validate-simple data.csv --check col-exists --column price
2155
+ pb validate-simple data.csv --check col-vals-not-null --column email
2156
+ pb validate-simple data.csv --check col-vals-gt --column score --value 50
2157
+ pb validate-simple data.csv --check col-vals-in-set --column status --set "active,inactive,pending"
2158
+ """
2159
+ try:
+ # Validate required parameters for different check types
+ if check == "col-vals-not-null" and not column:
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-not-null --column email"
+ )
+ sys.exit(1)
+
+ if check == "col-exists" and not column:
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
+ console.print("Example: pb validate-simple data.csv --check col-exists --column price")
+ sys.exit(1)
+
+ if check == "col-vals-in-set" and not column:
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-in-set --column status --set 'active,inactive'"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-in-set" and not set:
+ console.print(f"[red]Error:[/red] --set is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-in-set --column status --set 'active,inactive,pending'"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-gt" and not column:
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-gt --column score --value 50"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-gt" and value is None:
+ console.print(f"[red]Error:[/red] --value is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-gt --column score --value 50"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-ge" and not column:
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-ge --column age --value 18"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-ge" and value is None:
+ console.print(f"[red]Error:[/red] --value is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-ge --column age --value 18"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-lt" and not column:
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-lt --column age --value 65"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-lt" and value is None:
+ console.print(f"[red]Error:[/red] --value is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-lt --column age --value 65"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-le" and not column:
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-le --column score --value 100"
+ )
+ sys.exit(1)
+
+ if check == "col-vals-le" and value is None:
+ console.print(f"[red]Error:[/red] --value is required for {check} check")
+ console.print(
+ "Example: pb validate-simple data.csv --check col-vals-le --column score --value 100"
+ )
+ sys.exit(1)
+
+ with console.status("[bold green]Loading data..."):
2245
+ # Try to load as a pointblank dataset first
2246
+ if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
2247
+ data = pb.load_dataset(data_source)
2248
+ console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
2249
+ else:
2250
+ # Assume it's a file path or connection string
2251
+ data = data_source
2252
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
2253
+
2254
+ # Perform the validation based on the check type
2255
+ with console.status(f"[bold green]Running {check} validation..."):
2256
+ if check == "rows-distinct":
2257
+ # Create validation for duplicate rows
2258
+ validation = (
2259
+ pb.Validate(
2260
+ data=data,
2261
+ tbl_name=f"Data from {data_source}",
2262
+ label=f"CLI Simple Validation: {check}",
2263
+ )
2264
+ .rows_distinct()
2265
+ .interrogate()
2266
+ )
2267
+
2268
+ # Get the result
2269
+ all_passed = validation.all_passed()
2270
+
2271
+ console.print(
2272
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2273
+ )
2274
+ elif check == "col-vals-not-null":
2275
+ # Create validation for not null values in specified column
2276
+ validation = (
2277
+ pb.Validate(
2278
+ data=data,
2279
+ tbl_name=f"Data from {data_source}",
2280
+ label=f"CLI Simple Validation: {check} on column '{column}'",
2281
+ )
2282
+ .col_vals_not_null(columns=column)
2283
+ .interrogate()
2284
+ )
2285
+
2286
+ # Get the result
2287
+ all_passed = validation.all_passed()
2288
+
2289
+ console.print(
2290
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2291
+ )
2292
+ elif check == "rows-complete":
2293
+ # Create validation for complete rows (no missing values in any column)
2294
+ validation = (
2295
+ pb.Validate(
2296
+ data=data,
2297
+ tbl_name=f"Data from {data_source}",
2298
+ label=f"CLI Simple Validation: {check}",
2299
+ )
2300
+ .rows_complete()
2301
+ .interrogate()
2302
+ )
2303
+
2304
+ # Get the result
2305
+ all_passed = validation.all_passed()
2306
+
2307
+ console.print(
2308
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2309
+ )
2310
+ elif check == "col-exists":
2311
+ # Create validation for column existence
2312
+ validation = (
2313
+ pb.Validate(
2314
+ data=data,
2315
+ tbl_name=f"Data from {data_source}",
2316
+ label=f"CLI Simple Validation: {check} for column '{column}'",
2317
+ )
2318
+ .col_exists(columns=column)
2319
+ .interrogate()
2320
+ )
2321
+
2322
+ # Get the result
2323
+ all_passed = validation.all_passed()
2324
+
2325
+ console.print(
2326
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2327
+ )
2328
+ elif check == "col-vals-in-set":
2329
+ # Parse the comma-separated set values
2330
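+ # e.g. --set "active, inactive" becomes ["active", "inactive"]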
+ allowed_values = [v.strip() for v in set.split(",")]
+
+ # Create validation for values in set
+ validation = (
+ pb.Validate(
+ data=data,
+ tbl_name=f"Data from {data_source}",
+ label=f"CLI Simple Validation: {check} for column '{column}'",
+ )
+ .col_vals_in_set(columns=column, set=allowed_values)
+ .interrogate()
+ )
+
+ # Get the result
+ all_passed = validation.all_passed()
+
+ console.print(
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
+ )
+ elif check == "col-vals-gt":
+ # Create validation for values greater than threshold
+ validation = (
+ pb.Validate(
+ data=data,
+ tbl_name=f"Data from {data_source}",
+ label=f"CLI Simple Validation: {check} for column '{column}' > {value}",
+ )
+ .col_vals_gt(columns=column, value=value)
+ .interrogate()
+ )
+
+ # Get the result
+ all_passed = validation.all_passed()
+
+ console.print(
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
+ )
+ elif check == "col-vals-ge":
+ # Create validation for values greater than or equal to threshold
+ validation = (
+ pb.Validate(
+ data=data,
+ tbl_name=f"Data from {data_source}",
+ label=f"CLI Simple Validation: {check} for column '{column}' >= {value}",
+ )
+ .col_vals_ge(columns=column, value=value)
+ .interrogate()
+ )
+
+ # Get the result
+ all_passed = validation.all_passed()
+
+ console.print(
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
+ )
+ elif check == "col-vals-lt":
+ # Create validation for values less than threshold
+ validation = (
+ pb.Validate(
+ data=data,
+ tbl_name=f"Data from {data_source}",
+ label=f"CLI Simple Validation: {check} for column '{column}' < {value}",
+ )
+ .col_vals_lt(columns=column, value=value)
+ .interrogate()
+ )
+
+ # Get the result
+ all_passed = validation.all_passed()
+
+ console.print(
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
+ )
+ elif check == "col-vals-le":
+ # Create validation for values less than or equal to threshold
+ validation = (
+ pb.Validate(
+ data=data,
+ tbl_name=f"Data from {data_source}",
+ label=f"CLI Simple Validation: {check} for column '{column}' <= {value}",
+ )
+ .col_vals_le(columns=column, value=value)
+ .interrogate()
+ )
+
+ # Get the result
+ all_passed = validation.all_passed()
+
+ console.print(
+ f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
+ )
+ else:
+ # This shouldn't happen due to click.Choice, but just in case
+ console.print(f"[red]Error:[/red] Unknown check type: {check}")
+ sys.exit(1)
+
+ # Display results
+ from rich.box import SIMPLE_HEAD
+
+ # Create friendly title for table
+ if check == "rows-distinct":
+ table_title = "Validation Result: Rows Distinct"
+ elif check == "col-vals-not-null":
+ table_title = "Validation Result: Column Values Not Null"
+ elif check == "rows-complete":
+ table_title = "Validation Result: Rows Complete"
+ elif check == "col-exists":
+ table_title = "Validation Result: Column Exists"
+ elif check == "col-vals-in-set":
+ table_title = "Validation Result: Column Values In Set"
+ elif check == "col-vals-gt":
+ table_title = "Validation Result: Column Values Greater Than"
+ elif check == "col-vals-ge":
+ table_title = "Validation Result: Column Values Greater Than Or Equal"
+ elif check == "col-vals-lt":
+ table_title = "Validation Result: Column Values Less Than"
+ elif check == "col-vals-le":
+ table_title = "Validation Result: Column Values Less Than Or Equal"
+ else:
+ table_title = f"Validation Result: {check.replace('-', ' ').title()}"
+
+ result_table = Table(
+ title=table_title,
+ show_header=True,
+ header_style="bold magenta",
+ box=SIMPLE_HEAD,
+ title_style="bold cyan",
+ title_justify="left",
+ )
+ result_table.add_column("Property", style="cyan", no_wrap=True)
+ result_table.add_column("Value", style="white")
+
+ # Add basic info
+ result_table.add_row("Data Source", data_source)
+ result_table.add_row("Check Type", check)
+
+ # Add column info for column-specific checks
+ if check in [
+ "col-vals-not-null",
+ "col-exists",
+ "col-vals-in-set",
+ "col-vals-gt",
+ "col-vals-ge",
+ "col-vals-lt",
+ "col-vals-le",
+ ]:
+ result_table.add_row("Column", column)
+
+ # Add set info for col-vals-in-set check
+ if check == "col-vals-in-set":
+ allowed_values = [v.strip() for v in set.split(",")]
+ result_table.add_row("Allowed Values", ", ".join(allowed_values))
+
+ # Add value info for range checks
+ if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]:
+ if check == "col-vals-gt":
+ operator = ">"
+ elif check == "col-vals-ge":
+ operator = ">="
+ elif check == "col-vals-lt":
+ operator = "<"
+ elif check == "col-vals-le":
+ operator = "<="
+ result_table.add_row("Threshold", f"{operator} {value}")
+
+ # Get validation details
+ if hasattr(validation, "validation_info") and validation.validation_info:
+ step_info = validation.validation_info[0] # Should only be one step
+ result_table.add_row("Total Rows Tested", f"{step_info.n:,}")
+ result_table.add_row("Passing Rows", f"{step_info.n_passed:,}")
+ result_table.add_row("Failing Rows", f"{step_info.n_failed:,}")
+
+ # Overall result with color coding
+ if all_passed:
+ result_table.add_row("Result", "[green]✓ PASSED[/green]")
+ if check == "rows-distinct":
+ result_table.add_row("Duplicate Rows", "[green]None found[/green]")
+ elif check == "col-vals-not-null":
+ result_table.add_row("Null Values", "[green]None found[/green]")
+ elif check == "rows-complete":
+ result_table.add_row("Incomplete Rows", "[green]None found[/green]")
+ elif check == "col-exists":
+ result_table.add_row("Column Status", "[green]Column exists[/green]")
+ elif check == "col-vals-in-set":
+ result_table.add_row(
+ "Values Status", "[green]All values in allowed set[/green]"
+ )
+ elif check == "col-vals-gt":
2518
+ result_table.add_row("Values Status", f"[green]All values > {value}[/green]")
2519
+ elif check == "col-vals-ge":
2520
+ result_table.add_row("Values Status", f"[green]All values >= {value}[/green]")
2521
+ else:
2522
+ result_table.add_row("Result", "[red]✗ FAILED[/red]")
2523
+ if check == "rows-distinct":
2524
+ result_table.add_row(
2525
+ "Duplicate Rows", f"[red]{step_info.n_failed:,} found[/red]"
2526
+ )
2527
+ elif check == "col-vals-not-null":
2528
+ result_table.add_row("Null Values", f"[red]{step_info.n_failed:,} found[/red]")
2529
+ elif check == "rows-complete":
2530
+ result_table.add_row(
2531
+ "Incomplete Rows", f"[red]{step_info.n_failed:,} found[/red]"
2532
+ )
2533
+ elif check == "col-exists":
2534
+ result_table.add_row("Column Status", "[red]Column does not exist[/red]")
2535
+ elif check == "col-vals-in-set":
2536
+ result_table.add_row(
2537
+ "Invalid Values", f"[red]{step_info.n_failed:,} found[/red]"
2538
+ )
2539
+ elif check == "col-vals-gt":
2540
+ result_table.add_row(
2541
+ "Invalid Values", f"[red]{step_info.n_failed:,} values <= {value}[/red]"
2542
+ )
2543
+ elif check == "col-vals-ge":
2544
+ result_table.add_row(
2545
+ "Invalid Values", f"[red]{step_info.n_failed:,} values < {value}[/red]"
2546
+ )
2547
+
+ console.print()
+ console.print(result_table)
+
+ # Show extract if requested and validation failed
+ if show_extract and not all_passed:
+ console.print()
+
+ # Dynamic message based on check type
+ if check == "rows-distinct":
+ extract_message = "[yellow]Preview of failing rows (duplicates):[/yellow]"
+ row_type = "duplicate rows"
+ elif check == "rows-complete":
+ extract_message = "[yellow]Preview of failing rows (incomplete rows):[/yellow]"
+ row_type = "incomplete rows"
+ elif check == "col-exists":
+ extract_message = (
+ f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
+ )
+ row_type = "missing column"
+ elif check == "col-vals-in-set":
+ extract_message = (
+ f"[yellow]Preview of failing rows (invalid values in '{column}'):[/yellow]"
+ )
+ row_type = "rows with invalid values"
+ elif check == "col-vals-gt":
+ extract_message = (
+ f"[yellow]Preview of failing rows (values in '{column}' <= {value}):[/yellow]"
+ )
+ row_type = f"rows with values <= {value}"
+ elif check == "col-vals-ge":
+ extract_message = (
+ f"[yellow]Preview of failing rows (values in '{column}' < {value}):[/yellow]"
+ )
+ row_type = f"rows with values < {value}"
+ elif check == "col-vals-lt":
+ extract_message = (
+ f"[yellow]Preview of failing rows (values in '{column}' >= {value}):[/yellow]"
+ )
+ row_type = f"rows with values >= {value}"
+ elif check == "col-vals-le":
+ extract_message = (
+ f"[yellow]Preview of failing rows (values in '{column}' > {value}):[/yellow]"
+ )
+ row_type = f"rows with values > {value}"
+ else:
+ extract_message = "[yellow]Preview of failing rows:[/yellow]"
+ row_type = "failing rows"
+
+ console.print(extract_message)
+
+ # Special handling for col-exists check - no rows to show when column doesn't exist
+ if check == "col-exists" and not all_passed:
+ console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
+ console.print(
+ "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
+ )
+ else:
+ try:
+ # Get failing rows extract
+ failing_rows = validation.get_data_extracts(i=1, frame=True)
+
+ if failing_rows is not None and len(failing_rows) > 0:
+ # Limit the number of rows shown
+ if len(failing_rows) > limit:
+ display_rows = failing_rows.head(limit)
+ console.print(
+ f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
+ )
+ else:
+ display_rows = failing_rows
+ console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")
+
+ # Create a preview table using pointblank's preview function
+ preview_table = pb.preview(
+ data=display_rows,
+ n_head=min(limit, len(display_rows)),
+ n_tail=0,
+ limit=limit,
+ show_row_numbers=True,
+ )
+
+ # Display using our Rich table function
+ _rich_print_gt_table(preview_table)
+ else:
+ console.print("[yellow]No failing rows could be extracted[/yellow]")
+ except Exception as e:
+ console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
+
+ # Summary message
+ console.print()
+ if all_passed:
+ if check == "rows-distinct":
+ success_message = (
+ f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
+ )
+ elif check == "col-vals-not-null":
+ success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
+ elif check == "rows-complete":
+ success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
+ elif check == "col-exists":
+ success_message = (
+ f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
+ )
+ elif check == "col-vals-in-set":
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
+ elif check == "col-vals-gt":
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
+ elif check == "col-vals-ge":
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
+ elif check == "col-vals-lt":
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
+ elif check == "col-vals-le":
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
+ else:
+ success_message = (
+ f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
+ )
+
+ console.print(
+ Panel(
+ success_message,
+ border_style="green",
+ )
+ )
+ else:
+ if hasattr(validation, "validation_info") and validation.validation_info:
+ step_info = validation.validation_info[0]
+
+ if check == "rows-distinct":
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
+ elif check == "col-vals-not-null":
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
+ elif check == "rows-complete":
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
+ elif check == "col-exists":
+ failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
+ elif check == "col-vals-in-set":
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
+ elif check == "col-vals-gt":
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
+ elif check == "col-vals-ge":
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
+ elif check == "col-vals-lt":
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
+ elif check == "col-vals-le":
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
+ else:
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
+
+ # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
+ if not show_extract and check != "col-exists":
+ failure_message += (
+ "\n[dim]💡 Tip: Use --show-extract to see the failing rows[/dim]"
+ )
+
+ console.print(
+ Panel(
+ failure_message,
+ border_style="red",
+ )
+ )
+ else:
+ if check == "rows-distinct":
+ failure_message = (
+ f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
+ )
+ elif check == "rows-complete":
+ failure_message = (
+ f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
+ )
+ else:
+ failure_message = (
+ f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
+ )
+
+ # Add hint about --show-extract if not already used
+ if not show_extract:
+ failure_message += (
+ "\n[dim]💡 Tip: Use --show-extract to see the failing rows[/dim]"
+ )
+
+ console.print(
+ Panel(
+ failure_message,
+ border_style="red",
+ )
+ )
+
+ # Exit with appropriate code if requested
+ if exit_code and not all_passed:
+ console.print("[dim]Exiting with non-zero code due to validation failure[/dim]")
+ sys.exit(1)
+
+ except Exception as e:
+ console.print(f"[red]Error:[/red] {e}")
+ sys.exit(1)
+
+
+ if __name__ == "__main__": # pragma: no cover
+ cli()