pointblank 0.11.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/cli.py CHANGED
@@ -15,6 +15,81 @@ from pointblank._utils import _get_tbl_type, _is_lib_present
15
15
  console = Console()
16
16
 
17
17
 
18
+ class OrderedGroup(click.Group):
19
+ """A Click Group that displays commands in a custom order."""
20
+
21
+ def list_commands(self, ctx):
22
+ """Return commands in the desired logical order."""
23
+ # Define the desired order
24
+ desired_order = [
25
+ # Data Discovery/Exploration
26
+ "info",
27
+ "preview",
28
+ "scan",
29
+ "missing",
30
+ # Validation
31
+ "validate",
32
+ "run",
33
+ "make-template",
34
+ # Utilities
35
+ "datasets",
36
+ "requirements",
37
+ ]
38
+
39
+ # Get all available commands
40
+ available_commands = super().list_commands(ctx)
41
+
42
+ # Return commands in desired order, followed by any not in the list
43
+ ordered = []
44
+ for cmd in desired_order:
45
+ if cmd in available_commands:
46
+ ordered.append(cmd)
47
+
48
+ # Add any commands not in our desired order (safety fallback)
49
+ for cmd in available_commands:
50
+ if cmd not in ordered:
51
+ ordered.append(cmd)
52
+
53
+ return ordered
54
+
55
+
56
+ def _load_data_source(data_source: str) -> Any:
57
+ """
58
+ Centralized data loading function for CLI that handles all supported data source types.
59
+
60
+ This function provides a consistent way to load data across all CLI commands by leveraging
61
+ the _process_data() utility function and adding support for pointblank dataset names.
62
+
63
+ Parameters
64
+ ----------
65
+ data_source : str
66
+ The data source which could be:
67
+ - A pointblank dataset name (small_table, game_revenue, nycflights, global_sales)
68
+ - A GitHub URL pointing to a CSV or Parquet file
69
+ - A database connection string (e.g., "duckdb:///path/to/file.ddb::table_name")
70
+ - A CSV file path (string or Path object with .csv extension)
71
+ - A Parquet file path, glob pattern, directory, or partitioned dataset
72
+
73
+ Returns
74
+ -------
75
+ Any
76
+ Loaded data as a DataFrame or other data object
77
+
78
+ Raises
79
+ ------
80
+ ValueError
81
+ If the pointblank dataset name is not recognized
82
+ """
83
+ # Check if it's a pointblank dataset name first
84
+ if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
85
+ return pb.load_dataset(data_source)
86
+
87
+ # Otherwise, use the centralized _process_data() function for all other data sources
88
+ from pointblank.validate import _process_data
89
+
90
+ return _process_data(data_source)
91
+
92
+
18
93
  def _format_cell_value(
19
94
  value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
20
95
  ) -> str:
@@ -206,173 +281,443 @@ def _format_dtype_compact(dtype_str: str) -> str:
206
281
  return dtype_str
207
282
 
208
283
 
209
- def _rich_print_gt_table(gt_table: Any, preview_info: dict | None = None) -> None:
210
- """Convert a GT table to Rich table and display it in the terminal.
284
+ def _rich_print_scan_table(
285
+ scan_result: Any,
286
+ data_source: str,
287
+ source_type: str,
288
+ table_type: str,
289
+ total_rows: int | None = None,
290
+ total_columns: int | None = None,
291
+ ) -> None:
292
+ """
293
+ Display scan results as a Rich table in the terminal with statistical measures.
211
294
 
212
295
  Args:
213
- gt_table: The GT table object to display
214
- preview_info: Optional dict with preview context info:
215
- - total_rows: Total rows in the dataset
216
- - head_rows: Number of head rows shown
217
- - tail_rows: Number of tail rows shown
218
- - is_complete: Whether the entire dataset is shown
296
+ scan_result: The GT object from col_summary_tbl()
297
+ data_source: Name of the data source being scanned
298
+ source_type: Type of data source (e.g., "Pointblank dataset: small_table")
299
+ table_type: Type of table (e.g., "polars.LazyFrame")
300
+ total_rows: Total number of rows in the dataset
301
+ total_columns: Total number of columns in the dataset
219
302
  """
220
303
  try:
221
- # Try to extract the underlying data from the GT table
222
- df = None
304
+ import re
223
305
 
224
- # Great Tables stores the original data in different places depending on how it was created
225
- # Let's try multiple approaches to get the data
226
- if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
227
- df = gt_table._tbl_data
228
- elif (
229
- hasattr(gt_table, "_body")
230
- and hasattr(gt_table._body, "body")
231
- and gt_table._body.body is not None
232
- ):
233
- df = gt_table._body.body
234
- elif hasattr(gt_table, "_data") and gt_table._data is not None:
235
- df = gt_table._data
236
- elif hasattr(gt_table, "data") and gt_table.data is not None:
237
- df = gt_table.data
306
+ import narwhals as nw
307
+ from rich.box import SIMPLE_HEAD
238
308
 
239
- if df is not None:
240
- # Create a Rich table with horizontal lines
241
- from rich.box import SIMPLE_HEAD
309
+ # Extract the underlying DataFrame from the GT object
310
+ # The GT object has a _tbl_data attribute that contains the DataFrame
311
+ gt_data = scan_result._tbl_data
242
312
 
243
- # Create enhanced title if preview_info contains metadata
244
- table_title = None
245
- if preview_info and "source_type" in preview_info and "table_type" in preview_info:
246
- source_type = preview_info["source_type"]
247
- table_type = preview_info["table_type"]
248
- table_title = f"Data Preview / {source_type} / {table_type}"
313
+ # Convert to Narwhals DataFrame for consistent handling
314
+ nw_data = nw.from_native(gt_data)
249
315
 
250
- rich_table = Table(
251
- title=table_title,
252
- show_header=True,
253
- header_style="bold magenta",
254
- box=SIMPLE_HEAD,
255
- title_style="bold cyan",
256
- title_justify="left",
257
- )
316
+ # Convert to dictionary for easier access
317
+ data_dict = nw_data.to_dict(as_series=False)
258
318
 
259
- # Get column names
260
- columns = []
261
- if hasattr(df, "columns"):
262
- columns = list(df.columns)
263
- elif hasattr(df, "schema"): # pragma: no cover
264
- columns = list(df.schema.names)
265
- elif hasattr(df, "column_names"): # pragma: no cover
266
- columns = list(df.column_names)
319
+ # Create main scan table with missing data table styling
320
+ # Create a comprehensive title with data source, source type, and table type
321
+ title_text = f"Column Summary / {source_type} / {table_type}"
267
322
 
268
- if not columns: # pragma: no cover
269
- # Fallback: try to determine columns from first row
270
- try:
271
- if hasattr(df, "to_dicts") and len(df) > 0:
272
- first_dict = df.to_dicts()[0]
273
- columns = list(first_dict.keys())
274
- elif hasattr(df, "to_dict") and len(df) > 0:
275
- first_dict = df.to_dict("records")[0]
276
- columns = list(first_dict.keys())
277
- except Exception: # pragma: no cover
278
- columns = [f"Column {i + 1}" for i in range(10)] # Default fallback
323
+ # Add dimensions subtitle in gray if available
324
+ if total_rows is not None and total_columns is not None:
325
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
279
326
 
280
- # Add columns to Rich table
281
- # Handle wide tables by limiting columns displayed
282
- max_terminal_cols = 15 # Reasonable limit for terminal display
327
+ scan_table = Table(
328
+ title=title_text,
329
+ show_header=True,
330
+ header_style="bold magenta",
331
+ box=SIMPLE_HEAD,
332
+ title_style="bold cyan",
333
+ title_justify="left",
334
+ )
283
335
 
284
- # Get terminal width to adjust column behavior
285
- try:
286
- terminal_width = console.size.width
287
- # Estimate max column width based on terminal size and number of columns
288
- if len(columns) <= 5:
289
- max_col_width = min(60, terminal_width // 4)
290
- elif len(columns) <= 10:
291
- max_col_width = min(40, terminal_width // 6)
292
- else:
293
- max_col_width = min(30, terminal_width // 8)
294
- except Exception: # pragma: no cover
295
- # Fallback if we can't get terminal width
296
- max_col_width = 40 if len(columns) <= 10 else 25
336
+ # Add columns with specific styling and appropriate widths
337
+ scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
338
+ scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
339
+ scan_table.add_column(
340
+ "NA", style="red", width=6, justify="right"
341
+ ) # Adjusted for better formatting
342
+ scan_table.add_column(
343
+ "UQ", style="green", width=8, justify="right"
344
+ ) # Adjusted for boolean values
297
345
 
298
- if len(columns) > max_terminal_cols:
299
- # For wide tables, show first few, middle indicator, and last few columns
300
- first_cols = 7
301
- last_cols = 7
346
+ # Add statistical columns if they exist with appropriate widths
347
+ stat_columns = []
348
+ column_mapping = {
349
+ "mean": ("Mean", "blue", 9),
350
+ "std": ("SD", "blue", 9),
351
+ "min": ("Min", "yellow", 9),
352
+ "median": ("Med", "yellow", 9),
353
+ "max": ("Max", "yellow", 9),
354
+ "q_1": ("Q₁", "magenta", 8),
355
+ "q_3": ("Q₃", "magenta", 9),
356
+ "iqr": ("IQR", "magenta", 8),
357
+ }
302
358
 
303
- display_columns = columns[:first_cols] + ["...more..."] + columns[-last_cols:]
359
+ for col_key, (display_name, color, width) in column_mapping.items():
360
+ if col_key in data_dict:
361
+ scan_table.add_column(display_name, style=color, width=width, justify="right")
362
+ stat_columns.append(col_key)
304
363
 
305
- console.print(
306
- f"\n[yellow]⚠ Table has {len(columns)} columns. Showing first {first_cols} and last {last_cols} columns.[/yellow]"
307
- )
308
- console.print("[dim]Use --columns to specify which columns to display.[/dim]")
309
- console.print(
310
- f"[dim]Full column list: {', '.join(columns[:5])}...{', '.join(columns[-5:])}[/dim]\n"
311
- )
364
+ # Helper function to extract column name and type from HTML
365
+ def extract_column_info(html_content: str) -> tuple[str, str]:
366
+ """Extract column name and type from HTML formatted content."""
367
+ # Extract column name from first div
368
+ name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
369
+ column_name = name_match.group(1) if name_match else "Unknown"
370
+
371
+ # Extract data type from second div (with gray color)
372
+ type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
373
+ if type_match:
374
+ data_type = type_match.group(1)
375
+ # Convert to compact format using the existing function
376
+ compact_type = _format_dtype_compact(data_type)
377
+ data_type = compact_type
312
378
  else:
313
- display_columns = columns
379
+ data_type = "unknown"
314
380
 
315
- # Get data types for columns
316
- dtypes_dict = _get_column_dtypes(df, columns)
381
+ return column_name, data_type
317
382
 
318
- # Calculate row number column width if needed
319
- row_num_width = 6 # Default width
320
- if "_row_num_" in columns:
321
- try:
322
- # Get the maximum row number to calculate appropriate width
323
- if hasattr(df, "to_dicts"):
324
- data_dict = df.to_dicts()
325
- if data_dict:
326
- row_nums = [row.get("_row_num_", 0) for row in data_dict]
327
- max_row_num = max(row_nums) if row_nums else 0
328
- row_num_width = max(len(str(max_row_num)) + 1, 6) # +1 for padding
329
- elif hasattr(df, "to_dict"):
330
- data_dict = df.to_dict("records")
331
- if data_dict:
332
- row_nums = [row.get("_row_num_", 0) for row in data_dict]
333
- max_row_num = max(row_nums) if row_nums else 0
334
- row_num_width = max(len(str(max_row_num)) + 1, 6) # +1 for padding
335
- except Exception: # pragma: no cover
336
- # If we can't determine max row number, use default
337
- row_num_width = 8 # Slightly larger default for safety
383
+ # Helper function to format values with improved number formatting
384
+ def format_value(
385
+ value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
386
+ ) -> str:
387
+ """Format values for display with smart number formatting and HTML cleanup."""
388
+ if value is None or (isinstance(value, str) and value.strip() == ""):
389
+ return "[dim]—[/dim]"
338
390
 
339
- for i, col in enumerate(display_columns):
340
- if col == "...more...":
341
- # Add a special indicator column
342
- rich_table.add_column("···", style="dim", width=3, no_wrap=True)
343
- else:
344
- # Handle row number column specially
345
- if col == "_row_num_":
346
- # Row numbers get no header, right alignment, and dim gray style
347
- # Use dynamic width to prevent truncation
348
- rich_table.add_column(
349
- "", style="dim", justify="right", no_wrap=True, width=row_num_width
350
- )
351
- else:
352
- display_col = str(col)
391
+ # Handle missing values indicator
392
+ if is_missing and str(value) == "0":
393
+ return "[green]●[/green]" # No missing values
353
394
 
354
- # Get data type for this column (if available)
355
- if col in dtypes_dict:
356
- dtype_display = f"<{dtypes_dict[col]}>"
357
- # Create header with column name and data type
358
- header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
359
- else:
360
- header_text = display_col
395
+ # Clean up HTML formatting from the raw data
396
+ str_val = str(value)
361
397
 
362
- rich_table.add_column(
363
- header_text,
364
- style="cyan",
365
- no_wrap=False,
366
- overflow="ellipsis",
367
- max_width=max_col_width,
368
- )
398
+ # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
399
+ if "<br>" in str_val:
400
+ str_val = str_val.split("<br>")[0].strip()
401
+ # For unique values, we want just the integer part
402
+ if is_unique:
403
+ try:
404
+ # Try to extract just the integer part for unique counts
405
+ num_val = float(str_val)
406
+ return str(int(num_val))
407
+ except (ValueError, TypeError):
408
+ pass
369
409
 
370
- # Convert data to list of rows
371
- rows = []
372
- try:
373
- if hasattr(df, "to_dicts"):
374
- # Polars interface
375
- data_dict = df.to_dicts()
410
+ # Now handle HTML content (especially from boolean unique values)
411
+ if "<" in str_val and ">" in str_val:
412
+ # Remove HTML tags completely for cleaner display
413
+ str_val = re.sub(r"<[^>]+>", "", str_val).strip()
414
+ # Clean up extra whitespace
415
+ str_val = re.sub(r"\s+", " ", str_val).strip()
416
+
417
+ # Handle values like "2<.01" - extract the first number
418
+ if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
419
+ # Extract number before the < symbol
420
+ before_lt = str_val.split("<")[0].strip()
421
+ if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
422
+ str_val = before_lt
423
+
424
+ # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
425
+ if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
426
+ # Extract T and F values
427
+ t_match = re.search(r"T(\d+\.\d+)", str_val)
428
+ f_match = re.search(r"F(\d+\.\d+)", str_val)
429
+ if t_match and f_match:
430
+ t_val = float(t_match.group(1))
431
+ f_val = float(f_match.group(1))
432
+ # Show as "T0.62F0.38" but truncated if needed
433
+ formatted = f"T{t_val:.2f}F{f_val:.2f}"
434
+ if len(formatted) > max_width:
435
+ # Truncate to fit, showing dominant value
436
+ if t_val > f_val:
437
+ return f"T{t_val:.1f}"
438
+ else:
439
+ return f"F{f_val:.1f}"
440
+ return formatted
441
+
442
+ # Try to parse as a number for better formatting
443
+ try:
444
+ # Try to convert to float first
445
+ num_val = float(str_val)
446
+
447
+ # Handle special cases
448
+ if num_val == 0:
449
+ return "0"
450
+ elif abs(num_val) == int(abs(num_val)) and abs(num_val) < 10000:
451
+ # Simple integers under 10000
452
+ return str(int(num_val))
453
+ elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
454
+ # Likely dates in YYYYMMDD format - format as date-like
455
+ int_val = int(num_val)
456
+ if 19000101 <= int_val <= 29991231: # Reasonable date range
457
+ str_date = str(int_val)
458
+ if len(str_date) == 8:
459
+ return (
460
+ f"{str_date[:4]}-{str_date[4:6]}-{str_date[6:]}"[: max_width - 1]
461
+ + "…"
462
+ )
463
+ # Otherwise treat as large number
464
+ return f"{num_val / 1000000:.1f}M"
465
+ elif abs(num_val) >= 1000000:
466
+ # Large numbers - use scientific notation or M/k notation
467
+
468
+ if abs(num_val) >= 1000000000:
469
+ return f"{num_val:.1e}"
470
+ else:
471
+ return f"{num_val / 1000000:.1f}M"
472
+ elif abs(num_val) >= 10000:
473
+ # Numbers >= 10k - use compact notation
474
+ return f"{num_val / 1000:.1f}k"
475
+ elif abs(num_val) >= 100:
476
+ # Numbers 100-9999 - show with minimal decimals
477
+ return f"{num_val:.1f}"
478
+ elif abs(num_val) >= 10:
479
+ # Numbers 10-99 - show with one decimal
480
+ return f"{num_val:.1f}"
481
+ elif abs(num_val) >= 1:
482
+ # Numbers 1-9 - show with two decimals
483
+ return f"{num_val:.2f}"
484
+ elif abs(num_val) >= 0.01:
485
+ # Small numbers - show with appropriate precision
486
+ return f"{num_val:.2f}"
487
+ else:
488
+ # Very small numbers - use scientific notation
489
+
490
+ return f"{num_val:.1e}"
491
+
492
+ except (ValueError, TypeError):
493
+ # Not a number, handle as string
494
+ pass
495
+
496
+ # Handle date/datetime strings - show abbreviated format
497
+ if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
498
+ # Likely a date/datetime, show abbreviated
499
+ if len(str_val) > max_width:
500
+ return str_val[: max_width - 1] + "…"
501
+
502
+ # General string truncation with ellipsis
503
+ if len(str_val) > max_width:
504
+ return str_val[: max_width - 1] + "…"
505
+
506
+ return str_val
507
+
508
+ # Populate table rows
509
+ num_rows = len(data_dict["colname"])
510
+ for i in range(num_rows):
511
+ row_data = []
512
+
513
+ # Column name and type from HTML content
514
+ colname_html = data_dict["colname"][i]
515
+ column_name, data_type = extract_column_info(colname_html)
516
+ row_data.append(column_name)
517
+ row_data.append(data_type)
518
+
519
+ # Missing values (NA)
520
+ missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
521
+ row_data.append(format_value(missing_val, is_missing=True, max_width=6))
522
+
523
+ # Unique values (UQ)
524
+ unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
525
+ row_data.append(format_value(unique_val, is_unique=True, max_width=8))
526
+
527
+ # Statistical columns
528
+ for stat_col in stat_columns:
529
+ stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
530
+ # Use appropriate width based on column type
531
+ if stat_col in ["q_1", "iqr"]:
532
+ width = 8
533
+ elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
534
+ width = 9
535
+ else:
536
+ width = 8
537
+ row_data.append(format_value(stat_val, max_width=width))
538
+
539
+ scan_table.add_row(*row_data)
540
+
541
+ # Display the results
542
+ console.print()
543
+ console.print(scan_table)
544
+
545
+ except Exception as e:
546
+ # Fallback to simple message if table creation fails
547
+ console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
548
+ console.print(f"[red]Error displaying table: {str(e)}[/red]")
549
+
550
+
551
+ def _rich_print_gt_table(
552
+ gt_table: Any, preview_info: dict | None = None, show_summary: bool = True
553
+ ) -> None:
554
+ """Convert a GT table to Rich table and display it in the terminal.
555
+
556
+ Args:
557
+ gt_table: The GT table object to display
558
+ preview_info: Optional dict with preview context info:
559
+ - total_rows: Total rows in the dataset
560
+ - head_rows: Number of head rows shown
561
+ - tail_rows: Number of tail rows shown
562
+ - is_complete: Whether the entire dataset is shown
563
+ show_summary: Whether to show the row count summary at the bottom
564
+ """
565
+ try:
566
+ # Try to extract the underlying data from the GT table
567
+ df = None
568
+
569
+ # Great Tables stores the original data in different places depending on how it was created
570
+ # Let's try multiple approaches to get the data
571
+ if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
572
+ df = gt_table._tbl_data
573
+ elif (
574
+ hasattr(gt_table, "_body")
575
+ and hasattr(gt_table._body, "body")
576
+ and gt_table._body.body is not None
577
+ ):
578
+ df = gt_table._body.body
579
+ elif hasattr(gt_table, "_data") and gt_table._data is not None:
580
+ df = gt_table._data
581
+ elif hasattr(gt_table, "data") and gt_table.data is not None:
582
+ df = gt_table.data
583
+
584
+ if df is not None:
585
+ # Create a Rich table with horizontal lines
586
+ from rich.box import SIMPLE_HEAD
587
+
588
+ # Create enhanced title if preview_info contains metadata
589
+ table_title = None
590
+ if preview_info and "source_type" in preview_info and "table_type" in preview_info:
591
+ source_type = preview_info["source_type"]
592
+ table_type = preview_info["table_type"]
593
+ table_title = f"Data Preview / {source_type} / {table_type}"
594
+
595
+ rich_table = Table(
596
+ title=table_title,
597
+ show_header=True,
598
+ header_style="bold magenta",
599
+ box=SIMPLE_HEAD,
600
+ title_style="bold cyan",
601
+ title_justify="left",
602
+ )
603
+
604
+ # Get column names
605
+ columns = []
606
+ if hasattr(df, "columns"):
607
+ columns = list(df.columns)
608
+ elif hasattr(df, "schema"): # pragma: no cover
609
+ columns = list(df.schema.names)
610
+ elif hasattr(df, "column_names"): # pragma: no cover
611
+ columns = list(df.column_names)
612
+
613
+ if not columns: # pragma: no cover
614
+ # Fallback: try to determine columns from first row
615
+ try:
616
+ if hasattr(df, "to_dicts") and len(df) > 0:
617
+ first_dict = df.to_dicts()[0]
618
+ columns = list(first_dict.keys())
619
+ elif hasattr(df, "to_dict") and len(df) > 0:
620
+ first_dict = df.to_dict("records")[0]
621
+ columns = list(first_dict.keys())
622
+ except Exception: # pragma: no cover
623
+ columns = [f"Column {i + 1}" for i in range(10)] # Default fallback
624
+
625
+ # Add columns to Rich table
626
+ # Handle wide tables by limiting columns displayed
627
+ max_terminal_cols = 15 # Reasonable limit for terminal display
628
+
629
+ # Get terminal width to adjust column behavior
630
+ try:
631
+ terminal_width = console.size.width
632
+ # Estimate max column width based on terminal size and number of columns
633
+ if len(columns) <= 5:
634
+ max_col_width = min(60, terminal_width // 4)
635
+ elif len(columns) <= 10:
636
+ max_col_width = min(40, terminal_width // 6)
637
+ else:
638
+ max_col_width = min(30, terminal_width // 8)
639
+ except Exception: # pragma: no cover
640
+ # Fallback if we can't get terminal width
641
+ max_col_width = 40 if len(columns) <= 10 else 25
642
+
643
+ if len(columns) > max_terminal_cols:
644
+ # For wide tables, show first few, middle indicator, and last few columns
645
+ first_cols = 7
646
+ last_cols = 7
647
+
648
+ display_columns = columns[:first_cols] + ["...more..."] + columns[-last_cols:]
649
+
650
+ console.print(
651
+ f"\n[yellow]⚠ Table has {len(columns)} columns. Showing first {first_cols} and last {last_cols} columns.[/yellow]"
652
+ )
653
+ console.print("[dim]Use --columns to specify which columns to display.[/dim]")
654
+ console.print(
655
+ f"[dim]Full column list: {', '.join(columns[:5])}...{', '.join(columns[-5:])}[/dim]\n"
656
+ )
657
+ else:
658
+ display_columns = columns
659
+
660
+ # Get data types for columns
661
+ dtypes_dict = _get_column_dtypes(df, columns)
662
+
663
+ # Calculate row number column width if needed
664
+ row_num_width = 6 # Default width
665
+ if "_row_num_" in columns:
666
+ try:
667
+ # Get the maximum row number to calculate appropriate width
668
+ if hasattr(df, "to_dicts"):
669
+ data_dict = df.to_dicts()
670
+ if data_dict:
671
+ row_nums = [row.get("_row_num_", 0) for row in data_dict]
672
+ max_row_num = max(row_nums) if row_nums else 0
673
+ row_num_width = max(len(str(max_row_num)) + 1, 6) # +1 for padding
674
+ elif hasattr(df, "to_dict"):
675
+ data_dict = df.to_dict("records")
676
+ if data_dict:
677
+ row_nums = [row.get("_row_num_", 0) for row in data_dict]
678
+ max_row_num = max(row_nums) if row_nums else 0
679
+ row_num_width = max(len(str(max_row_num)) + 1, 6) # +1 for padding
680
+ except Exception: # pragma: no cover
681
+ # If we can't determine max row number, use default
682
+ row_num_width = 8 # Slightly larger default for safety
683
+
684
+ for i, col in enumerate(display_columns):
685
+ if col == "...more...":
686
+ # Add a special indicator column
687
+ rich_table.add_column("···", style="dim", width=3, no_wrap=True)
688
+ else:
689
+ # Handle row number column specially
690
+ if col == "_row_num_":
691
+ # Row numbers get no header, right alignment, and dim gray style
692
+ # Use dynamic width to prevent truncation
693
+ rich_table.add_column(
694
+ "", style="dim", justify="right", no_wrap=True, width=row_num_width
695
+ )
696
+ else:
697
+ display_col = str(col)
698
+
699
+ # Get data type for this column (if available)
700
+ if col in dtypes_dict:
701
+ dtype_display = f"<{dtypes_dict[col]}>"
702
+ # Create header with column name and data type
703
+ header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
704
+ else:
705
+ header_text = display_col
706
+
707
+ rich_table.add_column(
708
+ header_text,
709
+ style="cyan",
710
+ no_wrap=False,
711
+ overflow="ellipsis",
712
+ max_width=max_col_width,
713
+ )
714
+
715
+ # Convert data to list of rows
716
+ rows = []
717
+ try:
718
+ if hasattr(df, "to_dicts"):
719
+ # Polars interface
720
+ data_dict = df.to_dicts()
376
721
  if len(columns) > max_terminal_cols:
377
722
  # For wide tables, extract only the displayed columns
378
723
  display_data_columns = (
@@ -519,44 +864,45 @@ def _rich_print_gt_table(gt_table: Any, preview_info: dict | None = None) -> Non
519
864
  console.print()
520
865
  console.print(rich_table)
521
866
 
522
- # Show summary info
523
- total_rows = len(rows)
867
+ # Show summary info (conditionally)
868
+ if show_summary:
869
+ total_rows = len(rows)
524
870
 
525
- # Use preview info if available, otherwise fall back to old logic
526
- if preview_info:
527
- total_dataset_rows = preview_info.get("total_rows", total_rows)
528
- head_rows = preview_info.get("head_rows", 0)
529
- tail_rows = preview_info.get("tail_rows", 0)
530
- is_complete = preview_info.get("is_complete", False)
871
+ # Use preview info if available, otherwise fall back to old logic
872
+ if preview_info:
873
+ total_dataset_rows = preview_info.get("total_rows", total_rows)
874
+ head_rows = preview_info.get("head_rows", 0)
875
+ tail_rows = preview_info.get("tail_rows", 0)
876
+ is_complete = preview_info.get("is_complete", False)
531
877
 
532
- if is_complete:
533
- console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
534
- elif head_rows > 0 and tail_rows > 0:
535
- console.print(
536
- f"\n[dim]Showing first {head_rows} and last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
537
- )
538
- elif head_rows > 0:
539
- console.print(
540
- f"\n[dim]Showing first {head_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
541
- )
542
- elif tail_rows > 0:
543
- console.print(
544
- f"\n[dim]Showing last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
545
- )
546
- else:
547
- # Fallback for other cases
548
- console.print(
549
- f"\n[dim]Showing {total_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
550
- )
551
- else:
552
- # Original logic as fallback
553
- max_rows = 50 # This should match the limit used above
554
- if total_rows > max_rows:
555
- console.print(
556
- f"\n[dim]Showing first {max_rows} of {total_rows} rows. Use --output-html to see all data.[/dim]"
557
- )
878
+ if is_complete:
879
+ console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
880
+ elif head_rows > 0 and tail_rows > 0:
881
+ console.print(
882
+ f"\n[dim]Showing first {head_rows} and last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
883
+ )
884
+ elif head_rows > 0:
885
+ console.print(
886
+ f"\n[dim]Showing first {head_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
887
+ )
888
+ elif tail_rows > 0:
889
+ console.print(
890
+ f"\n[dim]Showing last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
891
+ )
892
+ else:
893
+ # Fallback for other cases
894
+ console.print(
895
+ f"\n[dim]Showing {total_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
896
+ )
558
897
  else:
559
- console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
898
+ # Original logic as fallback
899
+ max_rows = 50 # This should match the limit used above
900
+ if total_rows > max_rows:
901
+ console.print(
902
+ f"\n[dim]Showing first {max_rows} of {total_rows} rows. Use --output-html to see all data.[/dim]"
903
+ )
904
+ else:
905
+ console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
560
906
 
561
907
  else:
562
908
  # If we can't extract data, show the success message
@@ -693,73 +1039,71 @@ def _display_validation_summary(validation: Any) -> None:
693
1039
  console.print(f"[dim]{traceback.format_exc()}[/dim]") # pragma: no cover
694
1040
 
695
1041
 
696
- @click.group()
1042
+ @click.group(cls=OrderedGroup)
697
1043
  @click.version_option(version=pb.__version__, prog_name="pb")
698
1044
  def cli():
699
1045
  """
700
1046
  Pointblank CLI - Data validation and quality tools for data engineers.
701
1047
 
702
- Use this CLI to validate data, preview tables, and generate reports
1048
+ Use this CLI to run validation scripts, preview tables, and generate reports
703
1049
  directly from the command line.
704
1050
  """
705
1051
  pass
706
1052
 
707
1053
 
708
1054
  @cli.command()
709
- def datasets():
710
- """
711
- List available built-in datasets.
1055
+ @click.argument("data_source", type=str)
1056
+ def info(data_source: str):
712
1057
  """
713
- datasets_info = [
714
- ("small_table", "13 rows × 8 columns", "Small demo dataset for testing"),
715
- ("game_revenue", "2,000 rows × 11 columns", "Game development company revenue data"),
716
- ("nycflights", "336,776 rows × 18 columns", "NYC airport flights data from 2013"),
717
- ("global_sales", "50,000 rows × 20 columns", "Global sales data across regions"),
718
- ]
1058
+ Display information about a data source.
719
1059
 
720
- table = Table(
721
- title="Available Pointblank Datasets", show_header=True, header_style="bold magenta"
722
- )
723
- table.add_column("Dataset Name", style="cyan", no_wrap=True)
724
- table.add_column("Dimensions", style="green")
725
- table.add_column("Description", style="white")
1060
+ Shows table type, dimensions, column names, and data types.
726
1061
 
727
- for name, dims, desc in datasets_info:
728
- table.add_row(name, dims, desc)
1062
+ DATA_SOURCE can be:
729
1063
 
730
- console.print(table)
731
- console.print("\n[dim]Use these dataset names directly with any pb CLI command.[/dim]")
732
- console.print("[dim]Example: pb preview small_table[/dim]")
1064
+ \b
1065
+ - CSV file path (e.g., data.csv)
1066
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1067
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1068
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1069
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1070
+ """
1071
+ try:
1072
+ with console.status("[bold green]Loading data..."):
1073
+ # Load the data source using the centralized function
1074
+ data = _load_data_source(data_source)
733
1075
 
1076
+ # Get table information
1077
+ tbl_type = _get_tbl_type(data)
1078
+ row_count = pb.get_row_count(data)
1079
+ col_count = pb.get_column_count(data)
734
1080
 
735
- @cli.command()
736
- def requirements():
737
- """
738
- Check installed dependencies and their availability.
739
- """
740
- dependencies = [
741
- ("polars", "Polars DataFrame support"),
742
- ("pandas", "Pandas DataFrame support"),
743
- ("ibis", "Ibis backend support (DuckDB, etc.)"),
744
- ("duckdb", "DuckDB database support"),
745
- ("pyarrow", "Parquet file support"),
746
- ]
1081
+ # Import the box style
1082
+ from rich.box import SIMPLE_HEAD
747
1083
 
748
- table = Table(title="Dependency Status", show_header=True, header_style="bold magenta")
749
- table.add_column("Package", style="cyan", no_wrap=True)
750
- table.add_column("Status", style="white")
751
- table.add_column("Description", style="dim")
1084
+ # Create info table
1085
+ info_table = Table(
1086
+ title="Data Source Information",
1087
+ show_header=True,
1088
+ header_style="bold magenta",
1089
+ box=SIMPLE_HEAD,
1090
+ title_style="bold cyan",
1091
+ title_justify="left",
1092
+ )
1093
+ info_table.add_column("Property", style="cyan", no_wrap=True)
1094
+ info_table.add_column("Value", style="green")
752
1095
 
753
- for package, description in dependencies:
754
- if _is_lib_present(package):
755
- status = "[green]✓ Installed[/green]"
756
- else:
757
- status = "[red]✗ Not installed[/red]"
1096
+ info_table.add_row("Source", data_source)
1097
+ info_table.add_row("Table Type", tbl_type)
1098
+ info_table.add_row("Rows", f"{row_count:,}")
1099
+ info_table.add_row("Columns", f"{col_count:,}")
758
1100
 
759
- table.add_row(package, status, description)
1101
+ console.print()
1102
+ console.print(info_table)
760
1103
 
761
- console.print(table)
762
- console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
1104
+ except Exception as e:
1105
+ console.print(f"[red]Error:[/red] {e}")
1106
+ sys.exit(1)
763
1107
 
764
1108
 
765
1109
  @cli.command()
@@ -799,6 +1143,7 @@ def preview(
799
1143
  \b
800
1144
  - CSV file path (e.g., data.csv)
801
1145
  - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1146
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
802
1147
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
803
1148
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
804
1149
 
@@ -816,14 +1161,10 @@ def preview(
816
1161
  """
817
1162
  try:
818
1163
  with console.status("[bold green]Loading data..."):
819
- # Try to load as a pointblank dataset first
820
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
821
- data = pb.load_dataset(data_source)
822
- console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
823
- else:
824
- # Assume it's a file path or connection string
825
- data = data_source
826
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1164
+ # Load the data source using the centralized function
1165
+ data = _load_data_source(data_source)
1166
+
1167
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
827
1168
 
828
1169
  # Parse columns if provided
829
1170
  columns_list = None
@@ -832,18 +1173,8 @@ def preview(
832
1173
 
833
1174
  # If data has _row_num_ and it's not explicitly included, add it at the beginning
834
1175
  try:
835
- from pointblank.validate import (
836
- _process_connection_string,
837
- _process_csv_input,
838
- _process_parquet_input,
839
- )
840
-
841
- # Process the data source to get actual data object to check for _row_num_
1176
+ # Data is already processed, just use it directly
842
1177
  processed_data = data
843
- if isinstance(data, str):
844
- processed_data = _process_connection_string(data)
845
- processed_data = _process_csv_input(processed_data)
846
- processed_data = _process_parquet_input(processed_data)
847
1178
 
848
1179
  # Get column names from the processed data
849
1180
  all_columns = []
@@ -860,19 +1191,8 @@ def preview(
860
1191
  pass
861
1192
  elif col_range or col_first or col_last:
862
1193
  # Need to get column names to apply range/first/last selection
863
- # Load the data to get column names
864
- from pointblank.validate import (
865
- _process_connection_string,
866
- _process_csv_input,
867
- _process_parquet_input,
868
- )
869
-
870
- # Process the data source to get actual data object
1194
+ # Data is already processed, just use it directly
871
1195
  processed_data = data
872
- if isinstance(data, str):
873
- processed_data = _process_connection_string(data)
874
- processed_data = _process_csv_input(processed_data)
875
- processed_data = _process_parquet_input(processed_data)
876
1196
 
877
1197
  # Get column names from the processed data
878
1198
  all_columns = []
@@ -934,20 +1254,11 @@ def preview(
934
1254
  with console.status("[bold green]Generating preview..."):
935
1255
  # Get total dataset size before preview and gather metadata
936
1256
  try:
937
- # Process the data to get the actual data object for row count and metadata
938
- from pointblank.validate import (
939
- _process_connection_string,
940
- _process_csv_input,
941
- _process_parquet_input,
942
- )
943
-
1257
+ # Data is already processed, just use it directly
944
1258
  processed_data = data
945
- if isinstance(data, str):
946
- processed_data = _process_connection_string(data)
947
- processed_data = _process_csv_input(processed_data)
948
- processed_data = _process_parquet_input(processed_data)
949
1259
 
950
1260
  total_dataset_rows = pb.get_row_count(processed_data)
1261
+ total_dataset_columns = pb.get_column_count(processed_data)
951
1262
 
952
1263
  # Determine source type and table type for enhanced preview title
953
1264
  if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
@@ -959,6 +1270,7 @@ def preview(
959
1270
  except Exception:
960
1271
  # If we can't get metadata, set defaults
961
1272
  total_dataset_rows = None
1273
+ total_dataset_columns = None
962
1274
  source_type = f"Data source: {data_source}"
963
1275
  table_type = "unknown"
964
1276
 
@@ -989,6 +1301,7 @@ def preview(
989
1301
 
990
1302
  preview_info = {
991
1303
  "total_rows": total_dataset_rows,
1304
+ "total_columns": total_dataset_columns,
992
1305
  "head_rows": head,
993
1306
  "tail_rows": tail,
994
1307
  "is_complete": is_complete,
@@ -1003,71 +1316,6 @@ def preview(
1003
1316
  sys.exit(1) # pragma: no cover
1004
1317
 
1005
1318
 
1006
- @cli.command()
1007
- @click.argument("data_source", type=str)
1008
- def info(data_source: str):
1009
- """
1010
- Display information about a data source.
1011
-
1012
- Shows table type, dimensions, column names, and data types.
1013
- """
1014
- try:
1015
- with console.status("[bold green]Loading data..."):
1016
- # Try to load as a pointblank dataset first
1017
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1018
- data = pb.load_dataset(data_source)
1019
- source_type = f"Pointblank dataset: {data_source}"
1020
- console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
1021
- else:
1022
- # Assume it's a file path or connection string
1023
- data = data_source
1024
- source_type = f"External source: {data_source}"
1025
-
1026
- # Process the data to get actual table object for inspection
1027
- from pointblank.validate import (
1028
- _process_connection_string,
1029
- _process_csv_input,
1030
- _process_parquet_input,
1031
- )
1032
-
1033
- data = _process_connection_string(data)
1034
- data = _process_csv_input(data)
1035
- data = _process_parquet_input(data)
1036
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1037
-
1038
- # Get table information
1039
- tbl_type = _get_tbl_type(data)
1040
- row_count = pb.get_row_count(data)
1041
- col_count = pb.get_column_count(data)
1042
-
1043
- # Import the box style for consistent styling with scan table
1044
- from rich.box import SIMPLE_HEAD
1045
-
1046
- # Create info table with same styling as scan table
1047
- info_table = Table(
1048
- title="Data Source Information",
1049
- show_header=True,
1050
- header_style="bold magenta",
1051
- box=SIMPLE_HEAD,
1052
- title_style="bold cyan",
1053
- title_justify="left",
1054
- )
1055
- info_table.add_column("Property", style="cyan", no_wrap=True)
1056
- info_table.add_column("Value", style="green")
1057
-
1058
- info_table.add_row("Source", source_type)
1059
- info_table.add_row("Table Type", tbl_type)
1060
- info_table.add_row("Rows", f"{row_count:,}")
1061
- info_table.add_row("Columns", f"{col_count:,}")
1062
-
1063
- console.print()
1064
- console.print(info_table)
1065
-
1066
- except Exception as e:
1067
- console.print(f"[red]Error:[/red] {e}")
1068
- sys.exit(1)
1069
-
1070
-
1071
1319
  @cli.command()
1072
1320
  @click.argument("data_source", type=str)
1073
1321
  @click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
@@ -1093,6 +1341,7 @@ def scan(
1093
1341
  \b
1094
1342
  - CSV file path (e.g., data.csv)
1095
1343
  - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1344
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1096
1345
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1097
1346
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1098
1347
  """
@@ -1102,14 +1351,10 @@ def scan(
1102
1351
  start_time = time.time()
1103
1352
 
1104
1353
  with console.status("[bold green]Loading data..."):
1105
- # Try to load as a pointblank dataset first
1106
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1107
- data = pb.load_dataset(data_source)
1108
- console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
1109
- else:
1110
- # Assume it's a file path or connection string
1111
- data = data_source
1112
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1354
+ # Load the data source using the centralized function
1355
+ data = _load_data_source(data_source)
1356
+
1357
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1113
1358
 
1114
1359
  # Parse columns if provided
1115
1360
  columns_list = None
@@ -1119,35 +1364,22 @@ def scan(
1119
1364
  # Generate data scan
1120
1365
  with console.status("[bold green]Generating data scan..."):
1121
1366
  # Use col_summary_tbl for comprehensive column scanning
1367
+ # Data is already processed by _load_data_source
1368
+ scan_result = pb.col_summary_tbl(data=data)
1369
+
1122
1370
  if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1123
- # For pointblank datasets, data is already the loaded dataframe
1124
- scan_result = pb.col_summary_tbl(data=data)
1125
1371
  source_type = f"Pointblank dataset: {data_source}"
1126
- table_type = _get_tbl_type(data)
1127
- # Get row count for footer
1128
- try:
1129
- total_rows = pb.get_row_count(data)
1130
- except Exception:
1131
- total_rows = None
1132
1372
  else:
1133
- # For file paths and connection strings, load the data first
1134
- from pointblank.validate import (
1135
- _process_connection_string,
1136
- _process_csv_input,
1137
- _process_parquet_input,
1138
- )
1139
-
1140
- processed_data = _process_connection_string(data)
1141
- processed_data = _process_csv_input(processed_data)
1142
- processed_data = _process_parquet_input(processed_data)
1143
- scan_result = pb.col_summary_tbl(data=processed_data)
1144
1373
  source_type = f"External source: {data_source}"
1145
- table_type = _get_tbl_type(processed_data)
1146
- # Get row count for footer
1147
- try:
1148
- total_rows = pb.get_row_count(processed_data)
1149
- except Exception:
1150
- total_rows = None
1374
+
1375
+ table_type = _get_tbl_type(data)
1376
+ # Get row count and column count for header
1377
+ try:
1378
+ total_rows = pb.get_row_count(data)
1379
+ total_columns = pb.get_column_count(data)
1380
+ except Exception:
1381
+ total_rows = None
1382
+ total_columns = None
1151
1383
 
1152
1384
  scan_time = time.time() - start_time
1153
1385
 
@@ -1167,7 +1399,7 @@ def scan(
1167
1399
  # Display detailed column summary using rich formatting
1168
1400
  try:
1169
1401
  _rich_print_scan_table(
1170
- scan_result, data_source, source_type, table_type, total_rows
1402
+ scan_result, data_source, source_type, table_type, total_rows, total_columns
1171
1403
  )
1172
1404
 
1173
1405
  except Exception as e:
@@ -1190,40 +1422,23 @@ def missing(data_source: str, output_html: str | None):
1190
1422
  \b
1191
1423
  - CSV file path (e.g., data.csv)
1192
1424
  - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1425
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1193
1426
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1194
1427
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1195
1428
  """
1196
1429
  try:
1197
1430
  with console.status("[bold green]Loading data..."):
1198
- # Try to load as a pointblank dataset first
1199
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1200
- data = pb.load_dataset(data_source)
1201
- console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
1202
- else:
1203
- # Assume it's a file path or connection string
1204
- data = data_source
1205
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1431
+ # Load the data source using the centralized function
1432
+ data = _load_data_source(data_source)
1433
+
1434
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1206
1435
 
1207
1436
  # Generate missing values table
1208
1437
  with console.status("[bold green]Analyzing missing values..."):
1209
1438
  gt_table = pb.missing_vals_tbl(data)
1210
1439
 
1211
- # Get original data for column types
1440
+ # Data is already processed, just use it directly
1212
1441
  original_data = data
1213
- if isinstance(data, str):
1214
- # Process the data to get the actual data object
1215
- from pointblank.validate import (
1216
- _process_connection_string,
1217
- _process_csv_input,
1218
- _process_parquet_input,
1219
- )
1220
-
1221
- try:
1222
- original_data = _process_connection_string(data)
1223
- original_data = _process_csv_input(original_data)
1224
- original_data = _process_parquet_input(original_data)
1225
- except Exception: # pragma: no cover
1226
- pass # Use the string data as fallback
1227
1442
 
1228
1443
  if output_html:
1229
1444
  # Save HTML to file
@@ -1239,556 +1454,505 @@ def missing(data_source: str, output_html: str | None):
1239
1454
  sys.exit(1)
1240
1455
 
1241
1456
 
1242
- @cli.command()
1243
- @click.argument("output_file", type=click.Path())
1244
- def validate_example(output_file: str):
1245
- """
1246
- Generate an example validation script.
1247
-
1248
- Creates a sample Python script showing how to use Pointblank for validation.
1249
- """
1250
- example_script = '''"""
1251
- Example Pointblank validation script.
1252
-
1253
- This script demonstrates how to create validation rules for your data.
1254
- Modify the validation rules below to match your data requirements.
1255
- """
1256
-
1257
- import pointblank as pb
1258
-
1259
- # Create a validation object
1260
- # The 'data' variable is automatically provided by the CLI
1261
- validation = (
1262
- pb.Validate(
1263
- data=data,
1264
- tbl_name="Example Data",
1265
- label="CLI Validation Example",
1266
- thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
1267
- )
1268
- # Add your validation rules here
1269
- # Example rules (modify these based on your data structure):
1270
-
1271
- # Check that specific columns exist
1272
- # .col_exists(["column1", "column2"])
1273
-
1274
- # Check for null values
1275
- # .col_vals_not_null(columns="important_column")
1276
-
1277
- # Check value ranges
1278
- # .col_vals_gt(columns="amount", value=0)
1279
- # .col_vals_between(columns="score", left=0, right=100)
1280
-
1281
- # Check string patterns
1282
- # .col_vals_regex(columns="email", pattern=r"^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$")
1283
-
1284
- # Check unique values
1285
- # .col_vals_unique(columns="id")
1286
-
1287
- # Finalize the validation
1288
- .interrogate()
1289
- )
1290
-
1291
- # The validation object will be automatically used by the CLI
1292
- '''
1293
-
1294
- Path(output_file).write_text(example_script)
1295
- console.print(f"[green]✓[/green] Example validation script created: {output_file}")
1296
- console.print("\nEdit the script to add your validation rules, then run:")
1297
- console.print(f"[cyan]pb validate your_data.csv {output_file}[/cyan]")
1298
-
1299
-
1300
- @cli.command()
1457
+ @cli.command(name="validate")
1301
1458
  @click.argument("data_source", type=str)
1302
- @click.argument("validation_script", type=click.Path(exists=True))
1303
- @click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
1304
- @click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
1305
- @click.option("--fail-on-error", is_flag=True, help="Exit with non-zero code if validation fails")
1459
+ @click.option(
1460
+ "--check",
1461
+ "checks", # Changed to collect multiple values
1462
+ type=click.Choice(
1463
+ [
1464
+ "rows-distinct",
1465
+ "col-vals-not-null",
1466
+ "rows-complete",
1467
+ "col-exists",
1468
+ "col-vals-in-set",
1469
+ "col-vals-gt",
1470
+ "col-vals-ge",
1471
+ "col-vals-lt",
1472
+ "col-vals-le",
1473
+ ]
1474
+ ),
1475
+ multiple=True, # Allow multiple --check options
1476
+ help="Type of validation check to perform. Can be used multiple times for multiple checks.",
1477
+ )
1478
+ @click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
1479
+ @click.option(
1480
+ "--column",
1481
+ "columns", # Changed to collect multiple values
1482
+ multiple=True, # Allow multiple --column options
1483
+ help="Column name or integer position as #N (1-based index) for validation.",
1484
+ )
1485
+ @click.option(
1486
+ "--set",
1487
+ "sets", # Changed to collect multiple values
1488
+ multiple=True, # Allow multiple --set options
1489
+ help="Comma-separated allowed values for col-vals-in-set checks.",
1490
+ )
1491
+ @click.option(
1492
+ "--value",
1493
+ "values", # Changed to collect multiple values
1494
+ type=float,
1495
+ multiple=True, # Allow multiple --value options
1496
+ help="Numeric value for comparison checks.",
1497
+ )
1498
+ @click.option(
1499
+ "--show-extract", is_flag=True, help="Show extract of failing rows if validation fails"
1500
+ )
1501
+ @click.option(
1502
+ "--write-extract", type=str, help="Save failing rows to folder. Provide base name for folder."
1503
+ )
1504
+ @click.option(
1505
+ "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
1506
+ )
1507
+ @click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
1508
+ @click.pass_context
1306
1509
  def validate(
1510
+ ctx: click.Context,
1307
1511
  data_source: str,
1308
- validation_script: str,
1309
- output_html: str | None,
1310
- output_json: str | None,
1311
- fail_on_error: bool,
1512
+ checks: tuple[str, ...], # Changed to tuple
1513
+ columns: tuple[str, ...], # Changed to tuple
1514
+ sets: tuple[str, ...], # Changed to tuple
1515
+ values: tuple[float, ...], # Changed to tuple
1516
+ show_extract: bool,
1517
+ write_extract: str | None,
1518
+ limit: int,
1519
+ exit_code: bool,
1520
+ list_checks: bool,
1312
1521
  ):
1313
1522
  """
1314
- Run validation using a Python validation script.
1523
+ Perform single or multiple data validations.
1524
+
1525
+ Run one or more validation checks on your data in a single command.
1526
+ Use multiple --check options to perform multiple validations.
1315
1527
 
1316
1528
  DATA_SOURCE can be:
1317
1529
 
1318
1530
  \b
1319
1531
  - CSV file path (e.g., data.csv)
1320
1532
  - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1533
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1321
1534
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1322
1535
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1323
1536
 
1324
- VALIDATION_SCRIPT should be a Python file that defines validation rules.
1325
- See 'pb validate-example' for a sample script.
1326
- """
1327
- try:
1328
- with console.status("[bold green]Loading data..."):
1329
- # Try to load as a pointblank dataset first
1330
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1331
- data = pb.load_dataset(data_source)
1332
- console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
1333
- else:
1334
- # Assume it's a file path or connection string
1335
- data = data_source
1336
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1337
-
1338
- # Execute the validation script
1339
- with console.status("[bold green]Running validation..."):
1340
- # Read and execute the validation script
1341
- script_content = Path(validation_script).read_text()
1342
-
1343
- # Create a namespace with pointblank and the data
1344
- namespace = {
1345
- "pb": pb,
1346
- "pointblank": pb,
1347
- "data": data,
1348
- "__name__": "__main__",
1349
- }
1350
-
1351
- # Execute the script
1352
- try:
1353
- exec(script_content, namespace)
1354
- except Exception as e:
1355
- console.print(f"[red]Error executing validation script:[/red] {e}")
1356
- sys.exit(1)
1357
-
1358
- # Look for a validation object in the namespace
1359
- validation = None
1360
-
1361
- # Try to find the 'validation' variable specifically first
1362
- if "validation" in namespace:
1363
- validation = namespace["validation"]
1364
- else:
1365
- # Look for any validation object in the namespace
1366
- for key, value in namespace.items():
1367
- if hasattr(value, "interrogate") and hasattr(value, "validation_info"):
1368
- validation = value
1369
- break
1370
- # Also check if it's a Validate object that has been interrogated
1371
- elif str(type(value)).find("Validate") != -1:
1372
- validation = value
1373
- break
1374
-
1375
- if validation is None:
1376
- raise ValueError(
1377
- "No validation object found in script. "
1378
- "Script should create a Validate object and assign it to a variable named 'validation'."
1379
- )
1380
-
1381
- console.print("[green]✓[/green] Validation completed")
1382
-
1383
- # Display summary
1384
- _display_validation_summary(validation)
1385
-
1386
- # Save outputs
1387
- if output_html:
1388
- try:
1389
- # Get HTML representation
1390
- html_content = validation._repr_html_()
1391
- Path(output_html).write_text(html_content, encoding="utf-8")
1392
- console.print(f"[green]✓[/green] HTML report saved to: {output_html}")
1393
- except Exception as e:
1394
- console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
1395
-
1396
- if output_json:
1397
- try:
1398
- # Get JSON report
1399
- json_report = validation.get_json_report()
1400
- Path(output_json).write_text(json_report, encoding="utf-8")
1401
- console.print(f"[green]✓[/green] JSON summary saved to: {output_json}")
1402
- except Exception as e:
1403
- console.print(f"[yellow]Warning: Could not save JSON report: {e}[/yellow]")
1404
-
1405
- # Check if we should fail on error
1406
- if fail_on_error:
1407
- try:
1408
- if (
1409
- hasattr(validation, "validation_info")
1410
- and validation.validation_info is not None
1411
- ):
1412
- info = validation.validation_info
1413
- n_critical = sum(1 for step in info if step.critical)
1414
- n_error = sum(1 for step in info if step.error)
1415
-
1416
- if n_critical > 0 or n_error > 0:
1417
- severity = "critical" if n_critical > 0 else "error"
1418
- console.print(
1419
- f"[red]Exiting with error due to {severity} validation failures[/red]"
1420
- )
1421
- sys.exit(1)
1422
- except Exception as e:
1423
- console.print(
1424
- f"[yellow]Warning: Could not check validation status for fail-on-error: {e}[/yellow]"
1425
- )
1426
-
1427
- except Exception as e:
1428
- console.print(f"[red]Error:[/red] {e}")
1429
- sys.exit(1)
1430
-
1431
-
1432
- @cli.command()
1433
- @click.argument("data_source", type=str)
1434
- @click.argument("validation_script", type=click.Path(exists=True))
1435
- @click.argument("step_number", type=int)
1436
- @click.option(
1437
- "--limit", "-l", default=100, help="Maximum number of failing rows to show (default: 100)"
1438
- )
1439
- @click.option("--output-csv", type=click.Path(), help="Save failing rows to CSV file")
1440
- @click.option("--output-html", type=click.Path(), help="Save failing rows table to HTML file")
1441
- def extract(
1442
- data_source: str,
1443
- validation_script: str,
1444
- step_number: int,
1445
- limit: int,
1446
- output_csv: str | None,
1447
- output_html: str | None,
1448
- ):
1449
- """
1450
- Extract failing rows from a specific validation step.
1451
-
1452
- This command runs a validation and extracts the rows that failed
1453
- a specific validation step, which is useful for debugging data quality issues.
1454
-
1455
- DATA_SOURCE: Same as validate command
1456
- VALIDATION_SCRIPT: Path to validation script
1457
- STEP_NUMBER: The step number to extract failing rows from (1-based)
1458
- """
1459
- try:
1460
- with console.status("[bold green]Loading data..."):
1461
- # Try to load as a pointblank dataset first
1462
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1463
- data = pb.load_dataset(data_source)
1464
- console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
1465
- else:
1466
- # Assume it's a file path or connection string
1467
- data = data_source
1468
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1469
-
1470
- # Execute the validation script
1471
- with console.status("[bold green]Running validation..."):
1472
- # Read and execute the validation script
1473
- script_content = Path(validation_script).read_text()
1474
-
1475
- # Create a namespace with pointblank and the data
1476
- namespace = {
1477
- "pb": pb,
1478
- "pointblank": pb,
1479
- "data": data,
1480
- "__name__": "__main__",
1481
- }
1482
-
1483
- # Execute the script
1484
- try:
1485
- exec(script_content, namespace)
1486
- except Exception as e:
1487
- console.print(f"[red]Error executing validation script:[/red] {e}")
1488
- sys.exit(1)
1489
-
1490
- # Look for a validation object in the namespace
1491
- validation = None
1492
- if "validation" in namespace:
1493
- validation = namespace["validation"]
1494
- else:
1495
- # Look for any validation object in the namespace
1496
- for key, value in namespace.items():
1497
- if hasattr(value, "interrogate") and hasattr(value, "validation_info"):
1498
- validation = value
1499
- break
1500
- elif str(type(value)).find("Validate") != -1:
1501
- validation = value
1502
- break
1503
-
1504
- if validation is None:
1505
- raise ValueError(
1506
- "No validation object found in script. "
1507
- "Script should create a Validate object and assign it to a variable named 'validation'."
1508
- )
1509
-
1510
- console.print("[green]✓[/green] Validation completed")
1511
-
1512
- # Extract failing rows from the specified step
1513
- with console.status(f"[bold green]Extracting failing rows from step {step_number}..."):
1514
- try:
1515
- # Get the data extracts for the specific step
1516
- step_extract = validation.get_data_extracts(i=step_number, frame=True)
1537
+ AVAILABLE CHECKS:
1517
1538
 
1518
- if step_extract is None or len(step_extract) == 0:
1519
- console.print(f"[yellow]No failing rows found for step {step_number}[/yellow]")
1520
- return
1539
+ Use --list-checks to see all available validation methods with examples.
1521
1540
 
1522
- # Limit the results
1523
- if len(step_extract) > limit:
1524
- step_extract = step_extract.head(limit)
1525
- console.print(f"[yellow]Limited to first {limit} failing rows[/yellow]")
1541
+ The default check is 'rows-distinct' which checks for duplicate rows.
1526
1542
 
1527
- console.print(f"[green]✓[/green] Extracted {len(step_extract)} failing rows")
1543
+ \b
1544
+ - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
1545
+ - rows-complete: Check if all rows are complete (no missing values in any column)
1546
+ - col-exists: Check if a specific column exists in the dataset (requires --column)
1547
+ - col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
1548
+ - col-vals-gt: Check if all values in a column are greater than a threshold (requires --column and --value)
1549
+ - col-vals-ge: Check if all values in a column are greater than or equal to a threshold (requires --column and --value)
1550
+ - col-vals-lt: Check if all values in a column are less than a threshold (requires --column and --value)
1551
+ - col-vals-le: Check if all values in a column are less than or equal to a threshold (requires --column and --value)
1552
+ - col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
1528
1553
 
1529
- # Save outputs
1530
- if output_csv:
1531
- if hasattr(step_extract, "write_csv"):
1532
- step_extract.write_csv(output_csv)
1533
- else:
1534
- step_extract.to_csv(output_csv, index=False)
1535
- console.print(f"[green]✓[/green] Failing rows saved to CSV: {output_csv}")
1554
+ Examples:
1536
1555
 
1537
- if output_html:
1538
- # Create a preview of the failing rows
1539
- preview_table = pb.preview(
1540
- step_extract, n_head=min(10, len(step_extract)), n_tail=0
1541
- )
1542
- html_content = preview_table._repr_html_()
1543
- Path(output_html).write_text(html_content, encoding="utf-8")
1544
- console.print(
1545
- f"[green]✓[/green] Failing rows table saved to HTML: {output_html}"
1546
- )
1556
+ \b
1557
+ pb validate data.csv # Uses default validation (rows-distinct)
1558
+ pb validate data.csv --list-checks # Show all available checks
1559
+ pb validate data.csv --check rows-distinct
1560
+ pb validate data.csv --check rows-distinct --show-extract
1561
+ pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
1562
+ pb validate data.csv --check rows-distinct --exit-code
1563
+ pb validate data.csv --check rows-complete
1564
+ pb validate data.csv --check col-exists --column price
1565
+ pb validate data.csv --check col-vals-not-null --column email
1566
+ pb validate data.csv --check col-vals-gt --column score --value 50
1567
+ pb validate data.csv --check col-vals-in-set --column status --set "active,inactive,pending"
1568
+
1569
+ Multiple validations in one command:
1570
+ pb validate data.csv --check rows-distinct --check rows-complete
1571
+ pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
1572
+ """
1573
+ try:
1574
+ # Handle backward compatibility and parameter conversion
1575
+ import sys
1576
+
1577
+ # Convert parameter tuples to lists, handling default case
1578
+ if not checks:
1579
+ # No --check options provided, use default
1580
+ checks_list = ["rows-distinct"]
1581
+ is_using_default_check = True
1582
+ else:
1583
+ checks_list = list(checks)
1584
+ is_using_default_check = False
1547
1585
 
1548
- if not output_csv and not output_html:
1549
- # Display basic info about the failing rows
1550
- info_table = Table(
1551
- title=f"Failing Rows - Step {step_number}",
1552
- show_header=True,
1553
- header_style="bold red",
1554
- )
1555
- info_table.add_column("Property", style="cyan")
1556
- info_table.add_column("Value", style="white")
1586
+ columns_list = list(columns) if columns else []
1587
+ sets_list = list(sets) if sets else []
1588
+ values_list = list(values) if values else []
1557
1589
 
1558
- info_table.add_row("Total Failing Rows", f"{len(step_extract):,}")
1559
- info_table.add_row(
1560
- "Columns",
1561
- f"{len(step_extract.columns) if hasattr(step_extract, 'columns') else 'N/A'}",
1562
- )
1590
+ # Map parameters to checks intelligently
1591
+ mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
1592
+ checks_list, columns_list, sets_list, values_list
1593
+ )
1563
1594
 
1564
- console.print(info_table)
1565
- console.print(
1566
- "\n[dim]Use --output-csv or --output-html to save the failing rows.[/dim]"
1567
- )
1595
+ # Handle --list-checks option
1596
+ if list_checks:
1597
+ console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
1598
+ console.print()
1599
+ console.print("[bold magenta]Basic checks:[/bold magenta]")
1600
+ console.print(
1601
+ " • [bold cyan]rows-distinct[/bold cyan] Check for duplicate rows [yellow](default)[/yellow]"
1602
+ )
1603
+ console.print(
1604
+ " • [bold cyan]rows-complete[/bold cyan] Check for missing values in any column"
1605
+ )
1606
+ console.print()
1607
+ console.print(
1608
+ "[bold magenta]Column-specific checks [bright_black](require --column)[/bright_black]:[/bold magenta]"
1609
+ )
1610
+ console.print(" • [bold cyan]col-exists[/bold cyan] Check if a column exists")
1611
+ console.print(
1612
+ " • [bold cyan]col-vals-not-null[/bold cyan] Check for null values in a column"
1613
+ )
1614
+ console.print()
1615
+ console.print(
1616
+ "[bold magenta]Value comparison checks [bright_black](require --column and --value)[/bright_black]:[/bold magenta]"
1617
+ )
1618
+ console.print(
1619
+ " • [bold cyan]col-vals-gt[/bold cyan] Values greater than threshold"
1620
+ )
1621
+ console.print(
1622
+ " • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to threshold"
1623
+ )
1624
+ console.print(" • [bold cyan]col-vals-lt[/bold cyan] Values less than threshold")
1625
+ console.print(
1626
+ " • [bold cyan]col-vals-le[/bold cyan] Values less than or equal to threshold"
1627
+ )
1628
+ console.print()
1629
+ console.print(
1630
+ "[bold magenta]Set validation check [bright_black](requires --column and --set)[/bright_black]:[/bold magenta]"
1631
+ )
1632
+ console.print(
1633
+ " • [bold cyan]col-vals-in-set[/bold cyan] Values must be in allowed set"
1634
+ )
1635
+ console.print()
1636
+ console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
1637
+ console.print(
1638
+ f" [bright_blue]pb validate {data_source} --check rows-distinct[/bright_blue]"
1639
+ )
1640
+ console.print(
1641
+ f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
1642
+ )
1643
+ console.print(
1644
+ f" [bright_blue]pb validate {data_source} --check col-vals-gt --column age --value 18[/bright_blue]"
1645
+ )
1646
+ import sys
1568
1647
 
1569
- except Exception as e:
1570
- console.print(f"[red]Error extracting failing rows:[/red] {e}")
1571
- # Try to provide helpful information
1572
- if hasattr(validation, "validation_info") and validation.validation_info:
1573
- max_step = len(validation.validation_info)
1574
- console.print(f"[yellow]Available steps: 1 to {max_step}[/yellow]")
1575
-
1576
- # Show step information
1577
- steps_table = Table(title="Available Validation Steps", show_header=True)
1578
- steps_table.add_column("Step", style="cyan")
1579
- steps_table.add_column("Type", style="white")
1580
- steps_table.add_column("Column", style="green")
1581
- steps_table.add_column("Has Failures", style="yellow")
1582
-
1583
- for i, step in enumerate(validation.validation_info, 1):
1584
- has_failures = "Yes" if not step.all_passed else "No"
1585
- steps_table.add_row(
1586
- str(i),
1587
- step.assertion_type,
1588
- str(step.column) if step.column else "—",
1589
- has_failures,
1590
- )
1648
+ sys.exit(0)
1591
1649
 
1592
- console.print(steps_table)
1650
+ # Validate required parameters for different check types
1651
+ # Check parameters for each check in the list using mapped parameters
1652
+ for i, check in enumerate(checks_list):
1653
+ # Get corresponding mapped parameters for this check
1654
+ column = mapped_columns[i] if i < len(mapped_columns) else None
1655
+ set_val = mapped_sets[i] if i < len(mapped_sets) else None
1656
+ value = mapped_values[i] if i < len(mapped_values) else None
1657
+
1658
+ if check == "col-vals-not-null" and not column:
1659
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
1660
+ console.print(
1661
+ "Example: pb validate data.csv --check col-vals-not-null --column email"
1662
+ )
1593
1663
  sys.exit(1)
1594
1664
 
1595
- except Exception as e:
1596
- console.print(f"[red]Error:[/red] {e}")
1597
- sys.exit(1)
1665
+ if check == "col-exists" and not column:
1666
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
1667
+ console.print("Example: pb validate data.csv --check col-exists --column price")
1668
+ sys.exit(1)
1598
1669
 
1670
+ if check == "col-vals-in-set" and not column:
1671
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
1672
+ console.print(
1673
+ "Example: pb validate data.csv --check col-vals-in-set --column status --set 'active,inactive'"
1674
+ )
1675
+ sys.exit(1)
1599
1676
 
1600
- def _format_missing_percentage(value: float) -> str:
1601
- """Format missing value percentages for display.
1677
+ if check == "col-vals-in-set" and not set_val:
1678
+ console.print(f"[red]Error:[/red] --set is required for {check} check")
1679
+ console.print(
1680
+ "Example: pb validate data.csv --check col-vals-in-set --column status --set 'active,inactive'"
1681
+ )
1682
+ sys.exit(1)
1602
1683
 
1603
- Args:
1604
- value: The percentage value (0-100)
1684
+ if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"] and not column:
1685
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
1686
+ console.print(
1687
+ f"Example: pb validate data.csv --check {check} --column score --value 50"
1688
+ )
1689
+ sys.exit(1)
1605
1690
 
1606
- Returns:
1607
- Formatted string with proper percentage display
1608
- """
1609
- if value == 0.0:
1610
- return "[green]●[/green]" # Large green circle for no missing values
1611
- elif value == 100.0:
1612
- return "[red]●[/red]" # Large red circle for completely missing values
1613
- elif value < 1.0 and value > 0:
1614
- return "<1%" # Less than 1%
1615
- elif value > 99.0 and value < 100.0:
1616
- return ">99%" # More than 99%
1617
- else:
1618
- return f"{int(round(value))}%" # Round to nearest integer with % sign
1691
+ if (
1692
+ check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]
1693
+ and value is None
1694
+ ):
1695
+ console.print(f"[red]Error:[/red] --value is required for {check} check")
1696
+ console.print(
1697
+ f"Example: pb validate data.csv --check {check} --column score --value 50"
1698
+ )
1699
+ sys.exit(1)
1619
1700
 
1701
+ with console.status("[bold green]Loading data..."):
1702
+ # Load the data source using the centralized function
1703
+ data = _load_data_source(data_source)
1704
+
1705
+ # Get all column names for error reporting
1706
+ if hasattr(data, "columns"):
1707
+ all_columns = list(data.columns)
1708
+ elif hasattr(data, "schema"):
1709
+ all_columns = list(data.schema.names)
1710
+ else:
1711
+ all_columns = []
1620
1712
 
1621
- def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
1622
- """Convert a missing values GT table to Rich table with special formatting.
1713
+ # Resolve any '#N' column references to actual column names
1714
+ columns_list = _resolve_column_indices(columns_list, data)
1623
1715
 
1624
- Args:
1625
- gt_table: The GT table object for missing values
1626
- original_data: The original data source to extract column types
1627
- """
1628
- try:
1629
- # Extract the underlying data from the GT table
1630
- df = None
1716
+ # Check for out-of-range #N columns and provide a helpful error
1717
+ for col in columns_list:
1718
+ if isinstance(col, str) and col.startswith("#"):
1719
+ try:
1720
+ idx = int(col[1:])
1721
+ if idx < 1 or idx > len(all_columns):
1722
+ console.print(
1723
+ f"[red]Error:[/red] There is no column {idx} (the column position "
1724
+ f"range is 1 to {len(all_columns)})"
1725
+ )
1726
+ sys.exit(1)
1727
+ except Exception:
1728
+ pass # Let later validation handle other errors
1631
1729
 
1632
- if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
1633
- df = gt_table._tbl_data
1634
- elif hasattr(gt_table, "_data") and gt_table._data is not None:
1635
- df = gt_table._data
1636
- elif hasattr(gt_table, "data") and gt_table.data is not None:
1637
- df = gt_table.data
1730
+ # Update mapped_columns to use resolved column names
1731
+ mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
1732
+ checks_list, columns_list, sets_list, values_list
1733
+ )
1638
1734
 
1639
- if df is not None:
1640
- # Create a Rich table with horizontal lines
1641
- from rich.box import SIMPLE_HEAD
1735
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1642
1736
 
1643
- rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
1737
+ # Build a single validation object with chained checks
1738
+ with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
1739
+ # Initialize validation object
1740
+ validation = pb.Validate(
1741
+ data=data,
1742
+ tbl_name=f"Data from {data_source}",
1743
+ label=f"CLI Validation: {', '.join(checks_list)}",
1744
+ )
1644
1745
 
1645
- # Get column names
1646
- columns = []
1647
- try:
1648
- if hasattr(df, "columns"):
1649
- columns = list(df.columns)
1650
- elif hasattr(df, "schema"):
1651
- columns = list(df.schema.names)
1652
- except Exception as e:
1653
- console.print(f"[red]Error getting columns:[/red] {e}")
1654
- columns = []
1746
+ # Add each check to the validation chain
1747
+ for i, check in enumerate(checks_list):
1748
+ # Get corresponding mapped parameters for this check
1749
+ column = mapped_columns[i] if i < len(mapped_columns) else None
1750
+ set_val = mapped_sets[i] if i < len(mapped_sets) else None
1751
+ value = mapped_values[i] if i < len(mapped_values) else None
1655
1752
 
1656
- if not columns:
1657
- columns = [f"Column {i + 1}" for i in range(10)] # Fallback
1753
+ if check == "rows-distinct":
1754
+ validation = validation.rows_distinct()
1755
+ elif check == "col-vals-not-null":
1756
+ validation = validation.col_vals_not_null(columns=column)
1757
+ elif check == "rows-complete":
1758
+ validation = validation.rows_complete()
1759
+ elif check == "col-exists":
1760
+ validation = validation.col_exists(columns=column)
1761
+ elif check == "col-vals-in-set":
1762
+ # Parse the comma-separated set values
1763
+ allowed_values = [v.strip() for v in set_val.split(",")]
1764
+ validation = validation.col_vals_in_set(columns=column, set=allowed_values)
1765
+ elif check == "col-vals-gt":
1766
+ validation = validation.col_vals_gt(columns=column, value=value)
1767
+ elif check == "col-vals-ge":
1768
+ validation = validation.col_vals_ge(columns=column, value=value)
1769
+ elif check == "col-vals-lt":
1770
+ validation = validation.col_vals_lt(columns=column, value=value)
1771
+ elif check == "col-vals-le":
1772
+ validation = validation.col_vals_le(columns=column, value=value)
1773
+ else:
1774
+ console.print(f"[red]Error:[/red] Unknown check type: {check}")
1775
+ sys.exit(1)
1658
1776
 
1659
- # Get original data to extract column types
1660
- column_types = {}
1661
- if original_data is not None:
1662
- try:
1663
- # Get column types from original data
1664
- if hasattr(original_data, "columns"):
1665
- original_columns = list(original_data.columns)
1666
- column_types = _get_column_dtypes(original_data, original_columns)
1667
- except Exception as e:
1668
- console.print(f"[red]Error getting column types:[/red] {e}")
1669
- pass # Use empty dict as fallback
1777
+ # Execute all validations
1778
+ validation = validation.interrogate()
1779
+ all_passed = validation.all_passed()
1670
1780
 
1671
- # Add columns to Rich table with special formatting for missing values table
1672
- sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
1781
+ # Display completion message
1782
+ if len(checks_list) == 1:
1783
+ if is_using_default_check:
1784
+ console.print(
1785
+ f"[green]✓[/green] {checks_list[0]} validation completed [dim](default validation)[/dim]"
1786
+ )
1787
+ else:
1788
+ console.print(f"[green]✓[/green] {checks_list[0]} validation completed")
1789
+ else:
1790
+ console.print(f"[green]✓[/green] {len(checks_list)} validations completed")
1791
+
1792
+ # Display results based on whether we have single or multiple checks
1793
+ if len(checks_list) == 1:
1794
+ # Single check - use current display format
1795
+ _display_validation_result(
1796
+ validation,
1797
+ checks_list,
1798
+ mapped_columns,
1799
+ mapped_sets,
1800
+ mapped_values,
1801
+ data_source,
1802
+ 0,
1803
+ 1,
1804
+ show_extract,
1805
+ write_extract,
1806
+ limit,
1807
+ )
1808
+ else:
1809
+ # Multiple checks - use stacked display format
1810
+ any_failed = False
1811
+ for i in range(len(checks_list)):
1812
+ console.print() # Add spacing between results
1813
+ _display_validation_result(
1814
+ validation,
1815
+ checks_list,
1816
+ mapped_columns,
1817
+ mapped_sets,
1818
+ mapped_values,
1819
+ data_source,
1820
+ i,
1821
+ len(checks_list),
1822
+ show_extract,
1823
+ write_extract,
1824
+ limit,
1825
+ )
1673
1826
 
1674
- # Two separate columns: Column name (20 chars) and Data type (10 chars)
1675
- rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
1676
- rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
1827
+ # Check if this validation failed
1828
+ if hasattr(validation, "validation_info") and len(validation.validation_info) > i:
1829
+ step_info = validation.validation_info[i]
1830
+ if step_info.n_failed > 0:
1831
+ any_failed = True
1677
1832
 
1678
- # Sector columns: All same width, optimized for "100%" (4 chars + padding)
1679
- for sector in sector_columns:
1680
- rich_table.add_column(
1681
- sector,
1682
- style="cyan",
1683
- justify="center",
1684
- no_wrap=True,
1685
- width=5, # Fixed width optimized for percentage values
1833
+ # Show tip about --show-extract if any failed and not already used
1834
+ if any_failed and not show_extract:
1835
+ console.print()
1836
+ console.print(
1837
+ "[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
1686
1838
  )
1687
1839
 
1688
- # Convert data to rows with special formatting
1689
- rows = []
1690
- try:
1691
- if hasattr(df, "to_dicts"):
1692
- data_dict = df.to_dicts()
1693
- elif hasattr(df, "to_dict"):
1694
- data_dict = df.to_dict("records")
1695
- else:
1696
- data_dict = []
1840
+ # Add informational hints when using default validation (only for single check)
1841
+ if len(checks_list) == 1 and is_using_default_check:
1842
+ console.print()
1843
+ console.print("[bold blue]ℹ️ Information:[/bold blue] Using default validation method")
1844
+ console.print("To specify a different validation, use the --check option.")
1845
+ console.print()
1846
+ console.print("[bold magenta]Common validation options:[/bold magenta]")
1847
+ console.print(
1848
+ " • [bold cyan]--check rows-complete[/bold cyan] Check for rows with missing values"
1849
+ )
1850
+ console.print(
1851
+ " • [bold cyan]--check col-vals-not-null[/bold cyan] Check for null values in a column [bright_black](requires --column)[/bright_black]"
1852
+ )
1853
+ console.print(
1854
+ " • [bold cyan]--check col-exists[/bold cyan] Check if a column exists [bright_black](requires --column)[/bright_black]"
1855
+ )
1856
+ console.print()
1857
+ console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
1858
+ console.print(
1859
+ f" [bright_blue]pb validate {data_source} --check rows-complete[/bright_blue]"
1860
+ )
1861
+ console.print(
1862
+ f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
1863
+ )
1697
1864
 
1698
- for i, row in enumerate(data_dict):
1699
- try:
1700
- # Each row should have: [column_name, data_type, sector1, sector2, ...]
1701
- column_name = str(row.get("columns", ""))
1865
+ # Exit with appropriate code if requested
1866
+ if exit_code and not all_passed:
1867
+ console.print("[dim]Exiting with non-zero code due to validation failure[/dim]")
1868
+ import sys
1702
1869
 
1703
- # Truncate column name to 20 characters with ellipsis if needed
1704
- if len(column_name) > 20:
1705
- truncated_name = column_name[:17] + "…"
1706
- else:
1707
- truncated_name = column_name
1870
+ sys.exit(1)
1708
1871
 
1709
- # Get data type for this column
1710
- if column_name in column_types:
1711
- dtype = column_types[column_name]
1712
- if len(dtype) > 10:
1713
- truncated_dtype = dtype[:9] + "…"
1714
- else:
1715
- truncated_dtype = dtype
1716
- else:
1717
- truncated_dtype = "?"
1872
+ except Exception as e:
1873
+ console.print(f"[red]Error:[/red] {e}")
1874
+ sys.exit(1)
1718
1875
 
1719
- # Start building the row with column name and type
1720
- formatted_row = [truncated_name, truncated_dtype]
1721
1876
 
1722
- # Add sector values (formatted percentages)
1723
- for sector in sector_columns:
1724
- value = row.get(sector, 0.0)
1725
- if isinstance(value, (int, float)):
1726
- formatted_row.append(_format_missing_percentage(float(value)))
1727
- else:
1728
- formatted_row.append(str(value))
1877
+ @cli.command()
1878
+ def datasets():
1879
+ """
1880
+ List available built-in datasets.
1881
+ """
1882
+ from rich.box import SIMPLE_HEAD
1729
1883
 
1730
- rows.append(formatted_row)
1884
+ datasets_info = [
1885
+ ("small_table", "13 rows × 8 columns", "Small demo dataset for testing"),
1886
+ ("game_revenue", "2,000 rows × 11 columns", "Game development company revenue data"),
1887
+ ("nycflights", "336,776 rows × 18 columns", "NYC airport flights data from 2013"),
1888
+ ("global_sales", "50,000 rows × 20 columns", "Global sales data across regions"),
1889
+ ]
1731
1890
 
1732
- except Exception as e:
1733
- console.print(f"[red]Error processing row {i}:[/red] {e}")
1734
- continue
1891
+ table = Table(
1892
+ title="Available Pointblank Datasets", show_header=True, header_style="bold magenta"
1893
+ )
1735
1894
 
1736
- except Exception as e:
1737
- console.print(f"[red]Error extracting data:[/red] {e}")
1738
- rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
1895
+ # Create the datasets table
1896
+ table = Table(
1897
+ title="Available Pointblank Datasets",
1898
+ show_header=True,
1899
+ header_style="bold magenta",
1900
+ box=SIMPLE_HEAD,
1901
+ title_style="bold cyan",
1902
+ title_justify="left",
1903
+ )
1739
1904
 
1740
- # Add rows to Rich table
1741
- for row in rows:
1742
- try:
1743
- rich_table.add_row(*row)
1744
- except Exception as e:
1745
- console.print(f"[red]Error adding row:[/red] {e}")
1746
- break
1905
+ table.add_column("Dataset Name", style="cyan", no_wrap=True)
1906
+ table.add_column("Dimensions", style="green")
1907
+ table.add_column("Description", style="white")
1747
1908
 
1748
- # Show the table with custom spanner header if we have sector columns
1749
- if sector_columns:
1750
- # Create a custom header line that shows the spanner
1751
- header_parts = []
1752
- header_parts.append(" " * 20) # Space for Column header
1753
- header_parts.append(" " * 10) # Space for Type header
1909
+ for name, dims, desc in datasets_info:
1910
+ table.add_row(name, dims, desc)
1754
1911
 
1755
- # Left-align "Row Sectors" with the first numbered column
1756
- row_sectors_text = "Row Sectors"
1757
- header_parts.append(row_sectors_text)
1912
+ console.print(table)
1913
+ console.print("\n[dim]Use these dataset names directly with any pb CLI command.[/dim]")
1914
+ console.print("[dim]Example: pb preview small_table[/dim]")
1758
1915
 
1759
- # Print the custom spanner header
1760
- console.print("[dim]" + " ".join(header_parts) + "[/dim]")
1761
1916
 
1762
- # Add a horizontal rule below the spanner
1763
- rule_parts = []
1764
- rule_parts.append(" " * 20) # Space for Column header
1765
- rule_parts.append(" " * 10) # Space for Type header
1917
+ @cli.command()
1918
+ def requirements():
1919
+ """
1920
+ Check installed dependencies and their availability.
1921
+ """
1922
+ from rich.box import SIMPLE_HEAD
1766
1923
 
1767
- # Use a fixed width horizontal rule for "Row Sectors"
1768
- horizontal_rule = "─" * 20
1769
- rule_parts.append(horizontal_rule)
1924
+ dependencies = [
1925
+ ("polars", "Polars DataFrame support"),
1926
+ ("pandas", "Pandas DataFrame support"),
1927
+ ("ibis", "Ibis backend support (DuckDB, etc.)"),
1928
+ ("duckdb", "DuckDB database support"),
1929
+ ("pyarrow", "Parquet file support"),
1930
+ ]
1770
1931
 
1771
- # Print the horizontal rule
1772
- console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
1932
+ # Create requirements table
1933
+ table = Table(
1934
+ title="Dependency Status",
1935
+ show_header=True,
1936
+ header_style="bold magenta",
1937
+ box=SIMPLE_HEAD,
1938
+ title_style="bold cyan",
1939
+ title_justify="left",
1940
+ )
1773
1941
 
1774
- # Print the Rich table (will handle terminal width automatically)
1775
- console.print(rich_table)
1776
- footer_text = (
1777
- "[dim]Symbols: [green]●[/green] = no missing values, "
1778
- "[red]●[/red] = completely missing, "
1779
- "<1% = less than 1% missing, "
1780
- ">99% = more than 99% missing[/dim]"
1781
- )
1782
- console.print(footer_text)
1942
+ table.add_column("Package", style="cyan", no_wrap=True)
1943
+ table.add_column("Status", style="white")
1944
+ table.add_column("Description", style="dim")
1783
1945
 
1946
+ for package, description in dependencies:
1947
+ if _is_lib_present(package):
1948
+ status = "[green]✓ Installed[/green]"
1784
1949
  else:
1785
- # Fallback to regular table display
1786
- _rich_print_gt_table(gt_table)
1950
+ status = "[red]✗ Not installed[/red]"
1787
1951
 
1788
- except Exception as e:
1789
- console.print(f"[red]Error rendering missing values table:[/red] {e}")
1790
- # Fallback to regular table display
1791
- _rich_print_gt_table(gt_table)
1952
+ table.add_row(package, status, description)
1953
+
1954
+ console.print(table)
1955
+ console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
1792
1956
 
1793
1957
 
1794
1958
  def _rich_print_scan_table(
@@ -1797,6 +1961,7 @@ def _rich_print_scan_table(
1797
1961
  source_type: str,
1798
1962
  table_type: str,
1799
1963
  total_rows: int | None = None,
1964
+ total_columns: int | None = None,
1800
1965
  ) -> None:
1801
1966
  """
1802
1967
  Display scan results as a Rich table in the terminal with statistical measures.
@@ -1807,6 +1972,7 @@ def _rich_print_scan_table(
1807
1972
  source_type: Type of data source (e.g., "Pointblank dataset: small_table")
1808
1973
  table_type: Type of table (e.g., "polars.LazyFrame")
1809
1974
  total_rows: Total number of rows in the dataset
1975
+ total_columns: Total number of columns in the dataset
1810
1976
  """
1811
1977
  try:
1812
1978
  import re
@@ -1828,6 +1994,11 @@ def _rich_print_scan_table(
1828
1994
  # Create a comprehensive title with data source, source type, and table type
1829
1995
  title_text = f"Column Summary / {source_type} / {table_type}"
1830
1996
 
1997
+ # Add dimensions subtitle in gray if available
1998
+ if total_rows is not None and total_columns is not None:
1999
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2000
+
2001
+ # Create the scan table
1831
2002
  scan_table = Table(
1832
2003
  title=title_text,
1833
2004
  show_header=True,
@@ -1990,6 +2161,7 @@ def _rich_print_scan_table(
1990
2161
  return f"{num_val:.2f}"
1991
2162
  else:
1992
2163
  # Very small numbers - use scientific notation
2164
+
1993
2165
  return f"{num_val:.1e}"
1994
2166
 
1995
2167
  except (ValueError, TypeError):
@@ -2002,83 +2174,260 @@ def _rich_print_scan_table(
2002
2174
  if len(str_val) > max_width:
2003
2175
  return str_val[: max_width - 1] + "…"
2004
2176
 
2005
- # General string truncation with ellipsis
2006
- if len(str_val) > max_width:
2007
- return str_val[: max_width - 1] + "…"
2177
+ # General string truncation with ellipsis
2178
+ if len(str_val) > max_width:
2179
+ return str_val[: max_width - 1] + "…"
2180
+
2181
+ return str_val
2182
+
2183
+ # Populate table rows
2184
+ num_rows = len(data_dict["colname"])
2185
+ for i in range(num_rows):
2186
+ row_data = []
2187
+
2188
+ # Column name and type from HTML content
2189
+ colname_html = data_dict["colname"][i]
2190
+ column_name, data_type = extract_column_info(colname_html)
2191
+ row_data.append(column_name)
2192
+ row_data.append(data_type)
2193
+
2194
+ # Missing values (NA)
2195
+ missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
2196
+ row_data.append(format_value(missing_val, is_missing=True, max_width=6))
2197
+
2198
+ # Unique values (UQ)
2199
+ unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
2200
+ row_data.append(format_value(unique_val, is_unique=True, max_width=8))
2201
+
2202
+ # Statistical columns
2203
+ for stat_col in stat_columns:
2204
+ stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
2205
+ # Use appropriate width based on column type
2206
+ if stat_col in ["q_1", "iqr"]:
2207
+ width = 8
2208
+ elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
2209
+ width = 9
2210
+ else:
2211
+ width = 8
2212
+ row_data.append(format_value(stat_val, max_width=width))
2213
+
2214
+ scan_table.add_row(*row_data)
2215
+
2216
+ # Display the results
2217
+ console.print()
2218
+ console.print(scan_table)
2219
+
2220
+ except Exception as e:
2221
+ # Fallback to simple message if table creation fails
2222
+ console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
2223
+ console.print(f"[red]Error displaying table: {str(e)}[/red]")
2224
+
2225
+
2226
+ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
2227
+ """Convert a missing values GT table to Rich table with special formatting.
2228
+
2229
+ Args:
2230
+ gt_table: The GT table object for missing values
2231
+ original_data: The original data source to extract column types
2232
+ """
2233
+ try:
2234
+ # Extract the underlying data from the GT table
2235
+ df = None
2236
+
2237
+ if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
2238
+ df = gt_table._tbl_data
2239
+ elif hasattr(gt_table, "_data") and gt_table._data is not None:
2240
+ df = gt_table._data
2241
+ elif hasattr(gt_table, "data") and gt_table.data is not None:
2242
+ df = gt_table.data
2243
+
2244
+ if df is not None:
2245
+ from rich.box import SIMPLE_HEAD
2246
+
2247
+ # Create the missing values table
2248
+ rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
2249
+
2250
+ # Get column names
2251
+ columns = []
2252
+ try:
2253
+ if hasattr(df, "columns"):
2254
+ columns = list(df.columns)
2255
+ elif hasattr(df, "schema"):
2256
+ columns = list(df.schema.names)
2257
+ except Exception as e:
2258
+ console.print(f"[red]Error getting columns:[/red] {e}")
2259
+ columns = []
2260
+
2261
+ if not columns:
2262
+ columns = [f"Column {i + 1}" for i in range(10)] # Fallback
2263
+
2264
+ # Get original data to extract column types
2265
+ column_types = {}
2266
+ if original_data is not None:
2267
+ try:
2268
+ # Get column types from original data
2269
+ if hasattr(original_data, "columns"):
2270
+ original_columns = list(original_data.columns)
2271
+ column_types = _get_column_dtypes(original_data, original_columns)
2272
+ except Exception as e:
2273
+ console.print(f"[red]Error getting column types:[/red] {e}")
2274
+ pass # Use empty dict as fallback
2275
+
2276
+ # Add columns to Rich table with special formatting for missing values table
2277
+ sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
2278
+
2279
+ # Two separate columns: Column name (20 chars) and Data type (10 chars)
2280
+ rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2281
+ rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2282
+
2283
+ # Sector columns: All same width, optimized for "100%" (4 chars + padding)
2284
+ for sector in sector_columns:
2285
+ rich_table.add_column(
2286
+ sector,
2287
+ style="cyan",
2288
+ justify="center",
2289
+ no_wrap=True,
2290
+ width=5, # Fixed width optimized for percentage values
2291
+ )
2292
+
2293
+ # Convert data to rows with special formatting
2294
+ rows = []
2295
+ try:
2296
+ if hasattr(df, "to_dicts"):
2297
+ data_dict = df.to_dicts()
2298
+ elif hasattr(df, "to_dict"):
2299
+ data_dict = df.to_dict("records")
2300
+ else:
2301
+ data_dict = []
2302
+
2303
+ for i, row in enumerate(data_dict):
2304
+ try:
2305
+ # Each row should have: [column_name, data_type, sector1, sector2, ...]
2306
+ column_name = str(row.get("columns", ""))
2307
+
2308
+ # Truncate column name to 20 characters with ellipsis if needed
2309
+ if len(column_name) > 20:
2310
+ truncated_name = column_name[:17] + "…"
2311
+ else:
2312
+ truncated_name = column_name
2313
+
2314
+ # Get data type for this column
2315
+ if column_name in column_types:
2316
+ dtype = column_types[column_name]
2317
+ if len(dtype) > 10:
2318
+ truncated_dtype = dtype[:9] + "…"
2319
+ else:
2320
+ truncated_dtype = dtype
2321
+ else:
2322
+ truncated_dtype = "?"
2323
+
2324
+ # Start building the row with column name and type
2325
+ formatted_row = [truncated_name, truncated_dtype]
2326
+
2327
+ # Add sector values (formatted percentages)
2328
+ for sector in sector_columns:
2329
+ value = row.get(sector, 0.0)
2330
+ if isinstance(value, (int, float)):
2331
+ formatted_row.append(_format_missing_percentage(float(value)))
2332
+ else:
2333
+ formatted_row.append(str(value))
2334
+
2335
+ rows.append(formatted_row)
2336
+
2337
+ except Exception as e:
2338
+ console.print(f"[red]Error processing row {i}:[/red] {e}")
2339
+ continue
2340
+
2341
+ except Exception as e:
2342
+ console.print(f"[red]Error extracting data:[/red] {e}")
2343
+ rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
2344
+
2345
+ # Add rows to Rich table
2346
+ for row in rows:
2347
+ try:
2348
+ rich_table.add_row(*row)
2349
+ except Exception as e:
2350
+ console.print(f"[red]Error adding row:[/red] {e}")
2351
+ break
2352
+
2353
+ # Show the table with custom spanner header if we have sector columns
2354
+ if sector_columns:
2355
+ # Create a custom header line that shows the spanner
2356
+ header_parts = []
2357
+ header_parts.append(" " * 20) # Space for Column header
2358
+ header_parts.append(" " * 10) # Space for Type header
2359
+
2360
+ # Left-align "Row Sectors" with the first numbered column
2361
+ row_sectors_text = "Row Sectors"
2362
+ header_parts.append(row_sectors_text)
2008
2363
 
2009
- return str_val
2364
+ # Print the custom spanner header
2365
+ console.print("[dim]" + " ".join(header_parts) + "[/dim]")
2010
2366
 
2011
- # Populate table rows
2012
- num_rows = len(data_dict["colname"])
2013
- for i in range(num_rows):
2014
- row_data = []
2367
+ # Add a horizontal rule below the spanner
2368
+ rule_parts = []
2369
+ rule_parts.append(" " * 20) # Space for Column header
2370
+ rule_parts.append(" " * 10) # Space for Type header
2015
2371
 
2016
- # Column name and type from HTML content
2017
- colname_html = data_dict["colname"][i]
2018
- column_name, data_type = extract_column_info(colname_html)
2019
- row_data.append(column_name)
2020
- row_data.append(data_type)
2372
+ # Use a fixed width horizontal rule for "Row Sectors"
2373
+ horizontal_rule = "─" * 20
2374
+ rule_parts.append(horizontal_rule)
2021
2375
 
2022
- # Missing values (NA)
2023
- missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
2024
- row_data.append(format_value(missing_val, is_missing=True, max_width=6))
2376
+ # Print the horizontal rule
2377
+ console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
2025
2378
 
2026
- # Unique values (UQ)
2027
- unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
2028
- row_data.append(format_value(unique_val, is_unique=True, max_width=8))
2379
+ # Print the Rich table (will handle terminal width automatically)
2380
+ console.print(rich_table)
2381
+ footer_text = (
2382
+ "[dim]Symbols: [green]●[/green] = no missing values, "
2383
+ "[red]●[/red] = completely missing, "
2384
+ "<1% = less than 1% missing, "
2385
+ ">99% = more than 99% missing[/dim]"
2386
+ )
2387
+ console.print(footer_text)
2029
2388
 
2030
- # Statistical columns
2031
- for stat_col in stat_columns:
2032
- stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
2033
- # Use appropriate width based on column type
2034
- if stat_col in ["q_1", "iqr"]:
2035
- width = 8
2036
- elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
2037
- width = 9
2038
- else:
2039
- width = 8
2040
- row_data.append(format_value(stat_val, max_width=width))
2389
+ else:
2390
+ # Fallback to regular table display
2391
+ _rich_print_gt_table(gt_table)
2041
2392
 
2042
- scan_table.add_row(*row_data)
2393
+ except Exception as e:
2394
+ console.print(f"[red]Error rendering missing values table:[/red] {e}")
2395
+ # Fallback to regular table display
2396
+ _rich_print_gt_table(gt_table)
2043
2397
 
2044
- # Display the results
2045
- console.print()
2046
- console.print(scan_table) # Add informational footer about the scan scope
2047
- try:
2048
- if total_rows is not None:
2049
- # Full table scan
2050
- footer_text = f"[dim]Scan from all {total_rows:,} rows in the table.[/dim]"
2051
-
2052
- # Create a simple footer
2053
- footer_table = Table(
2054
- show_header=False,
2055
- show_lines=False,
2056
- box=None,
2057
- padding=(0, 0),
2058
- )
2059
- footer_table.add_column("", style="dim", width=80)
2060
- footer_table.add_row(footer_text)
2061
- console.print(footer_table)
2062
2398
 
2063
- except Exception:
2064
- # If we can't determine the scan scope, don't show a footer
2065
- pass
2399
+ def _map_parameters_to_checks(
2400
+ checks_list: list[str], columns_list: list[str], sets_list: list[str], values_list: list[float]
2401
+ ) -> tuple[list[str], list[str], list[float]]:
2402
+ """
2403
+ Map parameters to checks intelligently, handling flexible parameter ordering.
2066
2404
 
2067
- except Exception as e:
2068
- # Fallback to simple message if table creation fails
2069
- console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
2070
- console.print(f"[red]Error displaying table: {str(e)}[/red]")
2405
+ This function distributes the provided parameters across checks based on what each check needs.
2406
+ For checks that don't need certain parameters, None/empty values are assigned.
2071
2407
 
2408
+ Args:
2409
+ checks_list: List of validation check types
2410
+ columns_list: List of column names provided by user
2411
+ sets_list: List of set values provided by user
2412
+ values_list: List of numeric values provided by user
2072
2413
 
2073
- @cli.command(name="validate-simple")
2074
- @click.argument("data_source", type=str)
2075
- @click.option(
2076
- "--check",
2077
- type=click.Choice(
2078
- [
2079
- "rows-distinct",
2414
+ Returns:
2415
+ Tuple of (mapped_columns, mapped_sets, mapped_values) where each list
2416
+ has the same length as checks_list
2417
+ """
2418
+ mapped_columns = []
2419
+ mapped_sets = []
2420
+ mapped_values = []
2421
+
2422
+ # Keep track of which parameters we've used
2423
+ column_index = 0
2424
+ set_index = 0
2425
+ value_index = 0
2426
+
2427
+ for check in checks_list:
2428
+ # Determine what parameters this check needs
2429
+ needs_column = check in [
2080
2430
  "col-vals-not-null",
2081
- "rows-complete",
2082
2431
  "col-exists",
2083
2432
  "col-vals-in-set",
2084
2433
  "col-vals-gt",
@@ -2086,652 +2435,1210 @@ def _rich_print_scan_table(
2086
2435
  "col-vals-lt",
2087
2436
  "col-vals-le",
2088
2437
  ]
2089
- ),
2090
- default="rows-distinct",
2091
- help="Type of validation check to perform",
2092
- )
2093
- @click.option(
2094
- "--column",
2095
- help="Column name to validate (required for col-vals-not-null, col-exists, col-vals-in-set, col-vals-gt, col-vals-ge, col-vals-lt, and col-vals-le checks)",
2096
- )
2097
- @click.option("--set", help="Comma-separated allowed values (required for col-vals-in-set check)")
2098
- @click.option(
2099
- "--value",
2100
- type=float,
2101
- help="Numeric value for comparison (required for col-vals-gt, col-vals-ge, col-vals-lt, and col-vals-le checks)",
2102
- )
2103
- @click.option(
2104
- "--show-extract", is_flag=True, help="Show preview of failing rows if validation fails"
2105
- )
2106
- @click.option(
2107
- "--limit", "-l", default=10, help="Maximum number of failing rows to show (default: 10)"
2108
- )
2109
- @click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
2110
- def validate_simple(
2438
+ needs_set = check == "col-vals-in-set"
2439
+ needs_value = check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]
2440
+
2441
+ # Assign column parameter if needed
2442
+ if needs_column:
2443
+ if column_index < len(columns_list):
2444
+ mapped_columns.append(columns_list[column_index])
2445
+ column_index += 1
2446
+ else:
2447
+ mapped_columns.append(None) # Will cause validation error later
2448
+ else:
2449
+ mapped_columns.append(None)
2450
+
2451
+ # Assign set parameter if needed
2452
+ if needs_set:
2453
+ if set_index < len(sets_list):
2454
+ mapped_sets.append(sets_list[set_index])
2455
+ set_index += 1
2456
+ else:
2457
+ mapped_sets.append(None) # Will cause validation error later
2458
+ else:
2459
+ mapped_sets.append(None)
2460
+
2461
+ # Assign value parameter if needed
2462
+ if needs_value:
2463
+ if value_index < len(values_list):
2464
+ mapped_values.append(values_list[value_index])
2465
+ value_index += 1
2466
+ else:
2467
+ mapped_values.append(None) # Will cause validation error later
2468
+ else:
2469
+ mapped_values.append(None)
2470
+
2471
+ return mapped_columns, mapped_sets, mapped_values
2472
+
2473
+
2474
+ def _resolve_column_indices(columns_list, data):
2475
+ """
2476
+ Replace any '#N' entries in columns_list with the actual column name from data (1-based).
2477
+ """
2478
+ # Get column names from the data
2479
+ if hasattr(data, "columns"):
2480
+ all_columns = list(data.columns)
2481
+ elif hasattr(data, "schema"):
2482
+ all_columns = list(data.schema.names)
2483
+ else:
2484
+ return columns_list # Can't resolve, return as-is
2485
+
2486
+ resolved = []
2487
+ for col in columns_list:
2488
+ if isinstance(col, str) and col.startswith("#"):
2489
+ try:
2490
+ idx = int(col[1:]) - 1 # 1-based to 0-based
2491
+ if 0 <= idx < len(all_columns):
2492
+ resolved.append(all_columns[idx])
2493
+ else:
2494
+ resolved.append(col) # Out of range, keep as-is
2495
+ except Exception:
2496
+ resolved.append(col) # Not a valid number, keep as-is
2497
+ else:
2498
+ resolved.append(col)
2499
+ return resolved
2500
+
2501
+
2502
# Human-readable titles for the known check types
_VALIDATION_CHECK_TITLES = {
    "rows-distinct": "Rows Distinct",
    "col-vals-not-null": "Column Values Not Null",
    "rows-complete": "Rows Complete",
    "col-exists": "Column Exists",
    "col-vals-in-set": "Column Values In Set",
    "col-vals-gt": "Column Values Greater Than",
    "col-vals-ge": "Column Values Greater Than Or Equal",
    "col-vals-lt": "Column Values Less Than",
    "col-vals-le": "Column Values Less Than Or Equal",
}

# Check types whose result table gets a "Column" row
_VALIDATION_COLUMN_CHECKS = frozenset(
    {
        "col-vals-not-null",
        "col-exists",
        "col-vals-in-set",
        "col-vals-gt",
        "col-vals-ge",
        "col-vals-lt",
        "col-vals-le",
    }
)

# Range checks mapped to (operator a passing value satisfies, operator describing a failing value)
_VALIDATION_RANGE_OPERATORS = {
    "col-vals-gt": (">", "<="),
    "col-vals-ge": (">=", "<"),
    "col-vals-lt": ("<", ">="),
    "col-vals-le": ("<=", ">"),
}


def _validation_detail_row(
    check: str, passed: bool, n_failed: Any, value: float | None
) -> tuple[str, str] | None:
    """Return the (label, markup) detail row for a step outcome, or None for unknown checks."""
    pass_op, fail_op = _VALIDATION_RANGE_OPERATORS.get(check, (None, None))

    if passed:
        if pass_op is not None:
            return "Values Status", f"[green]All values {pass_op} {value}[/green]"
        return {
            "rows-distinct": ("Duplicate Rows", "[green]None found[/green]"),
            "col-vals-not-null": ("Null Values", "[green]None found[/green]"),
            "rows-complete": ("Incomplete Rows", "[green]None found[/green]"),
            "col-exists": ("Column Status", "[green]Column exists[/green]"),
            "col-vals-in-set": ("Values Status", "[green]All values in allowed set[/green]"),
        }.get(check)

    if fail_op is not None:
        return "Invalid Values", f"[red]{n_failed:,} values {fail_op} {value}[/red]"
    if check == "col-exists":
        return "Column Status", "[red]Column does not exist[/red]"

    label = {
        "rows-distinct": "Duplicate Rows",
        "col-vals-not-null": "Null Values",
        "rows-complete": "Incomplete Rows",
        "col-vals-in-set": "Invalid Values",
    }.get(check)
    if label is None:
        return None
    return label, f"[red]{n_failed:,} found[/red]"


def _validation_step_message(
    check: str,
    column: str | None,
    value: float | None,
    data_source: str,
    n_failed: Any,
    passed: bool,
) -> str:
    """Build the Rich-markup summary sentence shown in the per-step panel (multi-check mode)."""
    pass_op, fail_op = _VALIDATION_RANGE_OPERATORS.get(check, (None, None))

    if passed:
        if pass_op is not None:
            detail = f"All values in column '{column}' are {pass_op} {value} in {data_source}"
        elif check == "rows-distinct":
            detail = f"No duplicate rows found in {data_source}"
        elif check == "col-vals-not-null":
            detail = f"No null values found in column '{column}' in {data_source}"
        elif check == "rows-complete":
            detail = f"All rows are complete (no missing values) in {data_source}"
        elif check == "col-exists":
            detail = f"Column '{column}' exists in {data_source}"
        elif check == "col-vals-in-set":
            detail = f"All values in column '{column}' are in the allowed set in {data_source}"
        else:
            detail = f"{check} check passed for {data_source}"
        return f"[green]✓ Validation PASSED: {detail}[/green]"

    if fail_op is not None:
        detail = (
            f"{n_failed:,} values {fail_op} {value} found in column '{column}' in {data_source}"
        )
    elif check == "rows-distinct":
        detail = f"{n_failed:,} duplicate rows found in {data_source}"
    elif check == "col-vals-not-null":
        detail = f"{n_failed:,} null values found in column '{column}' in {data_source}"
    elif check == "rows-complete":
        detail = f"{n_failed:,} incomplete rows found in {data_source}"
    elif check == "col-exists":
        detail = f"Column '{column}' does not exist in {data_source}"
    elif check == "col-vals-in-set":
        detail = f"{n_failed:,} invalid values found in column '{column}' in {data_source}"
    else:
        detail = f"{n_failed:,} failing rows found in {data_source}"
    return f"[red]✗ Validation FAILED: {detail}[/red]"


def _display_validation_result(
    validation: Any,
    checks_list: list[str],
    columns_list: list[str],
    sets_list: list[str],
    values_list: list[float],
    data_source: str,
    step_index: int,
    total_checks: int,
    show_extract: bool,
    write_extract: str | None,
    limit: int,
) -> None:
    """
    Display a single validation result with proper formatting for single or multiple checks.

    Parameters
    ----------
    validation
        Validation object; `validation.validation_info[step_index]` (when present) supplies the
        `n`, `n_passed` and `n_failed` counts for the step.
    checks_list, columns_list, sets_list, values_list
        Per-step parameters; the entry at `step_index` of each list is used (`None` when a list
        is shorter than `step_index + 1`, except `checks_list` which must contain the index).
    data_source
        Label of the data source, shown in the table and messages.
    step_index
        Zero-based index of the step to display.
    total_checks
        Total number of checks in the run; `1` selects the single-check layout.
    show_extract, write_extract, limit
        Forwarded to the extract helpers that print and/or save failing rows.
    """
    from rich.box import SIMPLE_HEAD

    # Get parameters for this specific check
    check = checks_list[step_index]
    column = columns_list[step_index] if step_index < len(columns_list) else None
    set_val = sets_list[step_index] if step_index < len(sets_list) else None
    value = values_list[step_index] if step_index < len(values_list) else None

    # Get validation step info
    step_info = None
    if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
        step_info = validation.validation_info[step_index]

    # Create friendly title for table; unknown check types fall back to a title-cased name
    base_title = _VALIDATION_CHECK_TITLES.get(check)
    if base_title is None:
        base_title = check.replace("-", " ").title()
    if total_checks == 1:
        table_title = f"Validation Result: {base_title}"
    else:
        # Multiple checks - add numbering
        table_title = f"Validation Result ({step_index + 1} of {total_checks}): {base_title}"

    # Create the validation results table
    result_table = Table(
        title=table_title,
        show_header=True,
        header_style="bold magenta",
        box=SIMPLE_HEAD,
        title_style="bold cyan",
        title_justify="left",
    )
    result_table.add_column("Property", style="cyan", no_wrap=True)
    result_table.add_column("Value", style="white")

    # Add basic info
    result_table.add_row("Data Source", data_source)
    result_table.add_row("Check Type", check)

    # Add column info for column-specific checks
    if check in _VALIDATION_COLUMN_CHECKS:
        result_table.add_row("Column", column)

    # Add set info for col-vals-in-set check
    if check == "col-vals-in-set" and set_val:
        allowed_values = [v.strip() for v in set_val.split(",")]
        result_table.add_row("Allowed Values", ", ".join(allowed_values))

    # Add value info for range checks
    if check in _VALIDATION_RANGE_OPERATORS and value is not None:
        comparison = _VALIDATION_RANGE_OPERATORS[check][0]
        result_table.add_row("Threshold", f"{comparison} {value}")

    # Get validation details
    step_passed = False
    if step_info:
        result_table.add_row("Total Rows Tested", f"{step_info.n:,}")
        result_table.add_row("Passing Rows", f"{step_info.n_passed:,}")
        result_table.add_row("Failing Rows", f"{step_info.n_failed:,}")

        # Check if this step passed
        step_passed = step_info.n_failed == 0

        # Overall result with color coding
        if step_passed:
            result_table.add_row("Result", "[green]✓ PASSED[/green]")
        else:
            result_table.add_row("Result", "[red]✗ FAILED[/red]")

        detail_row = _validation_detail_row(check, step_passed, step_info.n_failed, value)
        if detail_row is not None:
            result_table.add_row(*detail_row)

    console.print()
    console.print(result_table)

    if total_checks == 1:
        # For single check, the helper shows both the extract and the summary panel
        _show_extract_and_summary(
            validation,
            check,
            column,
            set_val,
            value,
            data_source,
            step_index,
            step_info,
            show_extract,
            write_extract,
            limit,
        )
        return

    # For multiple checks, a summary panel is only shown when step info is available
    if not step_info:
        return

    console.print(
        Panel(
            _validation_step_message(
                check, column, value, data_source, step_info.n_failed, step_passed
            ),
            border_style="green" if step_passed else "red",
        )
    )

    # For multiple checks, show extract if requested and this step failed
    if (show_extract or write_extract) and not step_passed:
        _show_extract_for_multi_check(
            validation,
            check,
            column,
            set_val,
            value,
            data_source,
            step_index,
            step_info,
            show_extract,
            write_extract,
            limit,
        )
2778
+
2779
+
2780
def _multi_check_extract_labels(
    check: str, column: str | None, value: float | None
) -> tuple[str, str]:
    """Return (heading markup, row description) used when presenting failing rows for a check."""
    # Operator that describes a *failing* value for each range check
    failing_operator = {
        "col-vals-gt": "<=",
        "col-vals-ge": "<",
        "col-vals-lt": ">=",
        "col-vals-le": ">",
    }.get(check)
    if failing_operator is not None:
        return (
            f"[yellow]Extract of failing rows (values in '{column}' {failing_operator} {value}):[/yellow]",
            f"rows with values {failing_operator} {value}",
        )

    if check == "rows-distinct":
        return "[yellow]Extract of failing rows (duplicates):[/yellow]", "duplicate rows"
    if check == "rows-complete":
        return "[yellow]Extract of failing rows (incomplete rows):[/yellow]", "incomplete rows"
    if check == "col-exists":
        return (
            f"[yellow]Column '{column}' does not exist in the dataset[/yellow]",
            "missing column",
        )
    if check == "col-vals-not-null":
        return (
            f"[yellow]Extract of failing rows (null values in '{column}'):[/yellow]",
            "rows with null values",
        )
    if check == "col-vals-in-set":
        return (
            f"[yellow]Extract of failing rows (invalid values in '{column}'):[/yellow]",
            "rows with invalid values",
        )
    return "[yellow]Extract of failing rows:[/yellow]", "failing rows"


def _multi_check_save_failing_rows(
    failing_rows: Any, write_extract: str, check: str, step_index: int, limit: int
) -> None:
    """Write at most `limit` failing rows to `<write_extract>/step_NN_<check>.csv` and report it."""
    from pathlib import Path

    # Create the output folder
    output_folder = Path(write_extract)
    output_folder.mkdir(parents=True, exist_ok=True)

    # Create safe filename from check type
    safe_check_type = check.replace("-", "_")
    filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
    filepath = output_folder / filename

    # Limit the output if needed
    write_rows = failing_rows.head(limit) if len(failing_rows) > limit else failing_rows

    # Save to CSV
    if hasattr(write_rows, "write_csv"):
        # Polars
        write_rows.write_csv(str(filepath))
    elif hasattr(write_rows, "to_csv"):
        # Pandas
        write_rows.to_csv(str(filepath), index=False)
    else:
        # Try converting to pandas as fallback
        import pandas as pd

        pd.DataFrame(write_rows).to_csv(str(filepath), index=False)

    rows_saved = len(write_rows) if hasattr(write_rows, "__len__") else limit
    console.print(f"[green]✓[/green] Failing rows saved to folder: {output_folder}")
    console.print(f"[dim] - {filename}: {rows_saved} rows[/dim]")


def _show_extract_for_multi_check(
    validation: Any,
    check: str,
    column: str | None,
    set_val: str | None,
    value: float | None,
    data_source: str,
    step_index: int,
    step_info: Any,
    show_extract: bool,
    write_extract: str | None,
    limit: int,
) -> None:
    """
    Show extract for a single validation step in multiple checks scenario.

    Parameters
    ----------
    validation
        Validation object; failing rows are fetched with
        `validation.get_data_extracts(i=step_index + 1, frame=True)`.
    check
        Check type; selects the heading text and the CSV file name.
    column, value
        Only used to word the heading and row description.
    set_val, data_source, step_info
        Not used by this function; the parameter list mirrors `_show_extract_and_summary`.
    step_index
        Zero-based index of the step.
    show_extract
        When true, print a preview of the failing rows to the console.
    write_extract
        When set, folder in which a CSV of the failing rows is saved.
    limit
        Maximum number of rows to preview and to save.
    """
    # Dynamic message based on check type
    extract_message, row_type = _multi_check_extract_labels(check, column, value)

    if show_extract:
        console.print()
        console.print(extract_message)

    # Special handling for col-exists check - no rows to show when column doesn't exist
    if check == "col-exists":
        if show_extract:
            console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
            console.print(
                "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
            )
        if write_extract:
            console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
        return

    # Extract display is best-effort: any failure is reported as a warning instead of aborting
    # the remaining checks, hence the deliberately broad handlers below
    try:
        # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
        failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)

        if failing_rows is None or len(failing_rows) == 0:
            if show_extract:
                console.print("[yellow]No failing rows could be extracted[/yellow]")
            if write_extract:
                console.print("[yellow]No failing rows could be extracted to save[/yellow]")
            return

        if show_extract:
            # Limit the number of rows shown
            if len(failing_rows) > limit:
                display_rows = failing_rows.head(limit)
                console.print(
                    f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
                )
            else:
                display_rows = failing_rows
                console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")

            # Create a preview table using pointblank's preview function
            import pointblank as pb

            preview_table = pb.preview(
                data=display_rows,
                n_head=min(limit, len(display_rows)),
                n_tail=0,
                limit=limit,
                show_row_numbers=True,
            )

            # Display using our Rich table function
            _rich_print_gt_table(preview_table, show_summary=False)

        if write_extract:
            try:
                _multi_check_save_failing_rows(
                    failing_rows, write_extract, check, step_index, limit
                )
            except Exception as e:
                console.print(f"[yellow]Warning: Could not save failing rows: {e}[/yellow]")
    except Exception as e:
        if show_extract:
            console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
        if write_extract:
            console.print(f"[yellow]Could not extract failing rows to save: {e}[/yellow]")
2931
+
2932
+
2933
+ def _show_extract_and_summary(
2934
+ validation: Any,
2112
2935
  check: str,
2113
2936
  column: str | None,
2114
- set: str | None,
2937
+ set_val: str | None,
2115
2938
  value: float | None,
2939
+ data_source: str,
2940
+ step_index: int,
2941
+ step_info: Any,
2116
2942
  show_extract: bool,
2943
+ write_extract: str | None,
2117
2944
  limit: int,
2118
- exit_code: bool,
2119
- ):
2120
- """
2121
- Perform simple, single-step validations directly from the command line.
2945
+ ) -> None:
2946
+ """Show extract and summary for a validation step (used for single checks)."""
2947
+ step_passed = step_info.n_failed == 0 if step_info else True
2122
2948
 
2123
- This command provides a quick way to perform common data validation checks
2124
- without needing to write a validation script.
2949
+ # Show extract if requested and validation failed
2950
+ if (show_extract or write_extract) and not step_passed:
2951
+ console.print()
2125
2952
 
2126
- DATA_SOURCE can be:
2953
+ # Dynamic message based on check type
2954
+ if check == "rows-distinct":
2955
+ extract_message = "[yellow]Extract of failing rows (duplicates):[/yellow]"
2956
+ row_type = "duplicate rows"
2957
+ elif check == "rows-complete":
2958
+ extract_message = "[yellow]Extract of failing rows (incomplete rows):[/yellow]"
2959
+ row_type = "incomplete rows"
2960
+ elif check == "col-exists":
2961
+ extract_message = f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
2962
+ row_type = "missing column"
2963
+ elif check == "col-vals-not-null":
2964
+ extract_message = (
2965
+ f"[yellow]Extract of failing rows (null values in '{column}'):[/yellow]"
2966
+ )
2967
+ row_type = "rows with null values"
2968
+ elif check == "col-vals-in-set":
2969
+ extract_message = (
2970
+ f"[yellow]Extract of failing rows (invalid values in '{column}'):[/yellow]"
2971
+ )
2972
+ row_type = "rows with invalid values"
2973
+ elif check == "col-vals-gt":
2974
+ extract_message = (
2975
+ f"[yellow]Extract of failing rows (values in '{column}' <= {value}):[/yellow]"
2976
+ )
2977
+ row_type = f"rows with values <= {value}"
2978
+ elif check == "col-vals-ge":
2979
+ extract_message = (
2980
+ f"[yellow]Extract of failing rows (values in '{column}' < {value}):[/yellow]"
2981
+ )
2982
+ row_type = f"rows with values < {value}"
2983
+ elif check == "col-vals-lt":
2984
+ extract_message = (
2985
+ f"[yellow]Extract of failing rows (values in '{column}' >= {value}):[/yellow]"
2986
+ )
2987
+ row_type = f"rows with values >= {value}"
2988
+ elif check == "col-vals-le":
2989
+ extract_message = (
2990
+ f"[yellow]Extract of failing rows (values in '{column}' > {value}):[/yellow]"
2991
+ )
2992
+ row_type = f"rows with values > {value}"
2993
+ else:
2994
+ extract_message = "[yellow]Extract of failing rows:[/yellow]"
2995
+ row_type = "failing rows"
2127
2996
 
2128
- \b
2129
- - CSV file path (e.g., data.csv)
2130
- - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
2131
- - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
2132
- - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
2997
+ if show_extract:
2998
+ console.print(extract_message)
2133
2999
 
2134
- AVAILABLE CHECKS:
3000
+ # Special handling for col-exists check - no rows to show when column doesn't exist
3001
+ if check == "col-exists" and not step_passed:
3002
+ if show_extract:
3003
+ console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
3004
+ console.print(
3005
+ "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
3006
+ )
3007
+ if write_extract:
3008
+ console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
3009
+ else:
3010
+ try:
3011
+ # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
3012
+ failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
2135
3013
 
2136
- \b
2137
- - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
2138
- - rows-complete: Check if all rows are complete (no missing values in any column)
2139
- - col-exists: Check if a specific column exists in the dataset (requires --column)
2140
- - col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
2141
- - col-vals-gt: Check if all values in a column are greater than a threshold (requires --column and --value)
2142
- - col-vals-ge: Check if all values in a column are greater than or equal to a threshold (requires --column and --value)
2143
- - col-vals-lt: Check if all values in a column are less than a threshold (requires --column and --value)
2144
- - col-vals-le: Check if all values in a column are less than or equal to a threshold (requires --column and --value)
2145
- - col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
3014
+ if failing_rows is not None and len(failing_rows) > 0:
3015
+ if show_extract:
3016
+ # Limit the number of rows shown
3017
+ if len(failing_rows) > limit:
3018
+ display_rows = failing_rows.head(limit)
3019
+ console.print(
3020
+ f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
3021
+ )
3022
+ else:
3023
+ display_rows = failing_rows
3024
+ console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")
2146
3025
 
2147
- Examples:
3026
+ # Create a preview table using pointblank's preview function
3027
+ import pointblank as pb
2148
3028
 
2149
- \b
2150
- pb validate-simple data.csv --check rows-distinct
2151
- pb validate-simple data.csv --check rows-distinct --show-extract
2152
- pb validate-simple data.csv --check rows-distinct --exit-code
2153
- pb validate-simple data.csv --check rows-complete
2154
- pb validate-simple data.csv --check col-exists --column price
2155
- pb validate-simple data.csv --check col-vals-not-null --column email
2156
- pb validate-simple data.csv --check col-vals-gt --column score --value 50
2157
- pb validate-simple data.csv --check col-vals-in-set --column status --set "active,inactive,pending"
2158
- """
2159
- try:
2160
- # Validate required parameters for different check types
2161
- if check == "col-vals-not-null" and not column:
2162
- console.print(f"[red]Error:[/red] --column is required for {check} check")
2163
- console.print(
2164
- "Example: pb validate-simple data.csv --check col-vals-not-null --column email"
2165
- )
2166
- sys.exit(1)
2167
- sys.exit(1)
3029
+ preview_table = pb.preview(
3030
+ data=display_rows,
3031
+ n_head=min(limit, len(display_rows)),
3032
+ n_tail=0,
3033
+ limit=limit,
3034
+ show_row_numbers=True,
3035
+ )
2168
3036
 
2169
- if check == "col-exists" and not column:
2170
- console.print(f"[red]Error:[/red] --column is required for {check} check")
2171
- console.print("Example: pb validate-simple data.csv --check col-exists --column price")
2172
- sys.exit(1)
3037
+ # Display using our Rich table function
3038
+ _rich_print_gt_table(preview_table, show_summary=False)
3039
+
3040
+ if write_extract:
3041
+ try:
3042
+ from pathlib import Path
3043
+
3044
+ folder_name = write_extract
3045
+
3046
+ # Create the output folder
3047
+ output_folder = Path(folder_name)
3048
+ output_folder.mkdir(parents=True, exist_ok=True)
3049
+
3050
+ # Create safe filename from check type
3051
+ safe_check_type = check.replace("-", "_")
3052
+ filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
3053
+ filepath = output_folder / filename
3054
+
3055
+ # Limit the output if needed
3056
+ write_rows = failing_rows
3057
+ if len(failing_rows) > limit:
3058
+ write_rows = failing_rows.head(limit)
3059
+
3060
+ # Save to CSV
3061
+ if hasattr(write_rows, "write_csv"):
3062
+ # Polars
3063
+ write_rows.write_csv(str(filepath))
3064
+ elif hasattr(write_rows, "to_csv"):
3065
+ # Pandas
3066
+ write_rows.to_csv(str(filepath), index=False)
3067
+ else:
3068
+ # Try converting to pandas as fallback
3069
+ import pandas as pd
2173
3070
 
2174
- if check == "col-vals-in-set" and not column:
2175
- console.print(f"[red]Error:[/red] --column is required for {check} check")
2176
- console.print(
2177
- "Example: pb validate-simple data.csv --check col-vals-in-set --column status --set 'active,inactive'"
2178
- )
2179
- sys.exit(1)
3071
+ pd_data = pd.DataFrame(write_rows)
3072
+ pd_data.to_csv(str(filepath), index=False)
2180
3073
 
2181
- if check == "col-vals-in-set" and not set:
2182
- console.print(f"[red]Error:[/red] --set is required for {check} check")
2183
- console.print(
2184
- "Example: pb validate-simple data.csv --check col-vals-in-set --column status --set 'active,inactive,pending'"
2185
- )
2186
- sys.exit(1)
3074
+ rows_saved = (
3075
+ len(write_rows) if hasattr(write_rows, "__len__") else limit
3076
+ )
3077
+ console.print(
3078
+ f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
3079
+ )
3080
+ console.print(f"[dim] - {filename}: {rows_saved} rows[/dim]")
3081
+ except Exception as e:
3082
+ console.print(
3083
+ f"[yellow]Warning: Could not save failing rows: {e}[/yellow]"
3084
+ )
3085
+ else:
3086
+ if show_extract:
3087
+ console.print("[yellow]No failing rows could be extracted[/yellow]")
3088
+ if write_extract:
3089
+ console.print("[yellow]No failing rows could be extracted to save[/yellow]")
3090
+ except Exception as e:
3091
+ if show_extract:
3092
+ console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
3093
+ if write_extract:
3094
+ console.print(f"[yellow]Could not extract failing rows to save: {e}[/yellow]")
2187
3095
 
2188
- if check == "col-vals-gt" and not column:
2189
- console.print(f"[red]Error:[/red] --column is required for {check} check")
2190
- console.print(
2191
- "Example: pb validate-simple data.csv --check col-vals-gt --column score --value 50"
3096
+ # Summary message
3097
+ console.print()
3098
+ if step_passed:
3099
+ if check == "rows-distinct":
3100
+ success_message = (
3101
+ f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
2192
3102
  )
2193
- sys.exit(1)
2194
-
2195
- if check == "col-vals-gt" and value is None:
2196
- console.print(f"[red]Error:[/red] --value is required for {check} check")
2197
- console.print(
2198
- "Example: pb validate-simple data.csv --check col-vals-gt --column score --value 50"
3103
+ elif check == "col-vals-not-null":
3104
+ success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
3105
+ elif check == "rows-complete":
3106
+ success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
3107
+ elif check == "col-exists":
3108
+ success_message = (
3109
+ f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
2199
3110
  )
2200
- sys.exit(1)
2201
-
2202
- if check == "col-vals-ge" and not column:
2203
- console.print(f"[red]Error:[/red] --column is required for {check} check")
2204
- console.print(
2205
- "Example: pb validate-simple data.csv --check col-vals-ge --column age --value 18"
3111
+ elif check == "col-vals-in-set":
3112
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
3113
+ elif check == "col-vals-gt":
3114
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
3115
+ elif check == "col-vals-ge":
3116
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
3117
+ elif check == "col-vals-lt":
3118
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
3119
+ elif check == "col-vals-le":
3120
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
3121
+ else:
3122
+ success_message = (
3123
+ f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
2206
3124
  )
2207
- sys.exit(1)
2208
3125
 
2209
- if check == "col-vals-ge" and value is None:
2210
- console.print(f"[red]Error:[/red] --value is required for {check} check")
2211
- console.print(
2212
- "Example: pb validate-simple data.csv --check col-vals-ge --column age --value 18"
2213
- )
2214
- sys.exit(1)
3126
+ console.print(Panel(success_message, border_style="green"))
3127
+ else:
3128
+ if step_info:
3129
+ if check == "rows-distinct":
3130
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
3131
+ elif check == "col-vals-not-null":
3132
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
3133
+ elif check == "rows-complete":
3134
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
3135
+ elif check == "col-exists":
3136
+ failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
3137
+ elif check == "col-vals-in-set":
3138
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
3139
+ elif check == "col-vals-gt":
3140
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
3141
+ elif check == "col-vals-ge":
3142
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
3143
+ elif check == "col-vals-lt":
3144
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
3145
+ elif check == "col-vals-le":
3146
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
3147
+ else:
3148
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
2215
3149
 
2216
- if check == "col-vals-lt" and not column:
2217
- console.print(f"[red]Error:[/red] --column is required for {check} check")
2218
- console.print(
2219
- "Example: pb validate-simple data.csv --check col-vals-lt --column age --value 65"
2220
- )
2221
- sys.exit(1)
3150
+ # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
3151
+ if not show_extract and check != "col-exists":
3152
+ failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
2222
3153
 
2223
- if check == "col-vals-lt" and value is None:
2224
- console.print(f"[red]Error:[/red] --value is required for {check} check")
2225
- console.print(
2226
- "Example: pb validate-simple data.csv --check col-vals-lt --column age --value 65"
2227
- )
2228
- sys.exit(1)
3154
+ console.print(Panel(failure_message, border_style="red"))
3155
+ else:
3156
+ if check == "rows-distinct":
3157
+ failure_message = (
3158
+ f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
3159
+ )
3160
+ elif check == "rows-complete":
3161
+ failure_message = (
3162
+ f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
3163
+ )
3164
+ else:
3165
+ failure_message = (
3166
+ f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
3167
+ )
2229
3168
 
2230
- if check == "col-vals-le" and not column:
2231
- console.print(f"[red]Error:[/red] --column is required for {check} check")
2232
- console.print(
2233
- "Example: pb validate-simple data.csv --check col-vals-le --column score --value 100"
2234
- )
2235
- sys.exit(1)
3169
+ # Add hint about --show-extract if not already used
3170
+ if not show_extract:
3171
+ failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
2236
3172
 
2237
- if check == "col-vals-le" and value is None:
2238
- console.print(f"[red]Error:[/red] --value is required for {check} check")
2239
- console.print(
2240
- "Example: pb validate-simple data.csv --check col-vals-le --column score --value 100"
2241
- )
2242
- sys.exit(1)
3173
+ console.print(Panel(failure_message, border_style="red"))
2243
3174
 
2244
- with console.status("[bold green]Loading data..."):
2245
- # Try to load as a pointblank dataset first
2246
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
2247
- data = pb.load_dataset(data_source)
2248
- console.print(f"[green]✓[/green] Loaded dataset: {data_source}")
2249
- else:
2250
- # Assume it's a file path or connection string
2251
- data = data_source
2252
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
2253
3175
 
2254
- # Perform the validation based on the check type
2255
- with console.status(f"[bold green]Running {check} validation..."):
2256
- if check == "rows-distinct":
2257
- # Create validation for duplicate rows
2258
- validation = (
2259
- pb.Validate(
2260
- data=data,
2261
- tbl_name=f"Data from {data_source}",
2262
- label=f"CLI Simple Validation: {check}",
2263
- )
2264
- .rows_distinct()
2265
- .interrogate()
2266
- )
3176
+ @cli.command()
3177
+ @click.argument("output_file", type=click.Path())
3178
+ def make_template(output_file: str):
3179
+ """
3180
+ Create a validation script template.
2267
3181
 
2268
- # Get the result
2269
- all_passed = validation.all_passed()
3182
+ Creates a sample Python script with examples showing how to use Pointblank
3183
+ for data validation. Edit the template to add your own data loading and
3184
+ validation rules, then run it with 'pb run'.
2270
3185
 
2271
- console.print(
2272
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2273
- )
2274
- elif check == "col-vals-not-null":
2275
- # Create validation for not null values in specified column
2276
- validation = (
2277
- pb.Validate(
2278
- data=data,
2279
- tbl_name=f"Data from {data_source}",
2280
- label=f"CLI Simple Validation: {check} on column '{column}'",
2281
- )
2282
- .col_vals_not_null(columns=column)
2283
- .interrogate()
2284
- )
3186
+ OUTPUT_FILE is the path where the template script will be created.
2285
3187
 
2286
- # Get the result
2287
- all_passed = validation.all_passed()
3188
+ Examples:
2288
3189
 
2289
- console.print(
2290
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2291
- )
2292
- elif check == "rows-complete":
2293
- # Create validation for complete rows (no missing values in any column)
2294
- validation = (
2295
- pb.Validate(
2296
- data=data,
2297
- tbl_name=f"Data from {data_source}",
2298
- label=f"CLI Simple Validation: {check}",
2299
- )
2300
- .rows_complete()
2301
- .interrogate()
2302
- )
3190
+ \b
3191
+ pb make-template my_validation.py
3192
+ pb make-template validation_template.py
3193
+ """
3194
+ example_script = '''"""
3195
+ Example Pointblank validation script.
2303
3196
 
2304
- # Get the result
2305
- all_passed = validation.all_passed()
3197
+ This script demonstrates how to create validation rules for your data.
3198
+ Modify the data loading and validation rules below to match your requirements.
3199
+ """
2306
3200
 
2307
- console.print(
2308
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2309
- )
2310
- elif check == "col-exists":
2311
- # Create validation for column existence
2312
- validation = (
2313
- pb.Validate(
2314
- data=data,
2315
- tbl_name=f"Data from {data_source}",
2316
- label=f"CLI Simple Validation: {check} for column '{column}'",
2317
- )
2318
- .col_exists(columns=column)
2319
- .interrogate()
2320
- )
3201
+ import pointblank as pb
2321
3202
 
2322
- # Get the result
2323
- all_passed = validation.all_passed()
3203
+ # Load your data (replace this with your actual data source)
3204
+ # You can load from various sources:
3205
+ # data = pb.load_dataset("small_table") # Built-in dataset
3206
+ # data = pd.read_csv("your_data.csv") # CSV file
3207
+ # data = pl.read_parquet("data.parquet") # Parquet file
3208
+ # data = pb.load_data("database://connection") # Database
2324
3209
 
2325
- console.print(
2326
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2327
- )
2328
- elif check == "col-vals-in-set":
2329
- # Parse the comma-separated set values
2330
- allowed_values = [value.strip() for value in set.split(",")]
2331
-
2332
- # Create validation for values in set
2333
- validation = (
2334
- pb.Validate(
2335
- data=data,
2336
- tbl_name=f"Data from {data_source}",
2337
- label=f"CLI Simple Validation: {check} for column '{column}'",
2338
- )
2339
- .col_vals_in_set(columns=column, set=allowed_values)
2340
- .interrogate()
2341
- )
3210
+ data = pb.load_dataset("small_table") # Example with built-in dataset
2342
3211
 
2343
- # Get the result
2344
- all_passed = validation.all_passed()
3212
+ # Create a validation object
3213
+ validation = (
3214
+ pb.Validate(
3215
+ data=data,
3216
+ tbl_name="Example Data",
3217
+ label="Validation Example",
3218
+ thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
3219
+ )
3220
+ # Add your validation rules here
3221
+ # Example rules (modify these based on your data structure):
2345
3222
 
2346
- console.print(
2347
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2348
- )
2349
- elif check == "col-vals-gt":
2350
- # Create validation for values greater than threshold
2351
- validation = (
2352
- pb.Validate(
2353
- data=data,
2354
- tbl_name=f"Data from {data_source}",
2355
- label=f"CLI Simple Validation: {check} for column '{column}' > {value}",
2356
- )
2357
- .col_vals_gt(columns=column, value=value)
2358
- .interrogate()
2359
- )
3223
+ # Check that specific columns exist
3224
+ # .col_exists(["column1", "column2"])
3225
+
3226
+ # Check for null values
3227
+ # .col_vals_not_null(columns="important_column")
3228
+
3229
+ # Check value ranges
3230
+ # .col_vals_gt(columns="amount", value=0)
3231
+ # .col_vals_between(columns="score", left=0, right=100)
3232
+
3233
+ # Check string patterns
3234
+ # .col_vals_regex(columns="email", pattern=r"^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$")
3235
+
3236
+ # Check unique values
3237
+ # .col_vals_unique(columns="id")
3238
+
3239
+ # Finalize the validation
3240
+ .interrogate()
3241
+ )
3242
+
3243
+ # The validation object will be automatically used by the CLI
3244
+ # You can also access results programmatically:
3245
+ # print(f"All passed: {validation.all_passed()}")
3246
+ # print(f"Failed steps: {validation.n_failed()}")
3247
+ '''
3248
+
3249
+ Path(output_file).write_text(example_script)
3250
+ console.print(f"[green]✓[/green] Validation script template created: {output_file}")
3251
+ console.print("\nEdit the template to add your data loading and validation rules, then run:")
3252
+ console.print(f"[cyan]pb run {output_file}[/cyan]")
3253
+ console.print(
3254
+ f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Override data source[/dim]"
3255
+ )
3256
+
3257
+
3258
+ @cli.command()
3259
+ @click.argument("validation_script", type=click.Path(exists=True))
3260
+ @click.option("--data", type=str, help="Optional data source to override script's data loading")
3261
+ @click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
3262
+ @click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
3263
+ @click.option(
3264
+ "--show-extract", is_flag=True, help="Show extract of failing rows if validation fails"
3265
+ )
3266
+ @click.option(
3267
+ "--write-extract",
3268
+ type=str,
3269
+ help="Save failing rows to folders (one CSV per step). Provide base name for folder.",
3270
+ )
3271
+ @click.option(
3272
+ "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
3273
+ )
3274
+ @click.option(
3275
+ "--fail-on",
3276
+ type=click.Choice(["critical", "error", "warning", "any"], case_sensitive=False),
3277
+ help="Exit with non-zero code when validation reaches this threshold level",
3278
+ )
3279
+ def run(
3280
+ validation_script: str,
3281
+ data: str | None,
3282
+ output_html: str | None,
3283
+ output_json: str | None,
3284
+ show_extract: bool,
3285
+ write_extract: str | None,
3286
+ limit: int,
3287
+ fail_on: str | None,
3288
+ ):
3289
+ """
3290
+ Run a Pointblank validation script.
2360
3291
 
2361
- # Get the result
2362
- all_passed = validation.all_passed()
3292
+ VALIDATION_SCRIPT should be a Python file that defines validation logic.
3293
+ The script should load its own data and create validation objects.
2363
3294
 
2364
- console.print(
2365
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2366
- )
2367
- elif check == "col-vals-ge":
2368
- # Create validation for values greater than or equal to threshold
2369
- validation = (
2370
- pb.Validate(
2371
- data=data,
2372
- tbl_name=f"Data from {data_source}",
2373
- label=f"CLI Simple Validation: {check} for column '{column}' >= {value}",
2374
- )
2375
- .col_vals_ge(columns=column, value=value)
2376
- .interrogate()
2377
- )
3295
+ If --data is provided, it will be available as a 'cli_data' variable in the script,
3296
+ allowing you to optionally override your script's data loading.
2378
3297
 
2379
- # Get the result
2380
- all_passed = validation.all_passed()
3298
+ DATA can be:
2381
3299
 
2382
- console.print(
2383
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2384
- )
2385
- elif check == "col-vals-lt":
2386
- # Create validation for values less than threshold
2387
- validation = (
2388
- pb.Validate(
2389
- data=data,
2390
- tbl_name=f"Data from {data_source}",
2391
- label=f"CLI Simple Validation: {check} for column '{column}' < {value}",
2392
- )
2393
- .col_vals_lt(columns=column, value=value)
2394
- .interrogate()
2395
- )
3300
+ \b
3301
+ - CSV file path (e.g., data.csv)
3302
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
3303
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
3304
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
3305
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
2396
3306
 
2397
- # Get the result
2398
- all_passed = validation.all_passed()
3307
+ Examples:
2399
3308
 
2400
- console.print(
2401
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2402
- )
2403
- elif check == "col-vals-le":
2404
- # Create validation for values less than or equal to threshold
2405
- validation = (
2406
- pb.Validate(
2407
- data=data,
2408
- tbl_name=f"Data from {data_source}",
2409
- label=f"CLI Simple Validation: {check} for column '{column}' <= {value}",
2410
- )
2411
- .col_vals_le(columns=column, value=value)
2412
- .interrogate()
2413
- )
3309
+ \b
3310
+ pb run validation_script.py
3311
+ pb run validation_script.py --data data.csv
3312
+ pb run validation_script.py --data small_table --output-html report.html
3313
+ pb run validation_script.py --show-extract --fail-on error
3314
+ pb run validation_script.py --write-extract extracts_folder --fail-on critical
3315
+ """
3316
+ try:
3317
+ # Load optional data override if provided
3318
+ cli_data = None
3319
+ if data:
3320
+ with console.status(f"[bold green]Loading data from {data}..."):
3321
+ cli_data = _load_data_source(data)
3322
+ console.print(f"[green]✓[/green] Loaded data override: {data}")
2414
3323
 
2415
- # Get the result
2416
- all_passed = validation.all_passed()
3324
+ # Execute the validation script
3325
+ with console.status("[bold green]Running validation script..."):
3326
+ # Read and execute the validation script
3327
+ script_content = Path(validation_script).read_text()
2417
3328
 
2418
- console.print(
2419
- f"[green]✓[/green] {check.replace('-', ' ').title()} validation completed"
2420
- )
2421
- else:
2422
- # This shouldn't happen due to click.Choice, but just in case
2423
- console.print(f"[red]Error:[/red] Unknown check type: {check}")
3329
+ # Create a namespace with pointblank and optional CLI data
3330
+ namespace = {
3331
+ "pb": pb,
3332
+ "pointblank": pb,
3333
+ "cli_data": cli_data, # Available if --data was provided
3334
+ "__name__": "__main__",
3335
+ "__file__": str(Path(validation_script).resolve()),
3336
+ }
3337
+
3338
+ # Execute the script
3339
+ try:
3340
+ exec(script_content, namespace)
3341
+ except Exception as e:
3342
+ console.print(f"[red]Error executing validation script:[/red] {e}")
2424
3343
  sys.exit(1)
2425
3344
 
2426
- # Display results
2427
- from rich.box import SIMPLE_HEAD
3345
+ # Look for validation objects in the namespace
3346
+ validations = []
2428
3347
 
2429
- # Create friendly title for table
2430
- if check == "rows-distinct":
2431
- table_title = "Validation Result: Rows Distinct"
2432
- elif check == "col-vals-not-null":
2433
- table_title = "Validation Result: Column Values Not Null"
2434
- elif check == "rows-complete":
2435
- table_title = "Validation Result: Rows Complete"
2436
- elif check == "col-exists":
2437
- table_title = "Validation Result: Column Exists"
2438
- elif check == "col-vals-in-set":
2439
- table_title = "Validation Result: Column Values In Set"
2440
- elif check == "col-vals-gt":
2441
- table_title = "Validation Result: Column Values Greater Than"
2442
- elif check == "col-vals-ge":
2443
- table_title = "Validation Result: Column Values Greater Than Or Equal"
2444
- elif check == "col-vals-lt":
2445
- table_title = "Validation Result: Column Values Less Than"
2446
- elif check == "col-vals-le":
2447
- table_title = "Validation Result: Column Values Less Than Or Equal"
2448
- else:
2449
- table_title = f"Validation Result: {check.replace('-', ' ').title()}"
3348
+ # Look for the 'validation' variable specifically first
3349
+ if "validation" in namespace:
3350
+ validations.append(namespace["validation"])
2450
3351
 
2451
- result_table = Table(
2452
- title=table_title,
2453
- show_header=True,
2454
- header_style="bold magenta",
2455
- box=SIMPLE_HEAD,
2456
- title_style="bold cyan",
2457
- title_justify="left",
2458
- )
2459
- result_table.add_column("Property", style="cyan", no_wrap=True)
2460
- result_table.add_column("Value", style="white")
3352
+ # Also look for any other validation objects
3353
+ for key, value in namespace.items():
3354
+ if (
3355
+ key != "validation"
3356
+ and hasattr(value, "interrogate")
3357
+ and hasattr(value, "validation_info")
3358
+ ):
3359
+ validations.append(value)
3360
+ # Also check if it's a Validate object that has been interrogated
3361
+ elif key != "validation" and str(type(value)).find("Validate") != -1:
3362
+ validations.append(value)
2461
3363
 
2462
- # Add basic info
2463
- result_table.add_row("Data Source", data_source)
2464
- result_table.add_row("Check Type", check)
3364
+ if not validations:
3365
+ raise ValueError(
3366
+ "No validation objects found in script. "
3367
+ "Script should create Validate objects and call .interrogate() on them."
3368
+ )
2465
3369
 
2466
- # Add column info for column-specific checks
2467
- if check in [
2468
- "col-vals-not-null",
2469
- "col-exists",
2470
- "col-vals-in-set",
2471
- "col-vals-gt",
2472
- "col-vals-ge",
2473
- "col-vals-lt",
2474
- "col-vals-le",
2475
- ]:
2476
- result_table.add_row("Column", column)
2477
-
2478
- # Add set info for col-vals-in-set check
2479
- if check == "col-vals-in-set":
2480
- allowed_values = [value.strip() for value in set.split(",")]
2481
- result_table.add_row("Allowed Values", ", ".join(allowed_values))
2482
-
2483
- # Add value info for range checks
2484
- if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]:
2485
- if check == "col-vals-gt":
2486
- operator = ">"
2487
- elif check == "col-vals-ge":
2488
- operator = ">="
2489
- elif check == "col-vals-lt":
2490
- operator = "<"
2491
- elif check == "col-vals-le":
2492
- operator = "<="
2493
- result_table.add_row("Threshold", f"{operator} {value}")
2494
-
2495
- # Get validation details
2496
- if hasattr(validation, "validation_info") and validation.validation_info:
2497
- step_info = validation.validation_info[0] # Should only be one step
2498
- result_table.add_row("Total Rows Tested", f"{step_info.n:,}")
2499
- result_table.add_row("Passing Rows", f"{step_info.n_passed:,}")
2500
- result_table.add_row("Failing Rows", f"{step_info.n_failed:,}")
2501
-
2502
- # Overall result with color coding
2503
- if all_passed:
2504
- result_table.add_row("Result", "[green]✓ PASSED[/green]")
2505
- if check == "rows-distinct":
2506
- result_table.add_row("Duplicate Rows", "[green]None found[/green]")
2507
- elif check == "col-vals-not-null":
2508
- result_table.add_row("Null Values", "[green]None found[/green]")
2509
- elif check == "rows-complete":
2510
- result_table.add_row("Incomplete Rows", "[green]None found[/green]")
2511
- elif check == "col-exists":
2512
- result_table.add_row("Column Status", "[green]Column exists[/green]")
2513
- elif check == "col-vals-in-set":
2514
- result_table.add_row(
2515
- "Values Status", "[green]All values in allowed set[/green]"
2516
- )
2517
- elif check == "col-vals-gt":
2518
- result_table.add_row("Values Status", f"[green]All values > {value}[/green]")
2519
- elif check == "col-vals-ge":
2520
- result_table.add_row("Values Status", f"[green]All values >= {value}[/green]")
2521
- else:
2522
- result_table.add_row("Result", "[red]✗ FAILED[/red]")
2523
- if check == "rows-distinct":
2524
- result_table.add_row(
2525
- "Duplicate Rows", f"[red]{step_info.n_failed:,} found[/red]"
2526
- )
2527
- elif check == "col-vals-not-null":
2528
- result_table.add_row("Null Values", f"[red]{step_info.n_failed:,} found[/red]")
2529
- elif check == "rows-complete":
2530
- result_table.add_row(
2531
- "Incomplete Rows", f"[red]{step_info.n_failed:,} found[/red]"
2532
- )
2533
- elif check == "col-exists":
2534
- result_table.add_row("Column Status", "[red]Column does not exist[/red]")
2535
- elif check == "col-vals-in-set":
2536
- result_table.add_row(
2537
- "Invalid Values", f"[red]{step_info.n_failed:,} found[/red]"
2538
- )
2539
- elif check == "col-vals-gt":
2540
- result_table.add_row(
2541
- "Invalid Values", f"[red]{step_info.n_failed:,} values <= {value}[/red]"
2542
- )
2543
- elif check == "col-vals-ge":
2544
- result_table.add_row(
2545
- "Invalid Values", f"[red]{step_info.n_failed:,} values < {value}[/red]"
2546
- )
3370
+ console.print(f"[green]✓[/green] Found {len(validations)} validation object(s)")
2547
3371
 
2548
- console.print()
2549
- console.print(result_table)
3372
+ # Process each validation
3373
+ overall_failed = False
3374
+ overall_critical = False
3375
+ overall_error = False
3376
+ overall_warning = False
2550
3377
 
2551
- # Show extract if requested and validation failed
2552
- if show_extract and not all_passed:
2553
- console.print()
3378
+ for i, validation in enumerate(validations, 1):
3379
+ if len(validations) > 1:
3380
+ console.print(f"\n[bold cyan]Validation {i}:[/bold cyan]")
2554
3381
 
2555
- # Dynamic message based on check type
2556
- if check == "rows-distinct":
2557
- extract_message = "[yellow]Preview of failing rows (duplicates):[/yellow]"
2558
- row_type = "duplicate rows"
2559
- elif check == "rows-complete":
2560
- extract_message = "[yellow]Preview of failing rows (incomplete rows):[/yellow]"
2561
- row_type = "incomplete rows"
2562
- elif check == "col-exists":
2563
- extract_message = (
2564
- f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
2565
- )
2566
- row_type = "missing column"
2567
- elif check == "col-vals-in-set":
2568
- extract_message = (
2569
- f"[yellow]Preview of failing rows (invalid values in '{column}'):[/yellow]"
2570
- )
2571
- row_type = "rows with invalid values"
2572
- elif check == "col-vals-gt":
2573
- extract_message = (
2574
- f"[yellow]Preview of failing rows (values in '{column}' <= {value}):[/yellow]"
2575
- )
2576
- row_type = f"rows with values <= {value}"
2577
- elif check == "col-vals-ge":
2578
- extract_message = (
2579
- f"[yellow]Preview of failing rows (values in '{column}' < {value}):[/yellow]"
2580
- )
2581
- row_type = f"rows with values < {value}"
2582
- else:
2583
- extract_message = "[yellow]Preview of failing rows:[/yellow]"
2584
- row_type = "failing rows"
3382
+ # Display summary
3383
+ _display_validation_summary(validation)
2585
3384
 
2586
- console.print(extract_message)
3385
+ # Check failure status
3386
+ validation_failed = False
3387
+ has_critical = False
3388
+ has_error = False
3389
+ has_warning = False
2587
3390
 
2588
- # Special handling for col-exists check - no rows to show when column doesn't exist
2589
- if check == "col-exists" and not all_passed:
2590
- console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
2591
- console.print(
2592
- "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
2593
- )
2594
- else:
2595
- try:
2596
- # Get failing rows extract
2597
- failing_rows = validation.get_data_extracts(i=1, frame=True)
3391
+ if hasattr(validation, "validation_info") and validation.validation_info:
3392
+ for step_info in validation.validation_info:
3393
+ if step_info.critical:
3394
+ has_critical = True
3395
+ overall_critical = True
3396
+ if step_info.error:
3397
+ has_error = True
3398
+ overall_error = True
3399
+ if step_info.warning:
3400
+ has_warning = True
3401
+ overall_warning = True
3402
+ if step_info.n_failed > 0:
3403
+ validation_failed = True
3404
+ overall_failed = True
3405
+
3406
+ # Handle extract functionality for failed validations
3407
+ failed_steps = []
3408
+ if (
3409
+ validation_failed
3410
+ and hasattr(validation, "validation_info")
3411
+ and validation.validation_info
3412
+ ):
3413
+ for j, step_info in enumerate(validation.validation_info, 1):
3414
+ if step_info.n_failed > 0:
3415
+ failed_steps.append((j, step_info))
3416
+
3417
+ if validation_failed and failed_steps and (show_extract or write_extract):
3418
+ console.print()
3419
+
3420
+ if show_extract:
3421
+ extract_title = "Extract of failing rows from validation steps"
3422
+ if len(validations) > 1:
3423
+ extract_title += f" (Validation {i})"
3424
+ console.print(f"[yellow]{extract_title}:[/yellow]")
3425
+
3426
+ for step_num, step_info in failed_steps:
3427
+ try:
3428
+ failing_rows = validation.get_data_extracts(i=step_num, frame=True)
3429
+
3430
+ if failing_rows is not None and len(failing_rows) > 0:
3431
+ console.print(
3432
+ f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3433
+ )
2598
3434
 
2599
- if failing_rows is not None and len(failing_rows) > 0:
2600
- # Limit the number of rows shown
2601
- if len(failing_rows) > limit:
2602
- display_rows = failing_rows.head(limit)
3435
+ # Limit the number of rows shown
3436
+ if len(failing_rows) > limit:
3437
+ display_rows = failing_rows.head(limit)
3438
+ console.print(
3439
+ f"[dim]Showing first {limit} of {len(failing_rows)} failing rows[/dim]"
3440
+ )
3441
+ else:
3442
+ display_rows = failing_rows
3443
+ console.print(
3444
+ f"[dim]Showing all {len(failing_rows)} failing rows[/dim]"
3445
+ )
3446
+
3447
+ # Create a preview table using pointblank's preview function
3448
+ preview_table = pb.preview(
3449
+ data=display_rows,
3450
+ n_head=min(limit, len(display_rows)),
3451
+ n_tail=0,
3452
+ limit=limit,
3453
+ show_row_numbers=True,
3454
+ )
3455
+
3456
+ # Display using our Rich table function
3457
+ _rich_print_gt_table(preview_table, show_summary=False)
3458
+ else:
3459
+ console.print(
3460
+ f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3461
+ )
3462
+ console.print("[yellow]No failing rows could be extracted[/yellow]")
3463
+ except Exception as e:
2603
3464
  console.print(
2604
- f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
3465
+ f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3466
+ )
3467
+ console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
3468
+
3469
+ if write_extract:
3470
+ try:
3471
+ folder_name = write_extract
3472
+
3473
+ # Add validation number if multiple validations
3474
+ if len(validations) > 1:
3475
+ folder_name = f"{folder_name}_validation_{i}"
3476
+
3477
+ # Create the output folder
3478
+ output_folder = Path(folder_name)
3479
+ output_folder.mkdir(parents=True, exist_ok=True)
3480
+
3481
+ saved_files = []
3482
+
3483
+ # Save each failing step to its own CSV file
3484
+ for step_num, step_info in failed_steps:
3485
+ try:
3486
+ failing_rows = validation.get_data_extracts(i=step_num, frame=True)
3487
+ if failing_rows is not None and len(failing_rows) > 0:
3488
+ # Create safe filename from assertion type
3489
+ safe_assertion_type = (
3490
+ step_info.assertion_type.replace(" ", "_")
3491
+ .replace("/", "_")
3492
+ .replace("\\", "_")
3493
+ .replace(":", "_")
3494
+ .replace("<", "_")
3495
+ .replace(">", "_")
3496
+ .replace("|", "_")
3497
+ .replace("?", "_")
3498
+ .replace("*", "_")
3499
+ .replace('"', "_")
3500
+ )
3501
+
3502
+ filename = f"step_{step_num:02d}_{safe_assertion_type}.csv"
3503
+ filepath = output_folder / filename
3504
+
3505
+ # Limit the output if needed
3506
+ save_rows = failing_rows
3507
+ if hasattr(failing_rows, "head") and len(failing_rows) > limit:
3508
+ save_rows = failing_rows.head(limit)
3509
+
3510
+ # Save to CSV
3511
+ if hasattr(save_rows, "write_csv"):
3512
+ # Polars
3513
+ save_rows.write_csv(str(filepath))
3514
+ elif hasattr(save_rows, "to_csv"):
3515
+ # Pandas
3516
+ save_rows.to_csv(str(filepath), index=False)
3517
+ else:
3518
+ # Try converting to pandas as fallback
3519
+ import pandas as pd
3520
+
3521
+ pd_data = pd.DataFrame(save_rows)
3522
+ pd_data.to_csv(str(filepath), index=False)
3523
+
3524
+ saved_files.append((filename, len(failing_rows)))
3525
+
3526
+ except Exception as e:
3527
+ console.print(
3528
+ f"[yellow]Warning: Could not save failing rows from step {step_num}: {e}[/yellow]"
3529
+ )
3530
+
3531
+ if saved_files:
3532
+ console.print(
3533
+ f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
2605
3534
  )
3535
+ for filename, row_count in saved_files:
3536
+ console.print(f"[dim] - {filename}: {row_count} rows[/dim]")
2606
3537
  else:
2607
- display_rows = failing_rows
2608
- console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")
3538
+ console.print(
3539
+ "[yellow]No failing rows could be extracted to save[/yellow]"
3540
+ )
2609
3541
 
2610
- # Create a preview table using pointblank's preview function
2611
- preview_table = pb.preview(
2612
- data=display_rows,
2613
- n_head=min(limit, len(display_rows)),
2614
- n_tail=0,
2615
- limit=limit,
2616
- show_row_numbers=True,
3542
+ except Exception as e:
3543
+ console.print(
3544
+ f"[yellow]Warning: Could not save failing rows to CSV: {e}[/yellow]"
2617
3545
  )
2618
3546
 
2619
- # Display using our Rich table function
2620
- _rich_print_gt_table(preview_table)
2621
- else:
2622
- console.print("[yellow]No failing rows could be extracted[/yellow]")
2623
- except Exception as e:
2624
- console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
3547
+ # Save HTML and JSON outputs (combine multiple validations if needed)
3548
+ if output_html:
3549
+ try:
3550
+ if len(validations) == 1:
3551
+ # Single validation - save directly
3552
+ html_content = validations[0]._repr_html_()
3553
+ Path(output_html).write_text(html_content, encoding="utf-8")
3554
+ else:
3555
+ # Multiple validations - combine them
3556
+ html_parts = []
3557
+ html_parts.append("<html><body>")
3558
+ html_parts.append("<h1>Pointblank Validation Report</h1>")
2625
3559
 
2626
- # Summary message
2627
- console.print()
2628
- if all_passed:
2629
- if check == "rows-distinct":
2630
- success_message = (
2631
- f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
2632
- )
2633
- elif check == "col-vals-not-null":
2634
- success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
2635
- elif check == "rows-complete":
2636
- success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
2637
- elif check == "col-exists":
2638
- success_message = (
2639
- f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
2640
- )
2641
- elif check == "col-vals-in-set":
2642
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
2643
- elif check == "col-vals-gt":
2644
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
2645
- elif check == "col-vals-ge":
2646
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
2647
- elif check == "col-vals-lt":
2648
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
2649
- elif check == "col-vals-le":
2650
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
2651
- else:
2652
- success_message = (
2653
- f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
2654
- )
3560
+ for i, validation in enumerate(validations, 1):
3561
+ html_parts.append(f"<h2>Validation {i}</h2>")
3562
+ html_parts.append(validation._repr_html_())
2655
3563
 
2656
- console.print(
2657
- Panel(
2658
- success_message,
2659
- border_style="green",
2660
- )
2661
- )
2662
- else:
2663
- if hasattr(validation, "validation_info") and validation.validation_info:
2664
- step_info = validation.validation_info[0]
3564
+ html_parts.append("</body></html>")
3565
+ html_content = "\n".join(html_parts)
3566
+ Path(output_html).write_text(html_content, encoding="utf-8")
2665
3567
 
2666
- if check == "rows-distinct":
2667
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
2668
- elif check == "col-vals-not-null":
2669
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
2670
- elif check == "rows-complete":
2671
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
2672
- elif check == "col-exists":
2673
- failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
2674
- elif check == "col-vals-in-set":
2675
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
2676
- elif check == "col-vals-gt":
2677
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
2678
- elif check == "col-vals-ge":
2679
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
2680
- elif check == "col-vals-lt":
2681
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
2682
- elif check == "col-vals-le":
2683
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
3568
+ console.print(f"[green]✓[/green] HTML report saved to: {output_html}")
3569
+ except Exception as e:
3570
+ console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
3571
+
3572
+ if output_json:
3573
+ try:
3574
+ if len(validations) == 1:
3575
+ # Single validation - save directly
3576
+ json_report = validations[0].get_json_report()
3577
+ Path(output_json).write_text(json_report, encoding="utf-8")
2684
3578
  else:
2685
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
3579
+ # Multiple validations - combine them
3580
+ import json
2686
3581
 
2687
- # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
2688
- if not show_extract and check != "col-exists":
2689
- failure_message += (
2690
- "\n[dim]💡 Tip: Use --show-extract to see the failing rows[/dim]"
2691
- )
3582
+ combined_report = {"validations": []}
2692
3583
 
2693
- console.print(
2694
- Panel(
2695
- failure_message,
2696
- border_style="red",
2697
- )
2698
- )
2699
- else:
2700
- if check == "rows-distinct":
2701
- failure_message = (
2702
- f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
2703
- )
2704
- elif check == "rows-complete":
2705
- failure_message = (
2706
- f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
2707
- )
2708
- else:
2709
- failure_message = (
2710
- f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
2711
- )
3584
+ for i, validation in enumerate(validations, 1):
3585
+ validation_json = json.loads(validation.get_json_report())
3586
+ validation_json["validation_id"] = i
3587
+ combined_report["validations"].append(validation_json)
2712
3588
 
2713
- # Add hint about --show-extract if not already used
2714
- if not show_extract:
2715
- failure_message += (
2716
- "\n[dim]💡 Tip: Use --show-extract to see the failing rows[/dim]"
3589
+ Path(output_json).write_text(
3590
+ json.dumps(combined_report, indent=2), encoding="utf-8"
2717
3591
  )
2718
3592
 
2719
- console.print(
2720
- Panel(
2721
- failure_message,
2722
- border_style="red",
2723
- )
2724
- )
3593
+ console.print(f"[green]✓[/green] JSON summary saved to: {output_json}")
3594
+ except Exception as e:
3595
+ console.print(f"[yellow]Warning: Could not save JSON report: {e}[/yellow]")
2725
3596
 
2726
- # Exit with appropriate code if requested
2727
- if exit_code and not all_passed:
2728
- console.print("[dim]Exiting with non-zero code due to validation failure[/dim]")
2729
- sys.exit(1)
3597
+ # Check if we should fail based on threshold
3598
+ if fail_on:
3599
+ should_exit = False
3600
+ exit_reason = ""
3601
+
3602
+ if fail_on.lower() == "critical" and overall_critical:
3603
+ should_exit = True
3604
+ exit_reason = "critical validation failures"
3605
+ elif fail_on.lower() == "error" and (overall_critical or overall_error):
3606
+ should_exit = True
3607
+ exit_reason = "error or critical validation failures"
3608
+ elif fail_on.lower() == "warning" and (
3609
+ overall_critical or overall_error or overall_warning
3610
+ ):
3611
+ should_exit = True
3612
+ exit_reason = "warning, error, or critical validation failures"
3613
+ elif fail_on.lower() == "any" and overall_failed:
3614
+ should_exit = True
3615
+ exit_reason = "validation failures"
3616
+
3617
+ if should_exit:
3618
+ console.print(f"[red]Exiting with error due to {exit_reason}[/red]")
3619
+ sys.exit(1)
2730
3620
 
2731
3621
  except Exception as e:
2732
3622
  console.print(f"[red]Error:[/red] {e}")
2733
3623
  sys.exit(1)
2734
3624
 
2735
3625
 
2736
- if __name__ == "__main__": # pragma: no cover
2737
- cli()
3626
+ def _format_missing_percentage(value: float) -> str:
3627
+ """Format missing value percentages for display.
3628
+
3629
+ Args:
3630
+ value: The percentage value (0-100)
3631
+
3632
+ Returns:
3633
+ Formatted string with proper percentage display
3634
+ """
3635
+ if value == 0.0:
3636
+ return "[green]●[/green]" # Large green circle for no missing values
3637
+ elif value == 100.0:
3638
+ return "[red]●[/red]" # Large red circle for completely missing values
3639
+ elif value < 1.0 and value > 0:
3640
+ return "<1%" # Less than 1%
3641
+ elif value > 99.0 and value < 100.0:
3642
+ return ">99%" # More than 99%
3643
+ else:
3644
+ return f"{int(round(value))}%" # Round to nearest integer with % sign