pointblank 0.10.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/cli.py ADDED
@@ -0,0 +1,3644 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import click
8
+ from rich.console import Console
9
+ from rich.panel import Panel
10
+ from rich.table import Table
11
+
12
+ import pointblank as pb
13
+ from pointblank._utils import _get_tbl_type, _is_lib_present
14
+
15
+ console = Console()
16
+
17
+
18
+ class OrderedGroup(click.Group):
19
+ """A Click Group that displays commands in a custom order."""
20
+
21
+ def list_commands(self, ctx):
22
+ """Return commands in the desired logical order."""
23
+ # Define the desired order
24
+ desired_order = [
25
+ # Data Discovery/Exploration
26
+ "info",
27
+ "preview",
28
+ "scan",
29
+ "missing",
30
+ # Validation
31
+ "validate",
32
+ "run",
33
+ "make-template",
34
+ # Utilities
35
+ "datasets",
36
+ "requirements",
37
+ ]
38
+
39
+ # Get all available commands
40
+ available_commands = super().list_commands(ctx)
41
+
42
+ # Return commands in desired order, followed by any not in the list
43
+ ordered = []
44
+ for cmd in desired_order:
45
+ if cmd in available_commands:
46
+ ordered.append(cmd)
47
+
48
+ # Add any commands not in our desired order (safety fallback)
49
+ for cmd in available_commands:
50
+ if cmd not in ordered:
51
+ ordered.append(cmd)
52
+
53
+ return ordered
54
+
55
+
56
+ def _load_data_source(data_source: str) -> Any:
57
+ """
58
+ Centralized data loading function for CLI that handles all supported data source types.
59
+
60
+ This function provides a consistent way to load data across all CLI commands by leveraging
61
+ the _process_data() utility function and adding support for pointblank dataset names.
62
+
63
+ Parameters
64
+ ----------
65
+ data_source : str
66
+ The data source which could be:
67
+ - A pointblank dataset name (small_table, game_revenue, nycflights, global_sales)
68
+ - A GitHub URL pointing to a CSV or Parquet file
69
+ - A database connection string (e.g., "duckdb:///path/to/file.ddb::table_name")
70
+ - A CSV file path (string or Path object with .csv extension)
71
+ - A Parquet file path, glob pattern, directory, or partitioned dataset
72
+
73
+ Returns
74
+ -------
75
+ Any
76
+ Loaded data as a DataFrame or other data object
77
+
78
+ Raises
79
+ ------
80
+ ValueError
81
+ If the pointblank dataset name is not recognized
82
+ """
83
+ # Check if it's a pointblank dataset name first
84
+ if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
85
+ return pb.load_dataset(data_source)
86
+
87
+ # Otherwise, use the centralized _process_data() function for all other data sources
88
+ from pointblank.validate import _process_data
89
+
90
+ return _process_data(data_source)
91
+
92
+
93
+ def _format_cell_value(
94
+ value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
95
+ ) -> str:
96
+ """Format a cell value for Rich table display, highlighting None/NA values in red.
97
+
98
+ Args:
99
+ value: The raw cell value from the dataframe
100
+ is_row_number: Whether this is a row number column value
101
+ max_width: Maximum character width for text truncation
102
+ num_columns: Number of columns in the table (affects truncation aggressiveness)
103
+
104
+ Returns:
105
+ Formatted string with Rich markup for None/NA values or row numbers
106
+ """
107
+ # Special formatting for row numbers: never truncate them
108
+ if is_row_number:
109
+ return f"[dim]{value}[/dim]"
110
+
111
+ # Check for actual None/null values (not string representations)
112
+ if value is None:
113
+ return "[red]None[/red]"
114
+
115
+ # Check for pandas/numpy specific NA values
116
+ try:
117
+ import numpy as np
118
+ import pandas as pd
119
+
120
+ # Check for pandas NA
121
+ if pd.isna(value):
122
+ # If it's specifically numpy.nan, show as NaN
123
+ if isinstance(value, float) and np.isnan(value):
124
+ return "[red]NaN[/red]"
125
+ # If it's pandas NA, show as NA
126
+ elif str(type(value)).find("pandas") != -1:
127
+ return "[red]NA[/red]"
128
+ # Generic NA for other pandas missing values
129
+ else:
130
+ return "[red]NA[/red]"
131
+
132
+ except (ImportError, TypeError, ValueError): # pragma: no cover
133
+ # If pandas/numpy not available, value not compatible, or ambiguous array
134
+ pass
135
+
136
+ # Check for empty strings (but only actual empty strings, not whitespace)
137
+ if isinstance(value, str) and value == "":
138
+ return "[red][/red]" # Empty string shown as red empty space
139
+
140
+ # Convert to string and apply intelligent truncation
141
+ str_value = str(value)
142
+
143
+ # Adjust max_width based on number of columns to prevent overly wide tables
144
+ if num_columns > 15:
145
+ adjusted_max_width = min(max_width, 30) # Be more aggressive with many columns
146
+ elif num_columns > 10:
147
+ adjusted_max_width = min(max_width, 40)
148
+ else:
149
+ adjusted_max_width = max_width
150
+
151
+ # Apply truncation if the string is too long
152
+ if len(str_value) > adjusted_max_width:
153
+ # For very long text, truncate more aggressively
154
+ if len(str_value) > adjusted_max_width * 2:
155
+ # For extremely long text, use a shorter truncation
156
+ truncated = str_value[: adjusted_max_width // 2] + "…"
157
+ else:
158
+ # For moderately long text, use a more generous truncation
159
+ truncated = str_value[: adjusted_max_width - 1] + "…"
160
+
161
+ return truncated
162
+
163
+ return str_value
164
+
165
+
166
+ def _get_column_dtypes(df: Any, columns: list[str]) -> dict[str, str]:
167
+ """Extract data types for columns and format them in a compact way.
168
+
169
+ Args:
170
+ df: The dataframe object
171
+ columns: List of column names
172
+
173
+ Returns:
174
+ Dictionary mapping column names to formatted data type strings
175
+ """
176
+ dtypes_dict = {}
177
+
178
+ try:
179
+ if hasattr(df, "dtypes"):
180
+ # Polars/Pandas style
181
+ if hasattr(df.dtypes, "to_dict"):
182
+ # Polars DataFrame dtypes
183
+ raw_dtypes = df.dtypes.to_dict() if hasattr(df.dtypes, "to_dict") else {}
184
+ for col in columns:
185
+ if col in raw_dtypes:
186
+ dtype_str = str(raw_dtypes[col])
187
+ # Convert to compact format similar to Polars glimpse()
188
+ dtypes_dict[col] = _format_dtype_compact(dtype_str)
189
+ else:
190
+ dtypes_dict[col] = "?"
191
+ else:
192
+ # Pandas DataFrame dtypes (Series-like)
193
+ for i, col in enumerate(columns):
194
+ if i < len(df.dtypes):
195
+ dtype_str = str(
196
+ df.dtypes.iloc[i] if hasattr(df.dtypes, "iloc") else df.dtypes[i]
197
+ )
198
+ dtypes_dict[col] = _format_dtype_compact(dtype_str)
199
+ else:
200
+ dtypes_dict[col] = "?"
201
+ elif hasattr(df, "schema"):
202
+ # Other schema-based systems (e.g., Ibis)
203
+ schema = df.schema
204
+ if hasattr(schema, "to_dict"): # pragma: no cover
205
+ raw_dtypes = schema.to_dict()
206
+ for col in columns:
207
+ if col in raw_dtypes:
208
+ dtypes_dict[col] = _format_dtype_compact(str(raw_dtypes[col]))
209
+ else: # pragma: no cover
210
+ dtypes_dict[col] = "?"
211
+ else: # pragma: no cover
212
+ for col in columns:
213
+ try:
214
+ dtype_str = str(getattr(schema, col, "Unknown"))
215
+ dtypes_dict[col] = _format_dtype_compact(dtype_str)
216
+ except Exception: # pragma: no cover
217
+ dtypes_dict[col] = "?"
218
+ else:
219
+ # Fallback: no type information available
220
+ for col in columns:
221
+ dtypes_dict[col] = "?"
222
+
223
+ except Exception: # pragma: no cover
224
+ # If any error occurs, fall back to unknown types
225
+ for col in columns:
226
+ dtypes_dict[col] = "?"
227
+
228
+ return dtypes_dict
229
+
230
+
231
+ def _format_dtype_compact(dtype_str: str) -> str:
232
+ """Format a data type string to a compact representation.
233
+
234
+ Args:
235
+ dtype_str: The raw data type string
236
+
237
+ Returns:
238
+ Compact formatted data type string
239
+ """
240
+ # Remove common prefixes and make compact
241
+ dtype_str = dtype_str.lower()
242
+
243
+ # Polars types
244
+ if "utf8" in dtype_str or "string" in dtype_str:
245
+ return "str"
246
+ elif "int64" in dtype_str:
247
+ return "i64"
248
+ elif "int32" in dtype_str:
249
+ return "i32"
250
+ elif "float64" in dtype_str:
251
+ return "f64"
252
+ elif "float32" in dtype_str:
253
+ return "f32"
254
+ elif "boolean" in dtype_str or "bool" in dtype_str:
255
+ return "bool"
256
+ elif "datetime" in dtype_str:
257
+ return "datetime"
258
+ elif "date" in dtype_str and "datetime" not in dtype_str:
259
+ return "date"
260
+ elif "time" in dtype_str:
261
+ return "time"
262
+
263
+ # Pandas types
264
+ elif "object" in dtype_str:
265
+ return "obj"
266
+ elif "category" in dtype_str:
267
+ return "cat"
268
+
269
+ # Generic fallbacks
270
+ elif "int" in dtype_str:
271
+ return "int"
272
+ elif "float" in dtype_str:
273
+ return "float"
274
+ elif "str" in dtype_str:
275
+ return "str"
276
+
277
+ # Unknown or complex types - truncate if too long
278
+ elif len(dtype_str) > 8:
279
+ return dtype_str[:8] + "…"
280
+ else:
281
+ return dtype_str
282
+
283
+
284
+ def _rich_print_scan_table(
285
+ scan_result: Any,
286
+ data_source: str,
287
+ source_type: str,
288
+ table_type: str,
289
+ total_rows: int | None = None,
290
+ total_columns: int | None = None,
291
+ ) -> None:
292
+ """
293
+ Display scan results as a Rich table in the terminal with statistical measures.
294
+
295
+ Args:
296
+ scan_result: The GT object from col_summary_tbl()
297
+ data_source: Name of the data source being scanned
298
+ source_type: Type of data source (e.g., "Pointblank dataset: small_table")
299
+ table_type: Type of table (e.g., "polars.LazyFrame")
300
+ total_rows: Total number of rows in the dataset
301
+ total_columns: Total number of columns in the dataset
302
+ """
303
+ try:
304
+ import re
305
+
306
+ import narwhals as nw
307
+ from rich.box import SIMPLE_HEAD
308
+
309
+ # Extract the underlying DataFrame from the GT object
310
+ # The GT object has a _tbl_data attribute that contains the DataFrame
311
+ gt_data = scan_result._tbl_data
312
+
313
+ # Convert to Narwhals DataFrame for consistent handling
314
+ nw_data = nw.from_native(gt_data)
315
+
316
+ # Convert to dictionary for easier access
317
+ data_dict = nw_data.to_dict(as_series=False)
318
+
319
+ # Create main scan table with missing data table styling
320
+ # Create a comprehensive title with data source, source type, and table type
321
+ title_text = f"Column Summary / {source_type} / {table_type}"
322
+
323
+ # Add dimensions subtitle in gray if available
324
+ if total_rows is not None and total_columns is not None:
325
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
326
+
327
+ scan_table = Table(
328
+ title=title_text,
329
+ show_header=True,
330
+ header_style="bold magenta",
331
+ box=SIMPLE_HEAD,
332
+ title_style="bold cyan",
333
+ title_justify="left",
334
+ )
335
+
336
+ # Add columns with specific styling and appropriate widths
337
+ scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
338
+ scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
339
+ scan_table.add_column(
340
+ "NA", style="red", width=6, justify="right"
341
+ ) # Adjusted for better formatting
342
+ scan_table.add_column(
343
+ "UQ", style="green", width=8, justify="right"
344
+ ) # Adjusted for boolean values
345
+
346
+ # Add statistical columns if they exist with appropriate widths
347
+ stat_columns = []
348
+ column_mapping = {
349
+ "mean": ("Mean", "blue", 9),
350
+ "std": ("SD", "blue", 9),
351
+ "min": ("Min", "yellow", 9),
352
+ "median": ("Med", "yellow", 9),
353
+ "max": ("Max", "yellow", 9),
354
+ "q_1": ("Q₁", "magenta", 8),
355
+ "q_3": ("Q₃", "magenta", 9),
356
+ "iqr": ("IQR", "magenta", 8),
357
+ }
358
+
359
+ for col_key, (display_name, color, width) in column_mapping.items():
360
+ if col_key in data_dict:
361
+ scan_table.add_column(display_name, style=color, width=width, justify="right")
362
+ stat_columns.append(col_key)
363
+
364
+ # Helper function to extract column name and type from HTML
365
+ def extract_column_info(html_content: str) -> tuple[str, str]:
366
+ """Extract column name and type from HTML formatted content."""
367
+ # Extract column name from first div
368
+ name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
369
+ column_name = name_match.group(1) if name_match else "Unknown"
370
+
371
+ # Extract data type from second div (with gray color)
372
+ type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
373
+ if type_match:
374
+ data_type = type_match.group(1)
375
+ # Convert to compact format using the existing function
376
+ compact_type = _format_dtype_compact(data_type)
377
+ data_type = compact_type
378
+ else:
379
+ data_type = "unknown"
380
+
381
+ return column_name, data_type
382
+
383
+ # Helper function to format values with improved number formatting
384
+ def format_value(
385
+ value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
386
+ ) -> str:
387
+ """Format values for display with smart number formatting and HTML cleanup."""
388
+ if value is None or (isinstance(value, str) and value.strip() == ""):
389
+ return "[dim]—[/dim]"
390
+
391
+ # Handle missing values indicator
392
+ if is_missing and str(value) == "0":
393
+ return "[green]●[/green]" # No missing values
394
+
395
+ # Clean up HTML formatting from the raw data
396
+ str_val = str(value)
397
+
398
+ # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
399
+ if "<br>" in str_val:
400
+ str_val = str_val.split("<br>")[0].strip()
401
+ # For unique values, we want just the integer part
402
+ if is_unique:
403
+ try:
404
+ # Try to extract just the integer part for unique counts
405
+ num_val = float(str_val)
406
+ return str(int(num_val))
407
+ except (ValueError, TypeError):
408
+ pass
409
+
410
+ # Now handle HTML content (especially from boolean unique values)
411
+ if "<" in str_val and ">" in str_val:
412
+ # Remove HTML tags completely for cleaner display
413
+ str_val = re.sub(r"<[^>]+>", "", str_val).strip()
414
+ # Clean up extra whitespace
415
+ str_val = re.sub(r"\s+", " ", str_val).strip()
416
+
417
+ # Handle values like "2<.01" - extract the first number
418
+ if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
419
+ # Extract number before the < symbol
420
+ before_lt = str_val.split("<")[0].strip()
421
+ if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
422
+ str_val = before_lt
423
+
424
+ # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
425
+ if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
426
+ # Extract T and F values
427
+ t_match = re.search(r"T(\d+\.\d+)", str_val)
428
+ f_match = re.search(r"F(\d+\.\d+)", str_val)
429
+ if t_match and f_match:
430
+ t_val = float(t_match.group(1))
431
+ f_val = float(f_match.group(1))
432
+ # Show as "T0.62F0.38" but truncated if needed
433
+ formatted = f"T{t_val:.2f}F{f_val:.2f}"
434
+ if len(formatted) > max_width:
435
+ # Truncate to fit, showing dominant value
436
+ if t_val > f_val:
437
+ return f"T{t_val:.1f}"
438
+ else:
439
+ return f"F{f_val:.1f}"
440
+ return formatted
441
+
442
+ # Try to parse as a number for better formatting
443
+ try:
444
+ # Try to convert to float first
445
+ num_val = float(str_val)
446
+
447
+ # Handle special cases
448
+ if num_val == 0:
449
+ return "0"
450
+ elif abs(num_val) == int(abs(num_val)) and abs(num_val) < 10000:
451
+ # Simple integers under 10000
452
+ return str(int(num_val))
453
+ elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
454
+ # Likely dates in YYYYMMDD format - format as date-like
455
+ int_val = int(num_val)
456
+ if 19000101 <= int_val <= 29991231: # Reasonable date range
457
+ str_date = str(int_val)
458
+ if len(str_date) == 8:
459
+ return (
460
+ f"{str_date[:4]}-{str_date[4:6]}-{str_date[6:]}"[: max_width - 1]
461
+ + "…"
462
+ )
463
+ # Otherwise treat as large number
464
+ return f"{num_val / 1000000:.1f}M"
465
+ elif abs(num_val) >= 1000000:
466
+ # Large numbers - use scientific notation or M/k notation
467
+
468
+ if abs(num_val) >= 1000000000:
469
+ return f"{num_val:.1e}"
470
+ else:
471
+ return f"{num_val / 1000000:.1f}M"
472
+ elif abs(num_val) >= 10000:
473
+ # Numbers >= 10k - use compact notation
474
+ return f"{num_val / 1000:.1f}k"
475
+ elif abs(num_val) >= 100:
476
+ # Numbers 100-9999 - show with minimal decimals
477
+ return f"{num_val:.1f}"
478
+ elif abs(num_val) >= 10:
479
+ # Numbers 10-99 - show with one decimal
480
+ return f"{num_val:.1f}"
481
+ elif abs(num_val) >= 1:
482
+ # Numbers 1-9 - show with two decimals
483
+ return f"{num_val:.2f}"
484
+ elif abs(num_val) >= 0.01:
485
+ # Small numbers - show with appropriate precision
486
+ return f"{num_val:.2f}"
487
+ else:
488
+ # Very small numbers - use scientific notation
489
+
490
+ return f"{num_val:.1e}"
491
+
492
+ except (ValueError, TypeError):
493
+ # Not a number, handle as string
494
+ pass
495
+
496
+ # Handle date/datetime strings - show abbreviated format
497
+ if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
498
+ # Likely a date/datetime, show abbreviated
499
+ if len(str_val) > max_width:
500
+ return str_val[: max_width - 1] + "…"
501
+
502
+ # General string truncation with ellipsis
503
+ if len(str_val) > max_width:
504
+ return str_val[: max_width - 1] + "…"
505
+
506
+ return str_val
507
+
508
+ # Populate table rows
509
+ num_rows = len(data_dict["colname"])
510
+ for i in range(num_rows):
511
+ row_data = []
512
+
513
+ # Column name and type from HTML content
514
+ colname_html = data_dict["colname"][i]
515
+ column_name, data_type = extract_column_info(colname_html)
516
+ row_data.append(column_name)
517
+ row_data.append(data_type)
518
+
519
+ # Missing values (NA)
520
+ missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
521
+ row_data.append(format_value(missing_val, is_missing=True, max_width=6))
522
+
523
+ # Unique values (UQ)
524
+ unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
525
+ row_data.append(format_value(unique_val, is_unique=True, max_width=8))
526
+
527
+ # Statistical columns
528
+ for stat_col in stat_columns:
529
+ stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
530
+ # Use appropriate width based on column type
531
+ if stat_col in ["q_1", "iqr"]:
532
+ width = 8
533
+ elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
534
+ width = 9
535
+ else:
536
+ width = 8
537
+ row_data.append(format_value(stat_val, max_width=width))
538
+
539
+ scan_table.add_row(*row_data)
540
+
541
+ # Display the results
542
+ console.print()
543
+ console.print(scan_table)
544
+
545
+ except Exception as e:
546
+ # Fallback to simple message if table creation fails
547
+ console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
548
+ console.print(f"[red]Error displaying table: {str(e)}[/red]")
549
+
550
+
551
+ def _rich_print_gt_table(
552
+ gt_table: Any, preview_info: dict | None = None, show_summary: bool = True
553
+ ) -> None:
554
+ """Convert a GT table to Rich table and display it in the terminal.
555
+
556
+ Args:
557
+ gt_table: The GT table object to display
558
+ preview_info: Optional dict with preview context info:
559
+ - total_rows: Total rows in the dataset
560
+ - head_rows: Number of head rows shown
561
+ - tail_rows: Number of tail rows shown
562
+ - is_complete: Whether the entire dataset is shown
563
+ show_summary: Whether to show the row count summary at the bottom
564
+ """
565
+ try:
566
+ # Try to extract the underlying data from the GT table
567
+ df = None
568
+
569
+ # Great Tables stores the original data in different places depending on how it was created
570
+ # Let's try multiple approaches to get the data
571
+ if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
572
+ df = gt_table._tbl_data
573
+ elif (
574
+ hasattr(gt_table, "_body")
575
+ and hasattr(gt_table._body, "body")
576
+ and gt_table._body.body is not None
577
+ ):
578
+ df = gt_table._body.body
579
+ elif hasattr(gt_table, "_data") and gt_table._data is not None:
580
+ df = gt_table._data
581
+ elif hasattr(gt_table, "data") and gt_table.data is not None:
582
+ df = gt_table.data
583
+
584
+ if df is not None:
585
+ # Create a Rich table with horizontal lines
586
+ from rich.box import SIMPLE_HEAD
587
+
588
+ # Create enhanced title if preview_info contains metadata
589
+ table_title = None
590
+ if preview_info and "source_type" in preview_info and "table_type" in preview_info:
591
+ source_type = preview_info["source_type"]
592
+ table_type = preview_info["table_type"]
593
+ table_title = f"Data Preview / {source_type} / {table_type}"
594
+
595
+ rich_table = Table(
596
+ title=table_title,
597
+ show_header=True,
598
+ header_style="bold magenta",
599
+ box=SIMPLE_HEAD,
600
+ title_style="bold cyan",
601
+ title_justify="left",
602
+ )
603
+
604
+ # Get column names
605
+ columns = []
606
+ if hasattr(df, "columns"):
607
+ columns = list(df.columns)
608
+ elif hasattr(df, "schema"): # pragma: no cover
609
+ columns = list(df.schema.names)
610
+ elif hasattr(df, "column_names"): # pragma: no cover
611
+ columns = list(df.column_names)
612
+
613
+ if not columns: # pragma: no cover
614
+ # Fallback: try to determine columns from first row
615
+ try:
616
+ if hasattr(df, "to_dicts") and len(df) > 0:
617
+ first_dict = df.to_dicts()[0]
618
+ columns = list(first_dict.keys())
619
+ elif hasattr(df, "to_dict") and len(df) > 0:
620
+ first_dict = df.to_dict("records")[0]
621
+ columns = list(first_dict.keys())
622
+ except Exception: # pragma: no cover
623
+ columns = [f"Column {i + 1}" for i in range(10)] # Default fallback
624
+
625
+ # Add columns to Rich table
626
+ # Handle wide tables by limiting columns displayed
627
+ max_terminal_cols = 15 # Reasonable limit for terminal display
628
+
629
+ # Get terminal width to adjust column behavior
630
+ try:
631
+ terminal_width = console.size.width
632
+ # Estimate max column width based on terminal size and number of columns
633
+ if len(columns) <= 5:
634
+ max_col_width = min(60, terminal_width // 4)
635
+ elif len(columns) <= 10:
636
+ max_col_width = min(40, terminal_width // 6)
637
+ else:
638
+ max_col_width = min(30, terminal_width // 8)
639
+ except Exception: # pragma: no cover
640
+ # Fallback if we can't get terminal width
641
+ max_col_width = 40 if len(columns) <= 10 else 25
642
+
643
+ if len(columns) > max_terminal_cols:
644
+ # For wide tables, show first few, middle indicator, and last few columns
645
+ first_cols = 7
646
+ last_cols = 7
647
+
648
+ display_columns = columns[:first_cols] + ["...more..."] + columns[-last_cols:]
649
+
650
+ console.print(
651
+ f"\n[yellow]⚠ Table has {len(columns)} columns. Showing first {first_cols} and last {last_cols} columns.[/yellow]"
652
+ )
653
+ console.print("[dim]Use --columns to specify which columns to display.[/dim]")
654
+ console.print(
655
+ f"[dim]Full column list: {', '.join(columns[:5])}...{', '.join(columns[-5:])}[/dim]\n"
656
+ )
657
+ else:
658
+ display_columns = columns
659
+
660
+ # Get data types for columns
661
+ dtypes_dict = _get_column_dtypes(df, columns)
662
+
663
+ # Calculate row number column width if needed
664
+ row_num_width = 6 # Default width
665
+ if "_row_num_" in columns:
666
+ try:
667
+ # Get the maximum row number to calculate appropriate width
668
+ if hasattr(df, "to_dicts"):
669
+ data_dict = df.to_dicts()
670
+ if data_dict:
671
+ row_nums = [row.get("_row_num_", 0) for row in data_dict]
672
+ max_row_num = max(row_nums) if row_nums else 0
673
+ row_num_width = max(len(str(max_row_num)) + 1, 6) # +1 for padding
674
+ elif hasattr(df, "to_dict"):
675
+ data_dict = df.to_dict("records")
676
+ if data_dict:
677
+ row_nums = [row.get("_row_num_", 0) for row in data_dict]
678
+ max_row_num = max(row_nums) if row_nums else 0
679
+ row_num_width = max(len(str(max_row_num)) + 1, 6) # +1 for padding
680
+ except Exception: # pragma: no cover
681
+ # If we can't determine max row number, use default
682
+ row_num_width = 8 # Slightly larger default for safety
683
+
684
+ for i, col in enumerate(display_columns):
685
+ if col == "...more...":
686
+ # Add a special indicator column
687
+ rich_table.add_column("···", style="dim", width=3, no_wrap=True)
688
+ else:
689
+ # Handle row number column specially
690
+ if col == "_row_num_":
691
+ # Row numbers get no header, right alignment, and dim gray style
692
+ # Use dynamic width to prevent truncation
693
+ rich_table.add_column(
694
+ "", style="dim", justify="right", no_wrap=True, width=row_num_width
695
+ )
696
+ else:
697
+ display_col = str(col)
698
+
699
+ # Get data type for this column (if available)
700
+ if col in dtypes_dict:
701
+ dtype_display = f"<{dtypes_dict[col]}>"
702
+ # Create header with column name and data type
703
+ header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
704
+ else:
705
+ header_text = display_col
706
+
707
+ rich_table.add_column(
708
+ header_text,
709
+ style="cyan",
710
+ no_wrap=False,
711
+ overflow="ellipsis",
712
+ max_width=max_col_width,
713
+ )
714
+
715
+ # Convert data to list of rows
716
+ rows = []
717
+ try:
718
+ if hasattr(df, "to_dicts"):
719
+ # Polars interface
720
+ data_dict = df.to_dicts()
721
+ if len(columns) > max_terminal_cols:
722
+ # For wide tables, extract only the displayed columns
723
+ display_data_columns = (
724
+ columns[:7] + columns[-7:]
725
+ ) # Skip the "...more..." placeholder
726
+ rows = [
727
+ [
728
+ _format_cell_value(
729
+ row.get(col, ""),
730
+ is_row_number=(col == "_row_num_"),
731
+ max_width=max_col_width,
732
+ num_columns=len(columns),
733
+ )
734
+ for col in display_data_columns
735
+ ]
736
+ for row in data_dict
737
+ ]
738
+ # Add the "..." column in the middle
739
+ for i, row in enumerate(rows):
740
+ rows[i] = row[:7] + ["···"] + row[7:]
741
+ else:
742
+ rows = [
743
+ [
744
+ _format_cell_value(
745
+ row.get(col, ""),
746
+ is_row_number=(col == "_row_num_"),
747
+ max_width=max_col_width,
748
+ num_columns=len(columns),
749
+ )
750
+ for col in columns
751
+ ]
752
+ for row in data_dict
753
+ ]
754
+ elif hasattr(df, "to_dict"):
755
+ # Pandas-like interface
756
+ data_dict = df.to_dict("records")
757
+ if len(columns) > max_terminal_cols:
758
+ # For wide tables, extract only the displayed columns
759
+ display_data_columns = columns[:7] + columns[-7:]
760
+ rows = [
761
+ [
762
+ _format_cell_value(
763
+ row.get(col, ""),
764
+ is_row_number=(col == "_row_num_"),
765
+ max_width=max_col_width,
766
+ num_columns=len(columns),
767
+ )
768
+ for col in display_data_columns
769
+ ]
770
+ for row in data_dict
771
+ ]
772
+ # Add the "..." column in the middle
773
+ for i, row in enumerate(rows):
774
+ rows[i] = row[:7] + ["···"] + row[7:]
775
+ else:
776
+ rows = [
777
+ [
778
+ _format_cell_value(
779
+ row.get(col, ""),
780
+ is_row_number=(col == "_row_num_"),
781
+ max_width=max_col_width,
782
+ num_columns=len(columns),
783
+ )
784
+ for col in columns
785
+ ]
786
+ for row in data_dict
787
+ ]
788
+ elif hasattr(df, "iter_rows"):
789
+ # Polars lazy frame
790
+ rows = [
791
+ [
792
+ _format_cell_value(
793
+ val,
794
+ is_row_number=(i == 0 and columns[0] == "_row_num_"),
795
+ max_width=max_col_width,
796
+ num_columns=len(columns),
797
+ )
798
+ for i, val in enumerate(row)
799
+ ]
800
+ for row in df.iter_rows()
801
+ ]
802
+ elif hasattr(df, "__iter__"):
803
+ # Try to iterate directly
804
+ rows = [
805
+ [
806
+ _format_cell_value(
807
+ val,
808
+ is_row_number=(i == 0 and columns[0] == "_row_num_"),
809
+ max_width=max_col_width,
810
+ num_columns=len(columns),
811
+ )
812
+ for i, val in enumerate(row)
813
+ ]
814
+ for row in df
815
+ ]
816
+ else:
817
+ rows = [["Could not extract data from this format"]] # pragma: no cover
818
+ except Exception as e:
819
+ rows = [[f"Error extracting data: {e}"]] # pragma: no cover
820
+
821
+ # Add rows to Rich table with separator between head and tail
822
+ max_rows = 50 # Reasonable limit for terminal display
823
+
824
+ # Get preview info to determine head/tail separation
825
+ head_rows_count = 0
826
+ tail_rows_count = 0
827
+ total_dataset_rows = 0
828
+
829
+ if preview_info:
830
+ head_rows_count = preview_info.get("head_rows", 0)
831
+ tail_rows_count = preview_info.get("tail_rows", 0)
832
+ total_dataset_rows = preview_info.get("total_rows", len(rows))
833
+ is_complete = preview_info.get("is_complete", False)
834
+ else:
835
+ # Fallback: assume all rows are shown
836
+ is_complete = True
837
+
838
+ # Add rows with optional separator
839
+ for i, row in enumerate(rows[:max_rows]):
840
+ try:
841
+ # Add separator between head and tail rows
842
+ if (
843
+ not is_complete
844
+ and head_rows_count > 0
845
+ and tail_rows_count > 0
846
+ and i == head_rows_count
847
+ ):
848
+ # Add a visual separator row with dashes
849
+ separator_row = [
850
+ "─" * 3 if col != "_row_num_" else "⋮"
851
+ for col in (
852
+ display_columns if "display_columns" in locals() else columns
853
+ )
854
+ ]
855
+ rich_table.add_row(*separator_row, style="dim")
856
+
857
+ rich_table.add_row(*row)
858
+ except Exception as e: # pragma: no cover
859
+ # If there's an issue with row data, show error
860
+ rich_table.add_row(*[f"Error: {e}" for _ in columns]) # pragma: no cover
861
+ break # pragma: no cover
862
+
863
+ # Show the table
864
+ console.print()
865
+ console.print(rich_table)
866
+
867
+ # Show summary info (conditionally)
868
+ if show_summary:
869
+ total_rows = len(rows)
870
+
871
+ # Use preview info if available, otherwise fall back to old logic
872
+ if preview_info:
873
+ total_dataset_rows = preview_info.get("total_rows", total_rows)
874
+ head_rows = preview_info.get("head_rows", 0)
875
+ tail_rows = preview_info.get("tail_rows", 0)
876
+ is_complete = preview_info.get("is_complete", False)
877
+
878
+ if is_complete:
879
+ console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
880
+ elif head_rows > 0 and tail_rows > 0:
881
+ console.print(
882
+ f"\n[dim]Showing first {head_rows} and last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
883
+ )
884
+ elif head_rows > 0:
885
+ console.print(
886
+ f"\n[dim]Showing first {head_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
887
+ )
888
+ elif tail_rows > 0:
889
+ console.print(
890
+ f"\n[dim]Showing last {tail_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
891
+ )
892
+ else:
893
+ # Fallback for other cases
894
+ console.print(
895
+ f"\n[dim]Showing {total_rows} rows from {total_dataset_rows:,} total rows.[/dim]"
896
+ )
897
+ else:
898
+ # Original logic as fallback
899
+ max_rows = 50 # This should match the limit used above
900
+ if total_rows > max_rows:
901
+ console.print(
902
+ f"\n[dim]Showing first {max_rows} of {total_rows} rows. Use --output-html to see all data.[/dim]"
903
+ )
904
+ else:
905
+ console.print(f"\n[dim]Showing all {total_rows} rows.[/dim]")
906
+
907
+ else:
908
+ # If we can't extract data, show the success message
909
+ console.print(
910
+ Panel(
911
+ "[green]✓[/green] Table rendered successfully. "
912
+ "Use --output-html to save the full interactive report.",
913
+ title="Table Preview",
914
+ border_style="green",
915
+ )
916
+ )
917
+
918
+ except Exception as e: # pragma: no cover
919
+ console.print(f"[red]Error rendering table:[/red] {e}")
920
+ console.print(
921
+ f"[dim]GT table type: {type(gt_table) if 'gt_table' in locals() else 'undefined'}[/dim]"
922
+ )
923
+
924
+ # Fallback: show the success message
925
+ console.print(
926
+ Panel(
927
+ "[green]✓[/green] Table rendered successfully. "
928
+ "Use --output-html to save the full interactive report.",
929
+ title="Table Preview",
930
+ border_style="green",
931
+ )
932
+ )
933
+
934
+
935
+ def _display_validation_summary(validation: Any) -> None:
936
+ """Display a validation summary in a Rich table format."""
937
+ try:
938
+ # Try to get the summary from the validation report
939
+ if hasattr(validation, "validation_info") and validation.validation_info is not None:
940
+ # Use the validation_info to create a summary
941
+ info = validation.validation_info
942
+ n_steps = len(info)
943
+ n_passed = sum(1 for step in info if step.all_passed)
944
+ n_failed = n_steps - n_passed
945
+
946
+ # Calculate severity counts
947
+ n_warning = sum(1 for step in info if step.warning)
948
+ n_error = sum(1 for step in info if step.error)
949
+ n_critical = sum(1 for step in info if step.critical)
950
+
951
+ all_passed = n_failed == 0
952
+
953
+ # Determine highest severity
954
+ if n_critical > 0:
955
+ highest_severity = "critical"
956
+ elif n_error > 0:
957
+ highest_severity = "error"
958
+ elif n_warning > 0:
959
+ highest_severity = "warning"
960
+ elif n_failed > 0:
961
+ highest_severity = "some failing"
962
+ else:
963
+ highest_severity = "all passed"
964
+
965
+ # Create a summary table
966
+ table = Table(title="Validation Summary", show_header=True, header_style="bold magenta")
967
+ table.add_column("Metric", style="cyan", no_wrap=True)
968
+ table.add_column("Value", style="green")
969
+
970
+ # Add summary statistics
971
+ table.add_row("Total Steps", str(n_steps))
972
+ table.add_row("Passing Steps", str(n_passed))
973
+ table.add_row("Failing Steps", str(n_failed))
974
+ table.add_row("Warning Steps", str(n_warning))
975
+ table.add_row("Error Steps", str(n_error))
976
+ table.add_row("Critical Steps", str(n_critical))
977
+ table.add_row("All Passed", str(all_passed))
978
+ table.add_row("Highest Severity", highest_severity)
979
+
980
+ console.print(table)
981
+
982
+ # Display step details
983
+ if n_steps > 0:
984
+ steps_table = Table(
985
+ title="Validation Steps", show_header=True, header_style="bold cyan"
986
+ )
987
+ steps_table.add_column("Step", style="dim")
988
+ steps_table.add_column("Type", style="white")
989
+ steps_table.add_column("Column", style="cyan")
990
+ steps_table.add_column("Status", style="white")
991
+ steps_table.add_column("Passed/Total", style="green")
992
+
993
+ for step in info:
994
+ status_icon = "✓" if step.all_passed else "✗"
995
+ status_color = "green" if step.all_passed else "red"
996
+
997
+ severity = ""
998
+ if step.critical:
999
+ severity = " [red](CRITICAL)[/red]"
1000
+ elif step.error:
1001
+ severity = " [red](ERROR)[/red]"
1002
+ elif step.warning:
1003
+ severity = " [yellow](WARNING)[/yellow]"
1004
+
1005
+ steps_table.add_row(
1006
+ str(step.i),
1007
+ step.assertion_type,
1008
+ str(step.column) if step.column else "—",
1009
+ f"[{status_color}]{status_icon}[/{status_color}]{severity}",
1010
+ f"{step.n_passed}/{step.n}",
1011
+ )
1012
+
1013
+ console.print(steps_table)
1014
+
1015
+ # Display status with appropriate color
1016
+ if highest_severity == "all passed":
1017
+ console.print(
1018
+ Panel("[green]✓ All validations passed![/green]", border_style="green")
1019
+ )
1020
+ elif highest_severity == "some failing":
1021
+ console.print(
1022
+ Panel("[yellow]⚠ Some validations failed[/yellow]", border_style="yellow")
1023
+ )
1024
+ elif highest_severity in ["warning", "error", "critical"]:
1025
+ color = "yellow" if highest_severity == "warning" else "red"
1026
+ console.print(
1027
+ Panel(
1028
+ f"[{color}]✗ Validation failed with {highest_severity} severity[/{color}]",
1029
+ border_style=color,
1030
+ )
1031
+ )
1032
+ else:
1033
+ console.print("[yellow]Validation object does not contain validation results.[/yellow]")
1034
+
1035
+ except Exception as e: # pragma: no cover
1036
+ console.print(f"[red]Error displaying validation summary:[/red] {e}")
1037
+ import traceback # pragma: no cover
1038
+
1039
+ console.print(f"[dim]{traceback.format_exc()}[/dim]") # pragma: no cover
1040
+
1041
+
1042
+ @click.group(cls=OrderedGroup)
1043
+ @click.version_option(version=pb.__version__, prog_name="pb")
1044
+ def cli():
1045
+ """
1046
+ Pointblank CLI - Data validation and quality tools for data engineers.
1047
+
1048
+ Use this CLI to run validation scripts, preview tables, and generate reports
1049
+ directly from the command line.
1050
+ """
1051
+ pass
1052
+
1053
+
1054
+ @cli.command()
1055
+ @click.argument("data_source", type=str)
1056
+ def info(data_source: str):
1057
+ """
1058
+ Display information about a data source.
1059
+
1060
+ Shows table type, dimensions, column names, and data types.
1061
+
1062
+ DATA_SOURCE can be:
1063
+
1064
+ \b
1065
+ - CSV file path (e.g., data.csv)
1066
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1067
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1068
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1069
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1070
+ """
1071
+ try:
1072
+ with console.status("[bold green]Loading data..."):
1073
+ # Load the data source using the centralized function
1074
+ data = _load_data_source(data_source)
1075
+
1076
+ # Get table information
1077
+ tbl_type = _get_tbl_type(data)
1078
+ row_count = pb.get_row_count(data)
1079
+ col_count = pb.get_column_count(data)
1080
+
1081
+ # Import the box style
1082
+ from rich.box import SIMPLE_HEAD
1083
+
1084
+ # Create info table
1085
+ info_table = Table(
1086
+ title="Data Source Information",
1087
+ show_header=True,
1088
+ header_style="bold magenta",
1089
+ box=SIMPLE_HEAD,
1090
+ title_style="bold cyan",
1091
+ title_justify="left",
1092
+ )
1093
+ info_table.add_column("Property", style="cyan", no_wrap=True)
1094
+ info_table.add_column("Value", style="green")
1095
+
1096
+ info_table.add_row("Source", data_source)
1097
+ info_table.add_row("Table Type", tbl_type)
1098
+ info_table.add_row("Rows", f"{row_count:,}")
1099
+ info_table.add_row("Columns", f"{col_count:,}")
1100
+
1101
+ console.print()
1102
+ console.print(info_table)
1103
+
1104
+ except Exception as e:
1105
+ console.print(f"[red]Error:[/red] {e}")
1106
+ sys.exit(1)
1107
+
1108
+
1109
+ @cli.command()
1110
+ @click.argument("data_source", type=str)
1111
+ @click.option("--columns", "-c", help="Comma-separated list of columns to display")
1112
+ @click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
1113
+ @click.option("--col-first", type=int, help="Show first N columns")
1114
+ @click.option("--col-last", type=int, help="Show last N columns")
1115
+ @click.option("--head", "-h", default=5, help="Number of rows from the top (default: 5)")
1116
+ @click.option("--tail", "-t", default=5, help="Number of rows from the bottom (default: 5)")
1117
+ @click.option("--limit", "-l", default=50, help="Maximum total rows to display (default: 50)")
1118
+ @click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
1119
+ @click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
1120
+ @click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
1121
+ @click.option("--no-header", is_flag=True, help="Hide table header")
1122
+ @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
1123
+ def preview(
1124
+ data_source: str,
1125
+ columns: str | None,
1126
+ col_range: str | None,
1127
+ col_first: int | None,
1128
+ col_last: int | None,
1129
+ head: int,
1130
+ tail: int,
1131
+ limit: int,
1132
+ no_row_numbers: bool,
1133
+ max_col_width: int,
1134
+ min_table_width: int,
1135
+ no_header: bool,
1136
+ output_html: str | None,
1137
+ ):
1138
+ """
1139
+ Preview a data table showing head and tail rows.
1140
+
1141
+ DATA_SOURCE can be:
1142
+
1143
+ \b
1144
+ - CSV file path (e.g., data.csv)
1145
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1146
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1147
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1148
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1149
+
1150
+ COLUMN SELECTION OPTIONS:
1151
+
1152
+ For tables with many columns, use these options to control which columns are displayed:
1153
+
1154
+ \b
1155
+ - --columns: Specify exact columns (e.g., --columns "name,age,email")
1156
+ - --col-range: Select column range (e.g., --col-range "1:10", --col-range "5:", --col-range ":15")
1157
+ - --col-first: Show first N columns (e.g., --col-first 5)
1158
+ - --col-last: Show last N columns (e.g., --col-last 3)
1159
+
1160
+ Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
1161
+ """
1162
+ try:
1163
+ with console.status("[bold green]Loading data..."):
1164
+ # Load the data source using the centralized function
1165
+ data = _load_data_source(data_source)
1166
+
1167
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1168
+
1169
+ # Parse columns if provided
1170
+ columns_list = None
1171
+ if columns:
1172
+ columns_list = [col.strip() for col in columns.split(",")]
1173
+
1174
+ # If data has _row_num_ and it's not explicitly included, add it at the beginning
1175
+ try:
1176
+ # Data is already processed, just use it directly
1177
+ processed_data = data
1178
+
1179
+ # Get column names from the processed data
1180
+ all_columns = []
1181
+ if hasattr(processed_data, "columns"):
1182
+ all_columns = list(processed_data.columns)
1183
+ elif hasattr(processed_data, "schema"):
1184
+ all_columns = list(processed_data.schema.names)
1185
+
1186
+ # If _row_num_ exists in data but not in user selection, add it at beginning
1187
+ if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
1188
+ columns_list = ["_row_num_"] + columns_list
1189
+ except Exception: # pragma: no cover
1190
+ # If we can't process the data, just use the user's column list as-is
1191
+ pass
1192
+ elif col_range or col_first or col_last:
1193
+ # Need to get column names to apply range/first/last selection
1194
+ # Data is already processed, just use it directly
1195
+ processed_data = data
1196
+
1197
+ # Get column names from the processed data
1198
+ all_columns = []
1199
+ if hasattr(processed_data, "columns"):
1200
+ all_columns = list(processed_data.columns)
1201
+ elif hasattr(processed_data, "schema"):
1202
+ all_columns = list(processed_data.schema.names)
1203
+ else:
1204
+ console.print(
1205
+ "[yellow]Warning: Could not determine column names for range selection[/yellow]"
1206
+ )
1207
+
1208
+ if all_columns:
1209
+ # Check if _row_num_ exists and preserve it
1210
+ has_row_num = "_row_num_" in all_columns
1211
+
1212
+ if col_range:
1213
+ # Parse range like "1:10", "5:", ":15"
1214
+ if ":" in col_range:
1215
+ parts = col_range.split(":")
1216
+ start_idx = int(parts[0]) - 1 if parts[0] else 0 # Convert to 0-based
1217
+ end_idx = int(parts[1]) if parts[1] else len(all_columns)
1218
+
1219
+ # Filter out _row_num_ from the range selection, we'll add it back later
1220
+ columns_for_range = [col for col in all_columns if col != "_row_num_"]
1221
+ selected_columns = columns_for_range[start_idx:end_idx]
1222
+
1223
+ # Always include _row_num_ at the beginning if it exists
1224
+ if has_row_num:
1225
+ columns_list = ["_row_num_"] + selected_columns
1226
+ else:
1227
+ columns_list = selected_columns
1228
+ else:
1229
+ console.print(
1230
+ "[yellow]Warning: Invalid range format. Use 'start:end' format[/yellow]"
1231
+ )
1232
+ elif col_first:
1233
+ # Filter out _row_num_ from the first N selection, we'll add it back later
1234
+ columns_for_first = [col for col in all_columns if col != "_row_num_"]
1235
+ selected_columns = columns_for_first[:col_first]
1236
+
1237
+ # Always include _row_num_ at the beginning if it exists
1238
+ if has_row_num:
1239
+ columns_list = ["_row_num_"] + selected_columns
1240
+ else:
1241
+ columns_list = selected_columns
1242
+ elif col_last:
1243
+ # Filter out _row_num_ from the last N selection, we'll add it back later
1244
+ columns_for_last = [col for col in all_columns if col != "_row_num_"]
1245
+ selected_columns = columns_for_last[-col_last:]
1246
+
1247
+ # Always include _row_num_ at the beginning if it exists
1248
+ if has_row_num:
1249
+ columns_list = ["_row_num_"] + selected_columns
1250
+ else:
1251
+ columns_list = selected_columns
1252
+
1253
+ # Generate preview
1254
+ with console.status("[bold green]Generating preview..."):
1255
+ # Get total dataset size before preview and gather metadata
1256
+ try:
1257
+ # Data is already processed, just use it directly
1258
+ processed_data = data
1259
+
1260
+ total_dataset_rows = pb.get_row_count(processed_data)
1261
+ total_dataset_columns = pb.get_column_count(processed_data)
1262
+
1263
+ # Determine source type and table type for enhanced preview title
1264
+ if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1265
+ source_type = f"Pointblank dataset: {data_source}"
1266
+ else:
1267
+ source_type = f"External source: {data_source}"
1268
+
1269
+ table_type = _get_tbl_type(processed_data)
1270
+ except Exception:
1271
+ # If we can't get metadata, set defaults
1272
+ total_dataset_rows = None
1273
+ total_dataset_columns = None
1274
+ source_type = f"Data source: {data_source}"
1275
+ table_type = "unknown"
1276
+
1277
+ gt_table = pb.preview(
1278
+ data=data,
1279
+ columns_subset=columns_list,
1280
+ n_head=head,
1281
+ n_tail=tail,
1282
+ limit=limit,
1283
+ show_row_numbers=not no_row_numbers,
1284
+ max_col_width=max_col_width,
1285
+ min_tbl_width=min_table_width,
1286
+ incl_header=not no_header,
1287
+ )
1288
+
1289
+ if output_html:
1290
+ # Save HTML to file
1291
+ html_content = gt_table.as_raw_html()
1292
+ Path(output_html).write_text(html_content, encoding="utf-8")
1293
+ console.print(f"[green]✓[/green] HTML saved to: {output_html}")
1294
+ else:
1295
+ # Display in terminal with preview context info
1296
+ preview_info = None
1297
+ if total_dataset_rows is not None:
1298
+ # Determine if we're showing the complete dataset
1299
+ expected_rows = min(head + tail, limit, total_dataset_rows)
1300
+ is_complete = total_dataset_rows <= expected_rows
1301
+
1302
+ preview_info = {
1303
+ "total_rows": total_dataset_rows,
1304
+ "total_columns": total_dataset_columns,
1305
+ "head_rows": head,
1306
+ "tail_rows": tail,
1307
+ "is_complete": is_complete,
1308
+ "source_type": source_type,
1309
+ "table_type": table_type,
1310
+ }
1311
+
1312
+ _rich_print_gt_table(gt_table, preview_info)
1313
+
1314
+ except Exception as e: # pragma: no cover
1315
+ console.print(f"[red]Error:[/red] {e}")
1316
+ sys.exit(1) # pragma: no cover
1317
+
1318
+
1319
+ @cli.command()
1320
+ @click.argument("data_source", type=str)
1321
+ @click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
1322
+ @click.option("--columns", "-c", help="Comma-separated list of columns to scan")
1323
+ def scan(
1324
+ data_source: str,
1325
+ output_html: str | None,
1326
+ columns: str | None,
1327
+ ):
1328
+ """
1329
+ Generate a data scan profile report.
1330
+
1331
+ Produces a comprehensive data profile including:
1332
+
1333
+ \b
1334
+ - Column types and distributions
1335
+ - Missing value patterns
1336
+ - Basic statistics
1337
+ - Data quality indicators
1338
+
1339
+ DATA_SOURCE can be:
1340
+
1341
+ \b
1342
+ - CSV file path (e.g., data.csv)
1343
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1344
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1345
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1346
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1347
+ """
1348
+ try:
1349
+ import time
1350
+
1351
+ start_time = time.time()
1352
+
1353
+ with console.status("[bold green]Loading data..."):
1354
+ # Load the data source using the centralized function
1355
+ data = _load_data_source(data_source)
1356
+
1357
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1358
+
1359
+ # Parse columns if provided
1360
+ columns_list = None
1361
+ if columns:
1362
+ columns_list = [col.strip() for col in columns.split(",")]
1363
+
1364
+ # Generate data scan
1365
+ with console.status("[bold green]Generating data scan..."):
1366
+ # Use col_summary_tbl for comprehensive column scanning
1367
+ # Data is already processed by _load_data_source
1368
+ scan_result = pb.col_summary_tbl(data=data)
1369
+
1370
+ if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1371
+ source_type = f"Pointblank dataset: {data_source}"
1372
+ else:
1373
+ source_type = f"External source: {data_source}"
1374
+
1375
+ table_type = _get_tbl_type(data)
1376
+ # Get row count and column count for header
1377
+ try:
1378
+ total_rows = pb.get_row_count(data)
1379
+ total_columns = pb.get_column_count(data)
1380
+ except Exception:
1381
+ total_rows = None
1382
+ total_columns = None
1383
+
1384
+ scan_time = time.time() - start_time
1385
+
1386
+ if output_html:
1387
+ # Save HTML to file
1388
+ try:
1389
+ html_content = scan_result.as_raw_html()
1390
+ Path(output_html).write_text(html_content, encoding="utf-8")
1391
+ console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")
1392
+ except Exception as e:
1393
+ console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
1394
+ else:
1395
+ # Display rich scan table in terminal
1396
+ console.print(f"[green]✓[/green] Data scan completed in {scan_time:.2f}s")
1397
+ console.print("Use --output-html to save the full interactive scan report.")
1398
+
1399
+ # Display detailed column summary using rich formatting
1400
+ try:
1401
+ _rich_print_scan_table(
1402
+ scan_result, data_source, source_type, table_type, total_rows, total_columns
1403
+ )
1404
+
1405
+ except Exception as e:
1406
+ console.print(f"[yellow]Could not display scan summary: {e}[/yellow]")
1407
+
1408
+ except Exception as e:
1409
+ console.print(f"[red]Error:[/red] {e}")
1410
+ sys.exit(1)
1411
+
1412
+
1413
+ @cli.command()
1414
+ @click.argument("data_source", type=str)
1415
+ @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
1416
+ def missing(data_source: str, output_html: str | None):
1417
+ """
1418
+ Generate a missing values report for a data table.
1419
+
1420
+ DATA_SOURCE can be:
1421
+
1422
+ \b
1423
+ - CSV file path (e.g., data.csv)
1424
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1425
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1426
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1427
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1428
+ """
1429
+ try:
1430
+ with console.status("[bold green]Loading data..."):
1431
+ # Load the data source using the centralized function
1432
+ data = _load_data_source(data_source)
1433
+
1434
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1435
+
1436
+ # Generate missing values table
1437
+ with console.status("[bold green]Analyzing missing values..."):
1438
+ gt_table = pb.missing_vals_tbl(data)
1439
+
1440
+ # Data is already processed, just use it directly
1441
+ original_data = data
1442
+
1443
+ if output_html:
1444
+ # Save HTML to file
1445
+ html_content = gt_table.as_raw_html()
1446
+ Path(output_html).write_text(html_content, encoding="utf-8")
1447
+ console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
1448
+ else:
1449
+ # Display in terminal with special missing values formatting
1450
+ _rich_print_missing_table(gt_table, original_data)
1451
+
1452
+ except Exception as e:
1453
+ console.print(f"[red]Error:[/red] {e}")
1454
+ sys.exit(1)
1455
+
1456
+
1457
+ @cli.command(name="validate")
1458
+ @click.argument("data_source", type=str)
1459
+ @click.option(
1460
+ "--check",
1461
+ "checks", # Changed to collect multiple values
1462
+ type=click.Choice(
1463
+ [
1464
+ "rows-distinct",
1465
+ "col-vals-not-null",
1466
+ "rows-complete",
1467
+ "col-exists",
1468
+ "col-vals-in-set",
1469
+ "col-vals-gt",
1470
+ "col-vals-ge",
1471
+ "col-vals-lt",
1472
+ "col-vals-le",
1473
+ ]
1474
+ ),
1475
+ multiple=True, # Allow multiple --check options
1476
+ help="Type of validation check to perform. Can be used multiple times for multiple checks.",
1477
+ )
1478
+ @click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
1479
+ @click.option(
1480
+ "--column",
1481
+ "columns", # Changed to collect multiple values
1482
+ multiple=True, # Allow multiple --column options
1483
+ help="Column name or integer position as #N (1-based index) for validation.",
1484
+ )
1485
+ @click.option(
1486
+ "--set",
1487
+ "sets", # Changed to collect multiple values
1488
+ multiple=True, # Allow multiple --set options
1489
+ help="Comma-separated allowed values for col-vals-in-set checks.",
1490
+ )
1491
+ @click.option(
1492
+ "--value",
1493
+ "values", # Changed to collect multiple values
1494
+ type=float,
1495
+ multiple=True, # Allow multiple --value options
1496
+ help="Numeric value for comparison checks.",
1497
+ )
1498
+ @click.option(
1499
+ "--show-extract", is_flag=True, help="Show extract of failing rows if validation fails"
1500
+ )
1501
+ @click.option(
1502
+ "--write-extract", type=str, help="Save failing rows to folder. Provide base name for folder."
1503
+ )
1504
+ @click.option(
1505
+ "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
1506
+ )
1507
+ @click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
1508
+ @click.pass_context
1509
+ def validate(
1510
+ ctx: click.Context,
1511
+ data_source: str,
1512
+ checks: tuple[str, ...], # Changed to tuple
1513
+ columns: tuple[str, ...], # Changed to tuple
1514
+ sets: tuple[str, ...], # Changed to tuple
1515
+ values: tuple[float, ...], # Changed to tuple
1516
+ show_extract: bool,
1517
+ write_extract: str | None,
1518
+ limit: int,
1519
+ exit_code: bool,
1520
+ list_checks: bool,
1521
+ ):
1522
+ """
1523
+ Perform single or multiple data validations.
1524
+
1525
+ Run one or more validation checks on your data in a single command.
1526
+ Use multiple --check options to perform multiple validations.
1527
+
1528
+ DATA_SOURCE can be:
1529
+
1530
+ \b
1531
+ - CSV file path (e.g., data.csv)
1532
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
1533
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1534
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1535
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1536
+
1537
+ AVAILABLE CHECKS:
1538
+
1539
+ Use --list-checks to see all available validation methods with examples.
1540
+
1541
+ The default check is 'rows-distinct' which checks for duplicate rows.
1542
+
1543
+ \b
1544
+ - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
1545
+ - rows-complete: Check if all rows are complete (no missing values in any column)
1546
+ - col-exists: Check if a specific column exists in the dataset (requires --column)
1547
+ - col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
1548
+ - col-vals-gt: Check if all values in a column are greater than a threshold (requires --column and --value)
1549
+ - col-vals-ge: Check if all values in a column are greater than or equal to a threshold (requires --column and --value)
1550
+ - col-vals-lt: Check if all values in a column are less than a threshold (requires --column and --value)
1551
+ - col-vals-le: Check if all values in a column are less than or equal to a threshold (requires --column and --value)
1552
+ - col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
1553
+
1554
+ Examples:
1555
+
1556
+ \b
1557
+ pb validate data.csv # Uses default validation (rows-distinct)
1558
+ pb validate data.csv --list-checks # Show all available checks
1559
+ pb validate data.csv --check rows-distinct
1560
+ pb validate data.csv --check rows-distinct --show-extract
1561
+ pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
1562
+ pb validate data.csv --check rows-distinct --exit-code
1563
+ pb validate data.csv --check rows-complete
1564
+ pb validate data.csv --check col-exists --column price
1565
+ pb validate data.csv --check col-vals-not-null --column email
1566
+ pb validate data.csv --check col-vals-gt --column score --value 50
1567
+ pb validate data.csv --check col-vals-in-set --column status --set "active,inactive,pending"
1568
+
1569
+ Multiple validations in one command:
1570
+
+ \b
+ pb validate data.csv --check rows-distinct --check rows-complete
1571
+ pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
1572
+ """
1573
+ try:
1577
+ # Convert parameter tuples to lists, handling default case
1578
+ if not checks:
1579
+ # No --check options provided, use default
1580
+ checks_list = ["rows-distinct"]
1581
+ is_using_default_check = True
1582
+ else:
1583
+ checks_list = list(checks)
1584
+ is_using_default_check = False
1585
+
1586
+ columns_list = list(columns) if columns else []
1587
+ sets_list = list(sets) if sets else []
1588
+ values_list = list(values) if values else []
1589
+
1590
+ # Map parameters to checks intelligently
1591
+ mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
1592
+ checks_list, columns_list, sets_list, values_list
1593
+ )
1594
+
1595
+ # Handle --list-checks option
1596
+ if list_checks:
1597
+ console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
1598
+ console.print()
1599
+ console.print("[bold magenta]Basic checks:[/bold magenta]")
1600
+ console.print(
1601
+ " • [bold cyan]rows-distinct[/bold cyan] Check for duplicate rows [yellow](default)[/yellow]"
1602
+ )
1603
+ console.print(
1604
+ " • [bold cyan]rows-complete[/bold cyan] Check for missing values in any column"
1605
+ )
1606
+ console.print()
1607
+ console.print(
1608
+ "[bold magenta]Column-specific checks [bright_black](require --column)[/bright_black]:[/bold magenta]"
1609
+ )
1610
+ console.print(" • [bold cyan]col-exists[/bold cyan] Check if a column exists")
1611
+ console.print(
1612
+ " • [bold cyan]col-vals-not-null[/bold cyan] Check for null values in a column"
1613
+ )
1614
+ console.print()
1615
+ console.print(
1616
+ "[bold magenta]Value comparison checks [bright_black](require --column and --value)[/bright_black]:[/bold magenta]"
1617
+ )
1618
+ console.print(
1619
+ " • [bold cyan]col-vals-gt[/bold cyan] Values greater than threshold"
1620
+ )
1621
+ console.print(
1622
+ " • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to threshold"
1623
+ )
1624
+ console.print(" • [bold cyan]col-vals-lt[/bold cyan] Values less than threshold")
1625
+ console.print(
1626
+ " • [bold cyan]col-vals-le[/bold cyan] Values less than or equal to threshold"
1627
+ )
1628
+ console.print()
1629
+ console.print(
1630
+ "[bold magenta]Set validation check [bright_black](requires --column and --set)[/bright_black]:[/bold magenta]"
1631
+ )
1632
+ console.print(
1633
+ " • [bold cyan]col-vals-in-set[/bold cyan] Values must be in allowed set"
1634
+ )
1635
+ console.print()
1636
+ console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
1637
+ console.print(
1638
+ f" [bright_blue]pb validate {data_source} --check rows-distinct[/bright_blue]"
1639
+ )
1640
+ console.print(
1641
+ f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
1642
+ )
1643
+ console.print(
1644
+ f" [bright_blue]pb validate {data_source} --check col-vals-gt --column age --value 18[/bright_blue]"
1645
+ )
1648
+ sys.exit(0)
1649
+
1650
+ # Validate required parameters for different check types
1651
+ # Check parameters for each check in the list using mapped parameters
1652
+ for i, check in enumerate(checks_list):
1653
+ # Get corresponding mapped parameters for this check
1654
+ column = mapped_columns[i] if i < len(mapped_columns) else None
1655
+ set_val = mapped_sets[i] if i < len(mapped_sets) else None
1656
+ value = mapped_values[i] if i < len(mapped_values) else None
1657
+
1658
+ if check == "col-vals-not-null" and not column:
1659
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
1660
+ console.print(
1661
+ "Example: pb validate data.csv --check col-vals-not-null --column email"
1662
+ )
1663
+ sys.exit(1)
1664
+
1665
+ if check == "col-exists" and not column:
1666
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
1667
+ console.print("Example: pb validate data.csv --check col-exists --column price")
1668
+ sys.exit(1)
1669
+
1670
+ if check == "col-vals-in-set" and not column:
1671
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
1672
+ console.print(
1673
+ "Example: pb validate data.csv --check col-vals-in-set --column status --set 'active,inactive'"
1674
+ )
1675
+ sys.exit(1)
1676
+
1677
+ if check == "col-vals-in-set" and not set_val:
1678
+ console.print(f"[red]Error:[/red] --set is required for {check} check")
1679
+ console.print(
1680
+ "Example: pb validate data.csv --check col-vals-in-set --column status --set 'active,inactive'"
1681
+ )
1682
+ sys.exit(1)
1683
+
1684
+ if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"] and not column:
1685
+ console.print(f"[red]Error:[/red] --column is required for {check} check")
1686
+ console.print(
1687
+ f"Example: pb validate data.csv --check {check} --column score --value 50"
1688
+ )
1689
+ sys.exit(1)
1690
+
1691
+ if (
1692
+ check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]
1693
+ and value is None
1694
+ ):
1695
+ console.print(f"[red]Error:[/red] --value is required for {check} check")
1696
+ console.print(
1697
+ f"Example: pb validate data.csv --check {check} --column score --value 50"
1698
+ )
1699
+ sys.exit(1)
1700
+
1701
+ with console.status("[bold green]Loading data..."):
1702
+ # Load the data source using the centralized function
1703
+ data = _load_data_source(data_source)
1704
+
1705
+ # Get all column names for error reporting
1706
+ if hasattr(data, "columns"):
1707
+ all_columns = list(data.columns)
1708
+ elif hasattr(data, "schema"):
1709
+ all_columns = list(data.schema.names)
1710
+ else:
1711
+ all_columns = []
1712
+
1713
+ # Resolve any '#N' column references to actual column names
1714
+ columns_list = _resolve_column_indices(columns_list, data)
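+ # e.g. '--column #2' targets the second column of the table (1-based)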
1715
+
1716
+ # Check for out-of-range #N columns and provide a helpful error
1717
+ for col in columns_list:
1718
+ if isinstance(col, str) and col.startswith("#"):
1719
+ try:
1720
+ idx = int(col[1:])
1721
+ if idx < 1 or idx > len(all_columns):
1722
+ console.print(
1723
+ f"[red]Error:[/red] There is no column {idx} (the column position "
1724
+ f"range is 1 to {len(all_columns)})"
1725
+ )
1726
+ sys.exit(1)
1727
+ except Exception:
1728
+ pass # Let later validation handle other errors
1729
+
1730
+ # Update mapped_columns to use resolved column names
1731
+ mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
1732
+ checks_list, columns_list, sets_list, values_list
1733
+ )
1734
+
1735
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1736
+
1737
+ # Build a single validation object with chained checks
1738
+ with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
1739
+ # Initialize validation object
1740
+ validation = pb.Validate(
1741
+ data=data,
1742
+ tbl_name=f"Data from {data_source}",
1743
+ label=f"CLI Validation: {', '.join(checks_list)}",
1744
+ )
1745
+
1746
+ # Add each check to the validation chain
1747
+ for i, check in enumerate(checks_list):
1748
+ # Get corresponding mapped parameters for this check
1749
+ column = mapped_columns[i] if i < len(mapped_columns) else None
1750
+ set_val = mapped_sets[i] if i < len(mapped_sets) else None
1751
+ value = mapped_values[i] if i < len(mapped_values) else None
1752
+
1753
+ if check == "rows-distinct":
1754
+ validation = validation.rows_distinct()
1755
+ elif check == "col-vals-not-null":
1756
+ validation = validation.col_vals_not_null(columns=column)
1757
+ elif check == "rows-complete":
1758
+ validation = validation.rows_complete()
1759
+ elif check == "col-exists":
1760
+ validation = validation.col_exists(columns=column)
1761
+ elif check == "col-vals-in-set":
1762
+ # Parse the comma-separated set values
1763
+ allowed_values = [v.strip() for v in set_val.split(",")]
1764
+ validation = validation.col_vals_in_set(columns=column, set=allowed_values)
1765
+ elif check == "col-vals-gt":
1766
+ validation = validation.col_vals_gt(columns=column, value=value)
1767
+ elif check == "col-vals-ge":
1768
+ validation = validation.col_vals_ge(columns=column, value=value)
1769
+ elif check == "col-vals-lt":
1770
+ validation = validation.col_vals_lt(columns=column, value=value)
1771
+ elif check == "col-vals-le":
1772
+ validation = validation.col_vals_le(columns=column, value=value)
1773
+ else:
1774
+ console.print(f"[red]Error:[/red] Unknown check type: {check}")
1775
+ sys.exit(1)
1776
+
1777
+ # Execute all validations
1778
+ validation = validation.interrogate()
1779
+ all_passed = validation.all_passed()
1780
+
1781
+ # Display completion message
1782
+ if len(checks_list) == 1:
1783
+ if is_using_default_check:
1784
+ console.print(
1785
+ f"[green]✓[/green] {checks_list[0]} validation completed [dim](default validation)[/dim]"
1786
+ )
1787
+ else:
1788
+ console.print(f"[green]✓[/green] {checks_list[0]} validation completed")
1789
+ else:
1790
+ console.print(f"[green]✓[/green] {len(checks_list)} validations completed")
1791
+
1792
+ # Display results based on whether we have single or multiple checks
1793
+ if len(checks_list) == 1:
1794
+ # Single check - use current display format
1795
+ _display_validation_result(
1796
+ validation,
1797
+ checks_list,
1798
+ mapped_columns,
1799
+ mapped_sets,
1800
+ mapped_values,
1801
+ data_source,
1802
+ 0,
1803
+ 1,
1804
+ show_extract,
1805
+ write_extract,
1806
+ limit,
1807
+ )
1808
+ else:
1809
+ # Multiple checks - use stacked display format
1810
+ any_failed = False
1811
+ for i in range(len(checks_list)):
1812
+ console.print() # Add spacing between results
1813
+ _display_validation_result(
1814
+ validation,
1815
+ checks_list,
1816
+ mapped_columns,
1817
+ mapped_sets,
1818
+ mapped_values,
1819
+ data_source,
1820
+ i,
1821
+ len(checks_list),
1822
+ show_extract,
1823
+ write_extract,
1824
+ limit,
1825
+ )
1826
+
1827
+ # Check if this validation failed
1828
+ if hasattr(validation, "validation_info") and len(validation.validation_info) > i:
1829
+ step_info = validation.validation_info[i]
1830
+ if step_info.n_failed > 0:
1831
+ any_failed = True
1832
+
1833
+ # Show tip about --show-extract if any failed and not already used
1834
+ if any_failed and not show_extract:
1835
+ console.print()
1836
+ console.print(
1837
+ "[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
1838
+ )
1839
+
1840
+ # Add informational hints when using default validation (only for single check)
1841
+ if len(checks_list) == 1 and is_using_default_check:
1842
+ console.print()
1843
+ console.print("[bold blue]ℹ️ Information:[/bold blue] Using default validation method")
1844
+ console.print("To specify a different validation, use the --check option.")
1845
+ console.print()
1846
+ console.print("[bold magenta]Common validation options:[/bold magenta]")
1847
+ console.print(
1848
+ " • [bold cyan]--check rows-complete[/bold cyan] Check for rows with missing values"
1849
+ )
1850
+ console.print(
1851
+ " • [bold cyan]--check col-vals-not-null[/bold cyan] Check for null values in a column [bright_black](requires --column)[/bright_black]"
1852
+ )
1853
+ console.print(
1854
+ " • [bold cyan]--check col-exists[/bold cyan] Check if a column exists [bright_black](requires --column)[/bright_black]"
1855
+ )
1856
+ console.print()
1857
+ console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
1858
+ console.print(
1859
+ f" [bright_blue]pb validate {data_source} --check rows-complete[/bright_blue]"
1860
+ )
1861
+ console.print(
1862
+ f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
1863
+ )
1864
+
1865
+ # Exit with appropriate code if requested
1866
+ if exit_code and not all_passed:
1867
+ console.print("[dim]Exiting with non-zero code due to validation failure[/dim]")
1870
+ sys.exit(1)
1871
+
1872
+ except Exception as e:
1873
+ console.print(f"[red]Error:[/red] {e}")
1874
+ sys.exit(1)
1875
+
1876
+
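For reference, the chained validation built by this command is the same one available from the Python API; a minimal sketch, assuming the built-in small_table dataset and its numeric column d:

    import pointblank as pb

    validation = (
        pb.Validate(data=pb.load_dataset("small_table"), tbl_name="small_table")
        .rows_distinct()
        .col_vals_gt(columns="d", value=100)
        .interrogate()
    )
    print(validation.all_passed())  # True only when every chained step passed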
1877
+ @cli.command()
1878
+ def datasets():
1879
+ """
1880
+ List available built-in datasets.
1881
+ """
1882
+ from rich.box import SIMPLE_HEAD
1883
+
1884
+ datasets_info = [
1885
+ ("small_table", "13 rows × 8 columns", "Small demo dataset for testing"),
1886
+ ("game_revenue", "2,000 rows × 11 columns", "Game development company revenue data"),
1887
+ ("nycflights", "336,776 rows × 18 columns", "NYC airport flights data from 2013"),
1888
+ ("global_sales", "50,000 rows × 20 columns", "Global sales data across regions"),
1889
+ ]
1890
+
1895
+ # Create the datasets table
1896
+ table = Table(
1897
+ title="Available Pointblank Datasets",
1898
+ show_header=True,
1899
+ header_style="bold magenta",
1900
+ box=SIMPLE_HEAD,
1901
+ title_style="bold cyan",
1902
+ title_justify="left",
1903
+ )
1904
+
1905
+ table.add_column("Dataset Name", style="cyan", no_wrap=True)
1906
+ table.add_column("Dimensions", style="green")
1907
+ table.add_column("Description", style="white")
1908
+
1909
+ for name, dims, desc in datasets_info:
1910
+ table.add_row(name, dims, desc)
1911
+
1912
+ console.print(table)
1913
+ console.print("\n[dim]Use these dataset names directly with any pb CLI command.[/dim]")
1914
+ console.print("[dim]Example: pb preview small_table[/dim]")
1915
+
1916
+
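These names resolve through pb.load_dataset(), so the same tables are available programmatically; for example:

    import pointblank as pb

    df = pb.load_dataset("game_revenue")  # the same table the CLI commands receive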
1917
+ @cli.command()
1918
+ def requirements():
1919
+ """
1920
+ Check installed dependencies and their availability.
1921
+ """
1922
+ from rich.box import SIMPLE_HEAD
1923
+
1924
+ dependencies = [
1925
+ ("polars", "Polars DataFrame support"),
1926
+ ("pandas", "Pandas DataFrame support"),
1927
+ ("ibis", "Ibis backend support (DuckDB, etc.)"),
1928
+ ("duckdb", "DuckDB database support"),
1929
+ ("pyarrow", "Parquet file support"),
1930
+ ]
1931
+
1932
+ # Create requirements table
1933
+ table = Table(
1934
+ title="Dependency Status",
1935
+ show_header=True,
1936
+ header_style="bold magenta",
1937
+ box=SIMPLE_HEAD,
1938
+ title_style="bold cyan",
1939
+ title_justify="left",
1940
+ )
1941
+
1942
+ table.add_column("Package", style="cyan", no_wrap=True)
1943
+ table.add_column("Status", style="white")
1944
+ table.add_column("Description", style="dim")
1945
+
1946
+ for package, description in dependencies:
1947
+ if _is_lib_present(package):
1948
+ status = "[green]✓ Installed[/green]"
1949
+ else:
1950
+ status = "[red]✗ Not installed[/red]"
1951
+
1952
+ table.add_row(package, status, description)
1953
+
1954
+ console.print(table)
1955
+ console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
1956
+
1957
+
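_is_lib_present comes from pointblank._utils; a functionally similar availability check (a sketch, not the actual implementation) is:

    import importlib.util

    def lib_present(name: str) -> bool:
        # A package counts as installed when a module spec can be found
        return importlib.util.find_spec(name) is not None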
1958
+ def _rich_print_scan_table(
1959
+ scan_result: Any,
1960
+ data_source: str,
1961
+ source_type: str,
1962
+ table_type: str,
1963
+ total_rows: int | None = None,
1964
+ total_columns: int | None = None,
1965
+ ) -> None:
1966
+ """
1967
+ Display scan results as a Rich table in the terminal with statistical measures.
1968
+
1969
+ Args:
1970
+ scan_result: The GT object from col_summary_tbl()
1971
+ data_source: Name of the data source being scanned
1972
+ source_type: Type of data source (e.g., "Pointblank dataset: small_table")
1973
+ table_type: Type of table (e.g., "polars.LazyFrame")
1974
+ total_rows: Total number of rows in the dataset
1975
+ total_columns: Total number of columns in the dataset
1976
+ """
1977
+ try:
1978
+ import re
1979
+
1980
+ import narwhals as nw
1981
+ from rich.box import SIMPLE_HEAD
1982
+
1983
+ # Extract the underlying DataFrame from the GT object
1984
+ # The GT object has a _tbl_data attribute that contains the DataFrame
1985
+ gt_data = scan_result._tbl_data
1986
+
1987
+ # Convert to Narwhals DataFrame for consistent handling
1988
+ nw_data = nw.from_native(gt_data)
1989
+
1990
+ # Convert to dictionary for easier access
1991
+ data_dict = nw_data.to_dict(as_series=False)
1992
+
1993
+ # Create main scan table with missing data table styling
1994
+ # Create a comprehensive title with data source, source type, and table type
1995
+ title_text = f"Column Summary / {source_type} / {table_type}"
1996
+
1997
+ # Add dimensions subtitle in gray if available
1998
+ if total_rows is not None and total_columns is not None:
1999
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2000
+
2001
+ # Create the scan table
2002
+ scan_table = Table(
2003
+ title=title_text,
2004
+ show_header=True,
2005
+ header_style="bold magenta",
2006
+ box=SIMPLE_HEAD,
2007
+ title_style="bold cyan",
2008
+ title_justify="left",
2009
+ )
2010
+
2011
+ # Add columns with specific styling and appropriate widths
2012
+ scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2013
+ scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2014
+ scan_table.add_column(
2015
+ "NA", style="red", width=6, justify="right"
2016
+ ) # count of missing values
2017
+ scan_table.add_column(
2018
+ "UQ", style="green", width=8, justify="right"
2019
+ ) # count of unique values (wide enough for boolean summaries)
2020
+
2021
+ # Add statistical columns if they exist with appropriate widths
2022
+ stat_columns = []
2023
+ column_mapping = {
2024
+ "mean": ("Mean", "blue", 9),
2025
+ "std": ("SD", "blue", 9),
2026
+ "min": ("Min", "yellow", 9),
2027
+ "median": ("Med", "yellow", 9),
2028
+ "max": ("Max", "yellow", 9),
2029
+ "q_1": ("Q₁", "magenta", 8),
2030
+ "q_3": ("Q₃", "magenta", 9),
2031
+ "iqr": ("IQR", "magenta", 8),
2032
+ }
2033
+
2034
+ for col_key, (display_name, color, width) in column_mapping.items():
2035
+ if col_key in data_dict:
2036
+ scan_table.add_column(display_name, style=color, width=width, justify="right")
2037
+ stat_columns.append(col_key)
2038
+
2039
+ # Helper function to extract column name and type from HTML
2040
+ def extract_column_info(html_content: str) -> tuple[str, str]:
2041
+ """Extract column name and type from HTML formatted content."""
2042
+ # Extract column name from first div
2043
+ name_match = re.search(r"<div[^>]*>([^<]+)</div>", html_content)
2044
+ column_name = name_match.group(1) if name_match else "Unknown"
2045
+
2046
+ # Extract data type from second div (with gray color)
2047
+ type_match = re.search(r"<div[^>]*color: gray[^>]*>([^<]+)</div>", html_content)
2048
+ if type_match:
2049
+ data_type = type_match.group(1)
2050
+ # Convert to compact format using the existing function
2051
+ compact_type = _format_dtype_compact(data_type)
2052
+ data_type = compact_type
2053
+ else:
2054
+ data_type = "unknown"
2055
+
2056
+ return column_name, data_type
2057
+
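+ # Illustrative call (assuming _format_dtype_compact maps "Float64" to "f64"):
+ # extract_column_info('<div>price</div><div style="color: gray">Float64</div>')
+ # -> ("price", "f64")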
2058
+ # Helper function to format values with improved number formatting
2059
+ def format_value(
2060
+ value: Any, is_missing: bool = False, is_unique: bool = False, max_width: int = 8
2061
+ ) -> str:
2062
+ """Format values for display with smart number formatting and HTML cleanup."""
2063
+ if value is None or (isinstance(value, str) and value.strip() == ""):
2064
+ return "[dim]—[/dim]"
2065
+
2066
+ # Handle missing values indicator
2067
+ if is_missing and str(value) == "0":
2068
+ return "[green]●[/green]" # No missing values
2069
+
2070
+ # Clean up HTML formatting from the raw data
2071
+ str_val = str(value)
2072
+
2073
+ # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
2074
+ if "<br>" in str_val:
2075
+ str_val = str_val.split("<br>")[0].strip()
2076
+ # For unique values, we want just the integer part
2077
+ if is_unique:
2078
+ try:
2079
+ # Try to extract just the integer part for unique counts
2080
+ num_val = float(str_val)
2081
+ return str(int(num_val))
2082
+ except (ValueError, TypeError):
2083
+ pass
2084
+
2085
+ # Now handle HTML content (especially from boolean unique values)
2086
+ if "<" in str_val and ">" in str_val:
2087
+ # Remove HTML tags completely for cleaner display
2088
+ str_val = re.sub(r"<[^>]+>", "", str_val).strip()
2089
+ # Clean up extra whitespace
2090
+ str_val = re.sub(r"\s+", " ", str_val).strip()
2091
+
2092
+ # Handle values like "2<.01" - extract the first number
2093
+ if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
2094
+ # Extract number before the < symbol
2095
+ before_lt = str_val.split("<")[0].strip()
2096
+ if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
2097
+ str_val = before_lt
2098
+
2099
+ # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
2100
+ if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
2101
+ # Extract T and F values
2102
+ t_match = re.search(r"T(\d+\.\d+)", str_val)
2103
+ f_match = re.search(r"F(\d+\.\d+)", str_val)
2104
+ if t_match and f_match:
2105
+ t_val = float(t_match.group(1))
2106
+ f_val = float(f_match.group(1))
2107
+ # Show as "T0.62F0.38" but truncated if needed
2108
+ formatted = f"T{t_val:.2f}F{f_val:.2f}"
2109
+ if len(formatted) > max_width:
2110
+ # Truncate to fit, showing dominant value
2111
+ if t_val > f_val:
2112
+ return f"T{t_val:.1f}"
2113
+ else:
2114
+ return f"F{f_val:.1f}"
2115
+ return formatted
2116
+
2117
+ # Try to parse as a number for better formatting
2118
+ try:
2119
+ # Try to convert to float first
2120
+ num_val = float(str_val)
2121
+
2122
+ # Handle special cases
2123
+ if num_val == 0:
2124
+ return "0"
2125
+ elif abs(num_val) == int(abs(num_val)) and abs(num_val) < 10000:
2126
+ # Simple integers under 10000
2127
+ return str(int(num_val))
2128
+ elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
2129
+ # Likely dates in YYYYMMDD format - format as date-like
2130
+ int_val = int(num_val)
2131
+ if 19000101 <= int_val <= 29991231: # Reasonable date range
2132
+ str_date = str(int_val)
2133
+ if len(str_date) == 8:
2134
+ return (
2135
+ f"{str_date[:4]}-{str_date[4:6]}-{str_date[6:]}"[: max_width - 1]
2136
+ + "…"
2137
+ )
2138
+ # Otherwise treat as large number
2139
+ return f"{num_val / 1000000:.1f}M"
2140
+ elif abs(num_val) >= 1000000:
2141
+ # Large numbers - use scientific notation or M/k notation
2142
+
2143
+ if abs(num_val) >= 1000000000:
2144
+ return f"{num_val:.1e}"
2145
+ else:
2146
+ return f"{num_val / 1000000:.1f}M"
2147
+ elif abs(num_val) >= 10000:
2148
+ # Numbers >= 10k - use compact notation
2149
+ return f"{num_val / 1000:.1f}k"
2150
+ elif abs(num_val) >= 10:
2151
+ # Numbers 10-9999 - show with one decimal
2152
+ return f"{num_val:.1f}"
2153
+ elif abs(num_val) >= 0.01:
2154
+ # Numbers 0.01-9.99 - show with two decimals
2155
+ return f"{num_val:.2f}"
2162
+ else:
2163
+ # Very small numbers - use scientific notation
2164
+
2165
+ return f"{num_val:.1e}"
2166
+
2167
+ except (ValueError, TypeError):
2168
+ # Not a number, handle as string
2169
+ pass
2170
+
2171
+ # Handle date/datetime strings - show abbreviated format
2172
+ if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
2173
+ # Likely a date/datetime, show abbreviated
2174
+ if len(str_val) > max_width:
2175
+ return str_val[: max_width - 1] + "…"
2176
+
2177
+ # General string truncation with ellipsis
2178
+ if len(str_val) > max_width:
2179
+ return str_val[: max_width - 1] + "…"
2180
+
2181
+ return str_val
2182
+
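+ # Illustrative results of the rules above (default max_width=8):
+ # format_value(12345.0) -> "12.3k"; format_value(20230115.0) -> "2023-01…"
+ # format_value(0.0001) -> "1.0e-04"; format_value(None) -> "[dim]—[/dim]"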
2183
+ # Populate table rows
2184
+ num_rows = len(data_dict["colname"])
2185
+ for i in range(num_rows):
2186
+ row_data = []
2187
+
2188
+ # Column name and type from HTML content
2189
+ colname_html = data_dict["colname"][i]
2190
+ column_name, data_type = extract_column_info(colname_html)
2191
+ row_data.append(column_name)
2192
+ row_data.append(data_type)
2193
+
2194
+ # Missing values (NA)
2195
+ missing_val = data_dict.get("n_missing", [None] * num_rows)[i]
2196
+ row_data.append(format_value(missing_val, is_missing=True, max_width=6))
2197
+
2198
+ # Unique values (UQ)
2199
+ unique_val = data_dict.get("n_unique", [None] * num_rows)[i]
2200
+ row_data.append(format_value(unique_val, is_unique=True, max_width=8))
2201
+
2202
+ # Statistical columns
2203
+ for stat_col in stat_columns:
2204
+ stat_val = data_dict.get(stat_col, [None] * num_rows)[i]
2205
+ # Use appropriate width based on column type
2206
+ if stat_col in ["q_1", "iqr"]:
2207
+ width = 8
2208
+ elif stat_col in ["mean", "std", "min", "median", "max", "q_3"]:
2209
+ width = 9
2210
+ else:
2211
+ width = 8
2212
+ row_data.append(format_value(stat_val, max_width=width))
2213
+
2214
+ scan_table.add_row(*row_data)
2215
+
2216
+ # Display the results
2217
+ console.print()
2218
+ console.print(scan_table)
2219
+
2220
+ except Exception as e:
2221
+ # Fallback to simple message if table creation fails
2222
+ console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
2223
+ console.print(f"[red]Error displaying table: {str(e)}[/red]")
2224
+
2225
+
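A sketch of producing the GT object this renderer expects (per the docstring above, it comes from col_summary_tbl()); the dataset and table-type strings are illustrative:

    import pointblank as pb

    data = pb.load_dataset("small_table")
    scan = pb.col_summary_tbl(data)  # GT object carrying per-column statistics
    _rich_print_scan_table(scan, "small_table", "Pointblank dataset: small_table", "polars.DataFrame")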
2226
+ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
2227
+ """Convert a missing values GT table to Rich table with special formatting.
2228
+
2229
+ Args:
2230
+ gt_table: The GT table object for missing values
2231
+ original_data: The original data source to extract column types
2232
+ """
2233
+ try:
2234
+ # Extract the underlying data from the GT table
2235
+ df = None
2236
+
2237
+ if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
2238
+ df = gt_table._tbl_data
2239
+ elif hasattr(gt_table, "_data") and gt_table._data is not None:
2240
+ df = gt_table._data
2241
+ elif hasattr(gt_table, "data") and gt_table.data is not None:
2242
+ df = gt_table.data
2243
+
2244
+ if df is not None:
2245
+ from rich.box import SIMPLE_HEAD
2246
+
2247
+ # Create the missing values table
2248
+ rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
2249
+
2250
+ # Get column names
2251
+ columns = []
2252
+ try:
2253
+ if hasattr(df, "columns"):
2254
+ columns = list(df.columns)
2255
+ elif hasattr(df, "schema"):
2256
+ columns = list(df.schema.names)
2257
+ except Exception as e:
2258
+ console.print(f"[red]Error getting columns:[/red] {e}")
2259
+ columns = []
2260
+
2261
+ if not columns:
2262
+ columns = [f"Column {i + 1}" for i in range(10)] # Fallback
2263
+
2264
+ # Get original data to extract column types
2265
+ column_types = {}
2266
+ if original_data is not None:
2267
+ try:
2268
+ # Get column types from original data
2269
+ if hasattr(original_data, "columns"):
2270
+ original_columns = list(original_data.columns)
2271
+ column_types = _get_column_dtypes(original_data, original_columns)
2272
+ except Exception as e:
2273
+ console.print(f"[red]Error getting column types:[/red] {e}")
2274
+ # fall through with the empty column_types dict as the fallback
2275
+
2276
+ # Add columns to Rich table with special formatting for missing values table
2277
+ sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
2278
+
2279
+ # Two separate columns: Column name (20 chars) and Data type (10 chars)
2280
+ rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2281
+ rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2282
+
2283
+ # Sector columns: All same width, optimized for "100%" (4 chars + padding)
2284
+ for sector in sector_columns:
2285
+ rich_table.add_column(
2286
+ sector,
2287
+ style="cyan",
2288
+ justify="center",
2289
+ no_wrap=True,
2290
+ width=5, # Fixed width optimized for percentage values
2291
+ )
2292
+
2293
+ # Convert data to rows with special formatting
2294
+ rows = []
2295
+ try:
2296
+ if hasattr(df, "to_dicts"):
2297
+ data_dict = df.to_dicts()
2298
+ elif hasattr(df, "to_dict"):
2299
+ data_dict = df.to_dict("records")
2300
+ else:
2301
+ data_dict = []
2302
+
2303
+ for i, row in enumerate(data_dict):
2304
+ try:
2305
+ # Each row should have: [column_name, data_type, sector1, sector2, ...]
2306
+ column_name = str(row.get("columns", ""))
2307
+
2308
+ # Truncate column name to 20 characters with ellipsis if needed
2309
+ if len(column_name) > 20:
2310
+ truncated_name = column_name[:17] + "…"
2311
+ else:
2312
+ truncated_name = column_name
2313
+
2314
+ # Get data type for this column
2315
+ if column_name in column_types:
2316
+ dtype = column_types[column_name]
2317
+ if len(dtype) > 10:
2318
+ truncated_dtype = dtype[:9] + "…"
2319
+ else:
2320
+ truncated_dtype = dtype
2321
+ else:
2322
+ truncated_dtype = "?"
2323
+
2324
+ # Start building the row with column name and type
2325
+ formatted_row = [truncated_name, truncated_dtype]
2326
+
2327
+ # Add sector values (formatted percentages)
2328
+ for sector in sector_columns:
2329
+ value = row.get(sector, 0.0)
2330
+ if isinstance(value, (int, float)):
2331
+ formatted_row.append(_format_missing_percentage(float(value)))
2332
+ else:
2333
+ formatted_row.append(str(value))
2334
+
2335
+ rows.append(formatted_row)
2336
+
2337
+ except Exception as e:
2338
+ console.print(f"[red]Error processing row {i}:[/red] {e}")
2339
+ continue
2340
+
2341
+ except Exception as e:
2342
+ console.print(f"[red]Error extracting data:[/red] {e}")
2343
+ rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
2344
+
2345
+ # Add rows to Rich table
2346
+ for row in rows:
2347
+ try:
2348
+ rich_table.add_row(*row)
2349
+ except Exception as e:
2350
+ console.print(f"[red]Error adding row:[/red] {e}")
2351
+ break
2352
+
2353
+ # Show the table with custom spanner header if we have sector columns
2354
+ if sector_columns:
2355
+ # Create a custom header line that shows the spanner
2356
+ header_parts = []
2357
+ header_parts.append(" " * 20) # Space for Column header
2358
+ header_parts.append(" " * 10) # Space for Type header
2359
+
2360
+ # Left-align "Row Sectors" with the first numbered column
2361
+ row_sectors_text = "Row Sectors"
2362
+ header_parts.append(row_sectors_text)
2363
+
2364
+ # Print the custom spanner header
2365
+ console.print("[dim]" + " ".join(header_parts) + "[/dim]")
2366
+
2367
+ # Add a horizontal rule below the spanner
2368
+ rule_parts = []
2369
+ rule_parts.append(" " * 20) # Space for Column header
2370
+ rule_parts.append(" " * 10) # Space for Type header
2371
+
2372
+ # Use a fixed width horizontal rule for "Row Sectors"
2373
+ horizontal_rule = "─" * 20
2374
+ rule_parts.append(horizontal_rule)
2375
+
2376
+ # Print the horizontal rule
2377
+ console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
2378
+
2379
+ # Print the Rich table (will handle terminal width automatically)
2380
+ console.print(rich_table)
2381
+ footer_text = (
2382
+ "[dim]Symbols: [green]●[/green] = no missing values, "
2383
+ "[red]●[/red] = completely missing, "
2384
+ "<1% = less than 1% missing, "
2385
+ ">99% = more than 99% missing[/dim]"
2386
+ )
2387
+ console.print(footer_text)
2388
+
2389
+ else:
2390
+ # Fallback to regular table display
2391
+ _rich_print_gt_table(gt_table)
2392
+
2393
+ except Exception as e:
2394
+ console.print(f"[red]Error rendering missing values table:[/red] {e}")
2395
+ # Fallback to regular table display
2396
+ _rich_print_gt_table(gt_table)
2397
+
2398
+
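A sketch of feeding this renderer, assuming pointblank's missing_vals_tbl() as the source of the GT table:

    import pointblank as pb

    data = pb.load_dataset("small_table")
    _rich_print_missing_table(pb.missing_vals_tbl(data), original_data=data)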
2399
+ def _map_parameters_to_checks(
2400
+ checks_list: list[str], columns_list: list[str], sets_list: list[str], values_list: list[float]
2401
+ ) -> tuple[list[str], list[str], list[float]]:
2402
+ """
2403
+ Map parameters to checks intelligently, handling flexible parameter ordering.
2404
+
2405
+ This function distributes the provided parameters across checks based on what each check needs.
2406
+ For checks that don't need certain parameters, None/empty values are assigned.
2407
+
2408
+ Args:
2409
+ checks_list: List of validation check types
2410
+ columns_list: List of column names provided by user
2411
+ sets_list: List of set values provided by user
2412
+ values_list: List of numeric values provided by user
2413
+
2414
+ Returns:
2415
+ Tuple of (mapped_columns, mapped_sets, mapped_values) where each list
2416
+ has the same length as checks_list
2417
+ """
2418
+ mapped_columns = []
2419
+ mapped_sets = []
2420
+ mapped_values = []
2421
+
2422
+ # Keep track of which parameters we've used
2423
+ column_index = 0
2424
+ set_index = 0
2425
+ value_index = 0
2426
+
2427
+ for check in checks_list:
2428
+ # Determine what parameters this check needs
2429
+ needs_column = check in [
2430
+ "col-vals-not-null",
2431
+ "col-exists",
2432
+ "col-vals-in-set",
2433
+ "col-vals-gt",
2434
+ "col-vals-ge",
2435
+ "col-vals-lt",
2436
+ "col-vals-le",
2437
+ ]
2438
+ needs_set = check == "col-vals-in-set"
2439
+ needs_value = check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"]
2440
+
2441
+ # Assign column parameter if needed
2442
+ if needs_column:
2443
+ if column_index < len(columns_list):
2444
+ mapped_columns.append(columns_list[column_index])
2445
+ column_index += 1
2446
+ else:
2447
+ mapped_columns.append(None) # Will cause validation error later
2448
+ else:
2449
+ mapped_columns.append(None)
2450
+
2451
+ # Assign set parameter if needed
2452
+ if needs_set:
2453
+ if set_index < len(sets_list):
2454
+ mapped_sets.append(sets_list[set_index])
2455
+ set_index += 1
2456
+ else:
2457
+ mapped_sets.append(None) # Will cause validation error later
2458
+ else:
2459
+ mapped_sets.append(None)
2460
+
2461
+ # Assign value parameter if needed
2462
+ if needs_value:
2463
+ if value_index < len(values_list):
2464
+ mapped_values.append(values_list[value_index])
2465
+ value_index += 1
2466
+ else:
2467
+ mapped_values.append(None) # Will cause validation error later
2468
+ else:
2469
+ mapped_values.append(None)
2470
+
2471
+ return mapped_columns, mapped_sets, mapped_values
2472
+
2473
+
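For example, the mapping hands each positional parameter only to the checks that need it:

    checks = ["rows-distinct", "col-vals-gt", "col-vals-in-set"]
    cols, sets_, vals = _map_parameters_to_checks(
        checks, ["score", "status"], ["active,inactive"], [50.0]
    )
    # cols  == [None, "score", "status"]   (rows-distinct takes no column)
    # sets_ == [None, None, "active,inactive"]
    # vals  == [None, 50.0, None]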
2474
+ def _resolve_column_indices(columns_list, data):
2475
+ """
2476
+ Replace any '#N' entries in columns_list with the actual column name from data (1-based).
2477
+ """
2478
+ # Get column names from the data
2479
+ if hasattr(data, "columns"):
2480
+ all_columns = list(data.columns)
2481
+ elif hasattr(data, "schema"):
2482
+ all_columns = list(data.schema.names)
2483
+ else:
2484
+ return columns_list # Can't resolve, return as-is
2485
+
2486
+ resolved = []
2487
+ for col in columns_list:
2488
+ if isinstance(col, str) and col.startswith("#"):
2489
+ try:
2490
+ idx = int(col[1:]) - 1 # 1-based to 0-based
2491
+ if 0 <= idx < len(all_columns):
2492
+ resolved.append(all_columns[idx])
2493
+ else:
2494
+ resolved.append(col) # Out of range, keep as-is
2495
+ except Exception:
2496
+ resolved.append(col) # Not a valid number, keep as-is
2497
+ else:
2498
+ resolved.append(col)
2499
+ return resolved
2500
+
2501
+
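For example, with a three-column table (column names and data values are illustrative):

    import polars as pl

    df = pl.DataFrame({"a": [1], "b": [2], "c": [3]})
    _resolve_column_indices(["#2", "c", "#9"], df)
    # -> ["b", "c", "#9"]  ('#9' is out of range, kept for later error reporting)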
2502
+ def _display_validation_result(
2503
+ validation: Any,
2504
+ checks_list: list[str],
2505
+ columns_list: list[str],
2506
+ sets_list: list[str],
2507
+ values_list: list[float],
2508
+ data_source: str,
2509
+ step_index: int,
2510
+ total_checks: int,
2511
+ show_extract: bool,
2512
+ write_extract: str | None,
2513
+ limit: int,
2514
+ ) -> None:
2515
+ """Display a single validation result with proper formatting for single or multiple checks."""
2516
+ from rich.box import SIMPLE_HEAD
2517
+
2518
+ # Get parameters for this specific check
2519
+ check = checks_list[step_index]
2520
+ column = columns_list[step_index] if step_index < len(columns_list) else None
2521
+ set_val = sets_list[step_index] if step_index < len(sets_list) else None
2522
+ value = values_list[step_index] if step_index < len(values_list) else None
2523
+
2524
+ # Get validation step info
2525
+ step_info = None
2526
+ if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
2527
+ step_info = validation.validation_info[step_index]
2528
+
2529
+ # Map each check type to a friendly display name; the same mapping
2530
+ # serves both the single-check and multi-check title formats
2531
+ check_titles = {
2532
+ "rows-distinct": "Rows Distinct",
2533
+ "rows-complete": "Rows Complete",
2534
+ "col-exists": "Column Exists",
2535
+ "col-vals-not-null": "Column Values Not Null",
2536
+ "col-vals-in-set": "Column Values In Set",
2537
+ "col-vals-gt": "Column Values Greater Than",
2538
+ "col-vals-ge": "Column Values Greater Than Or Equal",
2539
+ "col-vals-lt": "Column Values Less Than",
2540
+ "col-vals-le": "Column Values Less Than Or Equal",
2541
+ }
2542
+ base_title = check_titles.get(check, check.replace("-", " ").title())
2543
+ if total_checks == 1:
2544
+ table_title = f"Validation Result: {base_title}"
2545
+ else:
2546
+ # Multiple checks - add numbering
2547
+ table_title = f"Validation Result ({step_index + 1} of {total_checks}): {base_title}"
2576
+
2577
+ # Create the validation results table
2578
+ result_table = Table(
2579
+ title=table_title,
2580
+ show_header=True,
2581
+ header_style="bold magenta",
2582
+ box=SIMPLE_HEAD,
2583
+ title_style="bold cyan",
2584
+ title_justify="left",
2585
+ )
2586
+ result_table.add_column("Property", style="cyan", no_wrap=True)
2587
+ result_table.add_column("Value", style="white")
2588
+
2589
+ # Add basic info
2590
+ result_table.add_row("Data Source", data_source)
2591
+ result_table.add_row("Check Type", check)
2592
+
2593
+ # Add column info for column-specific checks
2594
+ if check in [
2595
+ "col-vals-not-null",
2596
+ "col-exists",
2597
+ "col-vals-in-set",
2598
+ "col-vals-gt",
2599
+ "col-vals-ge",
2600
+ "col-vals-lt",
2601
+ "col-vals-le",
2602
+ ]:
2603
+ result_table.add_row("Column", column)
2604
+
2605
+ # Add set info for col-vals-in-set check
2606
+ if check == "col-vals-in-set" and set_val:
2607
+ allowed_values = [v.strip() for v in set_val.split(",")]
2608
+ result_table.add_row("Allowed Values", ", ".join(allowed_values))
2609
+
2610
+ # Add value info for range checks
2611
+ if check in ["col-vals-gt", "col-vals-ge", "col-vals-lt", "col-vals-le"] and value is not None:
2612
+ if check == "col-vals-gt":
2613
+ operator = ">"
2614
+ elif check == "col-vals-ge":
2615
+ operator = ">="
2616
+ elif check == "col-vals-lt":
2617
+ operator = "<"
2618
+ elif check == "col-vals-le":
2619
+ operator = "<="
2620
+ result_table.add_row("Threshold", f"{operator} {value}")
2621
+
2622
+ # Get validation details
2623
+ if step_info:
2624
+ result_table.add_row("Total Rows Tested", f"{step_info.n:,}")
2625
+ result_table.add_row("Passing Rows", f"{step_info.n_passed:,}")
2626
+ result_table.add_row("Failing Rows", f"{step_info.n_failed:,}")
2627
+
2628
+ # Check if this step passed
2629
+ step_passed = step_info.n_failed == 0
2630
+
2631
+ # Overall result with color coding
2632
+ if step_passed:
2633
+ result_table.add_row("Result", "[green]✓ PASSED[/green]")
2634
+ if check == "rows-distinct":
2635
+ result_table.add_row("Duplicate Rows", "[green]None found[/green]")
2636
+ elif check == "col-vals-not-null":
2637
+ result_table.add_row("Null Values", "[green]None found[/green]")
2638
+ elif check == "rows-complete":
2639
+ result_table.add_row("Incomplete Rows", "[green]None found[/green]")
2640
+ elif check == "col-exists":
2641
+ result_table.add_row("Column Status", "[green]Column exists[/green]")
2642
+ elif check == "col-vals-in-set":
2643
+ result_table.add_row("Values Status", "[green]All values in allowed set[/green]")
2644
+ elif check == "col-vals-gt":
2645
+ result_table.add_row("Values Status", f"[green]All values > {value}[/green]")
2646
+ elif check == "col-vals-ge":
2647
+ result_table.add_row("Values Status", f"[green]All values >= {value}[/green]")
2648
+ elif check == "col-vals-lt":
2649
+ result_table.add_row("Values Status", f"[green]All values < {value}[/green]")
2650
+ elif check == "col-vals-le":
2651
+ result_table.add_row("Values Status", f"[green]All values <= {value}[/green]")
2652
+ else:
2653
+ result_table.add_row("Result", "[red]✗ FAILED[/red]")
2654
+ if check == "rows-distinct":
2655
+ result_table.add_row("Duplicate Rows", f"[red]{step_info.n_failed:,} found[/red]")
2656
+ elif check == "col-vals-not-null":
2657
+ result_table.add_row("Null Values", f"[red]{step_info.n_failed:,} found[/red]")
2658
+ elif check == "rows-complete":
2659
+ result_table.add_row("Incomplete Rows", f"[red]{step_info.n_failed:,} found[/red]")
2660
+ elif check == "col-exists":
2661
+ result_table.add_row("Column Status", "[red]Column does not exist[/red]")
2662
+ elif check == "col-vals-in-set":
2663
+ result_table.add_row("Invalid Values", f"[red]{step_info.n_failed:,} found[/red]")
2664
+ elif check == "col-vals-gt":
2665
+ result_table.add_row(
2666
+ "Invalid Values", f"[red]{step_info.n_failed:,} values <= {value}[/red]"
2667
+ )
2668
+ elif check == "col-vals-ge":
2669
+ result_table.add_row(
2670
+ "Invalid Values", f"[red]{step_info.n_failed:,} values < {value}[/red]"
2671
+ )
2672
+ elif check == "col-vals-lt":
2673
+ result_table.add_row(
2674
+ "Invalid Values", f"[red]{step_info.n_failed:,} values >= {value}[/red]"
2675
+ )
2676
+ elif check == "col-vals-le":
2677
+ result_table.add_row(
2678
+ "Invalid Values", f"[red]{step_info.n_failed:,} values > {value}[/red]"
2679
+ )
2680
+
2681
+ console.print()
2682
+ console.print(result_table)
2683
+
2684
+ # Show extract and summary for single check only, or if this is a failed step in multiple checks
2685
+ if total_checks == 1:
2686
+ # For single check, show extract and summary as before
2687
+ _show_extract_and_summary(
2688
+ validation,
2689
+ check,
2690
+ column,
2691
+ set_val,
2692
+ value,
2693
+ data_source,
2694
+ step_index,
2695
+ step_info,
2696
+ show_extract,
2697
+ write_extract,
2698
+ limit,
2699
+ )
2700
+ else:
2701
+ # For multiple checks, show summary panel and handle extract if needed
2702
+ if step_info:
2703
+ step_passed = step_info.n_failed == 0
2704
+ if step_passed:
2705
+ # Create success message for this step
2706
+ if check == "rows-distinct":
2707
+ success_message = f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
2708
+ elif check == "col-vals-not-null":
2709
+ success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
2710
+ elif check == "rows-complete":
2711
+ success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
2712
+ elif check == "col-exists":
2713
+ success_message = f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
2714
+ elif check == "col-vals-in-set":
2715
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
2716
+ elif check == "col-vals-gt":
2717
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
2718
+ elif check == "col-vals-ge":
2719
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
2720
+ elif check == "col-vals-lt":
2721
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
2722
+ elif check == "col-vals-le":
2723
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
2724
+ else:
2725
+ success_message = f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
2726
+
2727
+ console.print(
2728
+ Panel(
2729
+ success_message,
2730
+ border_style="green",
2731
+ )
2732
+ )
2733
+ else:
2734
+ # Create failure message for this step (without tip)
2735
+ if check == "rows-distinct":
2736
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
2737
+ elif check == "col-vals-not-null":
2738
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
2739
+ elif check == "rows-complete":
2740
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
2741
+ elif check == "col-exists":
2742
+ failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
2743
+ elif check == "col-vals-in-set":
2744
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
2745
+ elif check == "col-vals-gt":
2746
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
2747
+ elif check == "col-vals-ge":
2748
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
2749
+ elif check == "col-vals-lt":
2750
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
2751
+ elif check == "col-vals-le":
2752
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
2753
+ else:
2754
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
2755
+
2756
+ console.print(
2757
+ Panel(
2758
+ failure_message,
2759
+ border_style="red",
2760
+ )
2761
+ )
2762
+
2763
+ # For multiple checks, show extract if requested and this step failed
2764
+ if (show_extract or write_extract) and not step_passed:
2765
+ _show_extract_for_multi_check(
2766
+ validation,
2767
+ check,
2768
+ column,
2769
+ set_val,
2770
+ value,
2771
+ data_source,
2772
+ step_index,
2773
+ step_info,
2774
+ show_extract,
2775
+ write_extract,
2776
+ limit,
2777
+ )
2778
+
2779
+
2780
+ def _show_extract_for_multi_check(
2781
+ validation: Any,
2782
+ check: str,
2783
+ column: str | None,
2784
+ set_val: str | None,
2785
+ value: float | None,
2786
+ data_source: str,
2787
+ step_index: int,
2788
+ step_info: Any,
2789
+ show_extract: bool,
2790
+ write_extract: str | None,
2791
+ limit: int,
2792
+ ) -> None:
2793
+ """Show extract for a single validation step in multiple checks scenario."""
2794
+ # Dynamic message based on check type
2795
+ if check == "rows-distinct":
2796
+ extract_message = "[yellow]Extract of failing rows (duplicates):[/yellow]"
2797
+ row_type = "duplicate rows"
2798
+ elif check == "rows-complete":
2799
+ extract_message = "[yellow]Extract of failing rows (incomplete rows):[/yellow]"
2800
+ row_type = "incomplete rows"
2801
+ elif check == "col-exists":
2802
+ extract_message = f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
2803
+ row_type = "missing column"
2804
+ elif check == "col-vals-not-null":
2805
+ extract_message = f"[yellow]Extract of failing rows (null values in '{column}'):[/yellow]"
2806
+ row_type = "rows with null values"
2807
+ elif check == "col-vals-in-set":
2808
+ extract_message = (
2809
+ f"[yellow]Extract of failing rows (invalid values in '{column}'):[/yellow]"
2810
+ )
2811
+ row_type = "rows with invalid values"
2812
+ elif check == "col-vals-gt":
2813
+ extract_message = (
2814
+ f"[yellow]Extract of failing rows (values in '{column}' <= {value}):[/yellow]"
2815
+ )
2816
+ row_type = f"rows with values <= {value}"
2817
+ elif check == "col-vals-ge":
2818
+ extract_message = (
2819
+ f"[yellow]Extract of failing rows (values in '{column}' < {value}):[/yellow]"
2820
+ )
2821
+ row_type = f"rows with values < {value}"
2822
+ elif check == "col-vals-lt":
2823
+ extract_message = (
2824
+ f"[yellow]Extract of failing rows (values in '{column}' >= {value}):[/yellow]"
2825
+ )
2826
+ row_type = f"rows with values >= {value}"
2827
+ elif check == "col-vals-le":
2828
+ extract_message = (
2829
+ f"[yellow]Extract of failing rows (values in '{column}' > {value}):[/yellow]"
2830
+ )
2831
+ row_type = f"rows with values > {value}"
2832
+ else:
2833
+ extract_message = "[yellow]Extract of failing rows:[/yellow]"
2834
+ row_type = "failing rows"
2835
+
2836
+ if show_extract:
2837
+ console.print()
2838
+ console.print(extract_message)
2839
+
2840
+ # Special handling for col-exists check - no rows to show when column doesn't exist
2841
+ if check == "col-exists":
2842
+ if show_extract:
2843
+ console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
2844
+ console.print(
2845
+ "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
2846
+ )
2847
+ if write_extract:
2848
+ console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
2849
+ else:
2850
+ try:
2851
+ # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
2852
+ failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
2853
+
2854
+ if failing_rows is not None and len(failing_rows) > 0:
2855
+ if show_extract:
2856
+ # Limit the number of rows shown
2857
+ if len(failing_rows) > limit:
2858
+ display_rows = failing_rows.head(limit)
2859
+ console.print(
2860
+ f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
2861
+ )
2862
+ else:
2863
+ display_rows = failing_rows
2864
+ console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")
2865
+
2866
+ # Create a preview table using pointblank's preview function
2867
+ import pointblank as pb
2868
+
2869
+ preview_table = pb.preview(
2870
+ data=display_rows,
2871
+ n_head=min(limit, len(display_rows)),
2872
+ n_tail=0,
2873
+ limit=limit,
2874
+ show_row_numbers=True,
2875
+ )
2876
+
2877
+ # Display using our Rich table function
2878
+ _rich_print_gt_table(preview_table, show_summary=False)
2879
+
2880
+ if write_extract:
2881
+ try:
2882
+ from pathlib import Path
2883
+
2884
+ folder_name = write_extract
2885
+
2886
+ # Create the output folder
2887
+ output_folder = Path(folder_name)
2888
+ output_folder.mkdir(parents=True, exist_ok=True)
2889
+
2890
+ # Create safe filename from check type
2891
+ safe_check_type = check.replace("-", "_")
2892
+ filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
2893
+ filepath = output_folder / filename
2894
+
2895
+ # Limit the output if needed
2896
+ write_rows = failing_rows
2897
+ if len(failing_rows) > limit:
2898
+ write_rows = failing_rows.head(limit)
2899
+
2900
+ # Save to CSV
2901
+ if hasattr(write_rows, "write_csv"):
2902
+ # Polars
2903
+ write_rows.write_csv(str(filepath))
2904
+ elif hasattr(write_rows, "to_csv"):
2905
+ # Pandas
2906
+ write_rows.to_csv(str(filepath), index=False)
2907
+ else:
2908
+ # Try converting to pandas as fallback
2909
+ import pandas as pd
2910
+
2911
+ pd_data = pd.DataFrame(write_rows)
2912
+ pd_data.to_csv(str(filepath), index=False)
2913
+
2914
+ rows_saved = len(write_rows) if hasattr(write_rows, "__len__") else limit
2915
+ console.print(
2916
+ f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
2917
+ )
2918
+ console.print(f"[dim] - {filename}: {rows_saved} rows[/dim]")
2919
+ except Exception as e:
2920
+ console.print(f"[yellow]Warning: Could not save failing rows: {e}[/yellow]")
2921
+ else:
2922
+ if show_extract:
2923
+ console.print("[yellow]No failing rows could be extracted[/yellow]")
2924
+ if write_extract:
2925
+ console.print("[yellow]No failing rows could be extracted to save[/yellow]")
2926
+ except Exception as e:
2927
+ if show_extract:
2928
+ console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
2929
+ if write_extract:
2930
+ console.print(f"[yellow]Could not extract failing rows to save: {e}[/yellow]")
2931
+
2932
+
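Both extract paths rely on Validate.get_data_extracts() being 1-indexed by validation step; a minimal sketch, assuming small_table's numeric column d:

    import pointblank as pb

    v = (
        pb.Validate(pb.load_dataset("small_table"))
        .col_vals_gt(columns="d", value=10000)
        .interrogate()
    )
    failing = v.get_data_extracts(i=1, frame=True)  # failing rows for step 1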
2933
+ def _show_extract_and_summary(
2934
+ validation: Any,
2935
+ check: str,
2936
+ column: str | None,
2937
+ set_val: str | None,
2938
+ value: float | None,
2939
+ data_source: str,
2940
+ step_index: int,
2941
+ step_info: Any,
2942
+ show_extract: bool,
2943
+ write_extract: str | None,
2944
+ limit: int,
2945
+ ) -> None:
2946
+ """Show extract and summary for a validation step (used for single checks)."""
2947
+ step_passed = step_info.n_failed == 0 if step_info else True
2948
+
2949
+ # Show extract if requested and validation failed
2950
+ if (show_extract or write_extract) and not step_passed:
2951
+ console.print()
2952
+
2953
+ # Dynamic message based on check type
2954
+ if check == "rows-distinct":
2955
+ extract_message = "[yellow]Extract of failing rows (duplicates):[/yellow]"
2956
+ row_type = "duplicate rows"
2957
+ elif check == "rows-complete":
2958
+ extract_message = "[yellow]Extract of failing rows (incomplete rows):[/yellow]"
2959
+ row_type = "incomplete rows"
2960
+ elif check == "col-exists":
2961
+ extract_message = f"[yellow]Column '{column}' does not exist in the dataset[/yellow]"
2962
+ row_type = "missing column"
2963
+ elif check == "col-vals-not-null":
2964
+ extract_message = (
2965
+ f"[yellow]Extract of failing rows (null values in '{column}'):[/yellow]"
2966
+ )
2967
+ row_type = "rows with null values"
2968
+ elif check == "col-vals-in-set":
2969
+ extract_message = (
2970
+ f"[yellow]Extract of failing rows (invalid values in '{column}'):[/yellow]"
2971
+ )
2972
+ row_type = "rows with invalid values"
2973
+ elif check == "col-vals-gt":
2974
+ extract_message = (
2975
+ f"[yellow]Extract of failing rows (values in '{column}' <= {value}):[/yellow]"
2976
+ )
2977
+ row_type = f"rows with values <= {value}"
2978
+ elif check == "col-vals-ge":
2979
+ extract_message = (
2980
+ f"[yellow]Extract of failing rows (values in '{column}' < {value}):[/yellow]"
2981
+ )
2982
+ row_type = f"rows with values < {value}"
2983
+ elif check == "col-vals-lt":
2984
+ extract_message = (
2985
+ f"[yellow]Extract of failing rows (values in '{column}' >= {value}):[/yellow]"
2986
+ )
2987
+ row_type = f"rows with values >= {value}"
2988
+ elif check == "col-vals-le":
2989
+ extract_message = (
2990
+ f"[yellow]Extract of failing rows (values in '{column}' > {value}):[/yellow]"
2991
+ )
2992
+ row_type = f"rows with values > {value}"
2993
+ else:
2994
+ extract_message = "[yellow]Extract of failing rows:[/yellow]"
2995
+ row_type = "failing rows"
2996
+
2997
+ if show_extract:
2998
+ console.print(extract_message)
2999
+
3000
+ # Special handling for col-exists check - no rows to show when column doesn't exist
3001
+ if check == "col-exists" and not step_passed:
3002
+ if show_extract:
3003
+ console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
3004
+ console.print(
3005
+ "[dim]Use --show-extract with other check types to see failing data rows.[/dim]"
3006
+ )
3007
+ if write_extract:
3008
+ console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
3009
+ else:
3010
+ try:
3011
+ # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
3012
+ failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
3013
+
3014
+ if failing_rows is not None and len(failing_rows) > 0:
3015
+ if show_extract:
3016
+ # Limit the number of rows shown
3017
+ if len(failing_rows) > limit:
3018
+ display_rows = failing_rows.head(limit)
3019
+ console.print(
3020
+ f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
3021
+ )
3022
+ else:
3023
+ display_rows = failing_rows
3024
+ console.print(f"[dim]Showing all {len(failing_rows)} {row_type}[/dim]")
3025
+
3026
+ # Create a preview table using pointblank's preview function
3027
+ import pointblank as pb
3028
+
3029
+ preview_table = pb.preview(
3030
+ data=display_rows,
3031
+ n_head=min(limit, len(display_rows)),
3032
+ n_tail=0,
3033
+ limit=limit,
3034
+ show_row_numbers=True,
3035
+ )
3036
+
3037
+ # Display using our Rich table function
3038
+ _rich_print_gt_table(preview_table, show_summary=False)
3039
+
3040
+ if write_extract:
3041
+ try:
3042
+ from pathlib import Path
3043
+
3044
+ folder_name = write_extract
3045
+
3046
+ # Create the output folder
3047
+ output_folder = Path(folder_name)
3048
+ output_folder.mkdir(parents=True, exist_ok=True)
3049
+
3050
+ # Create safe filename from check type
3051
+ safe_check_type = check.replace("-", "_")
3052
+ filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
3053
+ filepath = output_folder / filename
3054
+
3055
+ # Limit the output if needed
3056
+ write_rows = failing_rows
3057
+ if len(failing_rows) > limit:
3058
+ write_rows = failing_rows.head(limit)
3059
+
3060
+ # Save to CSV
3061
+ if hasattr(write_rows, "write_csv"):
3062
+ # Polars
3063
+ write_rows.write_csv(str(filepath))
3064
+ elif hasattr(write_rows, "to_csv"):
3065
+ # Pandas
3066
+ write_rows.to_csv(str(filepath), index=False)
3067
+ else:
3068
+ # Try converting to pandas as fallback
3069
+ import pandas as pd
3070
+
3071
+ pd_data = pd.DataFrame(write_rows)
3072
+ pd_data.to_csv(str(filepath), index=False)
3073
+
3074
+ rows_saved = (
3075
+ len(write_rows) if hasattr(write_rows, "__len__") else limit
3076
+ )
3077
+ console.print(
3078
+ f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
3079
+ )
3080
+ console.print(f"[dim] - {filename}: {rows_saved} rows[/dim]")
3081
+ except Exception as e:
3082
+ console.print(
3083
+ f"[yellow]Warning: Could not save failing rows: {e}[/yellow]"
3084
+ )
3085
+ else:
3086
+ if show_extract:
3087
+ console.print("[yellow]No failing rows could be extracted[/yellow]")
3088
+ if write_extract:
3089
+ console.print("[yellow]No failing rows could be extracted to save[/yellow]")
3090
+ except Exception as e:
3091
+ if show_extract:
3092
+ console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
3093
+ if write_extract:
3094
+ console.print(f"[yellow]Could not extract failing rows to save: {e}[/yellow]")
3095
+
3096
+ # Summary message
3097
+ console.print()
3098
+ if step_passed:
3099
+ if check == "rows-distinct":
3100
+ success_message = (
3101
+ f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
3102
+ )
3103
+ elif check == "col-vals-not-null":
3104
+ success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
3105
+ elif check == "rows-complete":
3106
+ success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
3107
+ elif check == "col-exists":
3108
+ success_message = (
3109
+ f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
3110
+ )
3111
+ elif check == "col-vals-in-set":
3112
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
3113
+ elif check == "col-vals-gt":
3114
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
3115
+ elif check == "col-vals-ge":
3116
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
3117
+ elif check == "col-vals-lt":
3118
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
3119
+ elif check == "col-vals-le":
3120
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
3121
+ else:
3122
+ success_message = (
3123
+ f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
3124
+ )
3125
+
3126
+ console.print(Panel(success_message, border_style="green"))
3127
+ else:
3128
+ if step_info:
3129
+ if check == "rows-distinct":
3130
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
3131
+ elif check == "col-vals-not-null":
3132
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
3133
+ elif check == "rows-complete":
3134
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
3135
+ elif check == "col-exists":
3136
+ failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
3137
+ elif check == "col-vals-in-set":
3138
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
3139
+ elif check == "col-vals-gt":
3140
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
3141
+ elif check == "col-vals-ge":
3142
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
3143
+ elif check == "col-vals-lt":
3144
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
3145
+ elif check == "col-vals-le":
3146
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
3147
+ else:
3148
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
3149
+
3150
+ # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
3151
+ if not show_extract and check != "col-exists":
3152
+ failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
3153
+
3154
+ console.print(Panel(failure_message, border_style="red"))
3155
+ else:
3156
+ if check == "rows-distinct":
3157
+ failure_message = (
3158
+ f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
3159
+ )
3160
+ elif check == "rows-complete":
3161
+ failure_message = (
3162
+ f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
3163
+ )
3164
+ else:
3165
+ failure_message = (
3166
+ f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
3167
+ )
3168
+
3169
+ # Add hint about --show-extract if not already used
3170
+ if not show_extract:
3171
+ failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
3172
+
3173
+ console.print(Panel(failure_message, border_style="red"))
3174
+
3175
+
3176
+ @cli.command()
3177
+ @click.argument("output_file", type=click.Path())
3178
+ def make_template(output_file: str):
3179
+ """
3180
+ Create a validation script template.
3181
+
3182
+ Creates a sample Python script with examples showing how to use Pointblank
3183
+ for data validation. Edit the template to add your own data loading and
3184
+ validation rules, then run it with 'pb run'.
3185
+
3186
+ OUTPUT_FILE is the path where the template script will be created.
3187
+
3188
+ Examples:
3189
+
3190
+ \b
3191
+ pb make-template my_validation.py
3192
+ pb make-template validation_template.py
3193
+ """
3194
+ example_script = '''"""
3195
+ Example Pointblank validation script.
3196
+
3197
+ This script demonstrates how to create validation rules for your data.
3198
+ Modify the data loading and validation rules below to match your requirements.
3199
+ """
3200
+
3201
+ import pointblank as pb
3202
+
3203
+ # Load your data (replace this with your actual data source)
3204
+ # You can load from various sources:
3205
+ # data = pb.load_dataset("small_table") # Built-in dataset
3206
+ # data = pd.read_csv("your_data.csv") # CSV file (needs: import pandas as pd)
3207
+ # data = pl.read_parquet("data.parquet") # Parquet file (needs: import polars as pl)
3208
+ # data = "duckdb:///path/to/db.ddb::table_name" # Database connection string
3209
+
3210
+ data = pb.load_dataset("small_table") # Example with built-in dataset
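+ # When this script is executed via `pb run --data <source>`, the CLI also
+ # injects a `cli_data` variable; a script may prefer it when present:
+ # if cli_data is not None:
+ #     data = cli_data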
3211
+
3212
+ # Create a validation object
3213
+ validation = (
3214
+ pb.Validate(
3215
+ data=data,
3216
+ tbl_name="Example Data",
3217
+ label="Validation Example",
3218
+ thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
3219
+ )
3220
+ # Add your validation rules here
3221
+ # Example rules (modify these based on your data structure):
3222
+
3223
+ # Check that specific columns exist
3224
+ # .col_exists(["column1", "column2"])
3225
+
3226
+ # Check for null values
3227
+ # .col_vals_not_null(columns="important_column")
3228
+
3229
+ # Check value ranges
3230
+ # .col_vals_gt(columns="amount", value=0)
3231
+ # .col_vals_between(columns="score", left=0, right=100)
3232
+
3233
+ # Check string patterns
3234
+ # .col_vals_regex(columns="email", pattern=r"^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$")
3235
+
3236
+ # Check that rows are unique (e.g. no duplicate ids)
3237
+ # .rows_distinct(columns_subset=["id"])
3238
+
3239
+ # Finalize the validation
3240
+ .interrogate()
3241
+ )
3242
+
3243
+ # The validation object will be automatically used by the CLI
3244
+ # You can also access results programmatically:
3245
+ # print(f"All passed: {validation.all_passed()}")
3246
+ # print(f"Failed steps: {validation.n_failed()}")
3247
+ '''
3248
+
3249
+ Path(output_file).write_text(example_script)
3250
+ console.print(f"[green]✓[/green] Validation script template created: {output_file}")
3251
+ console.print("\nEdit the template to add your data loading and validation rules, then run:")
3252
+ console.print(f"[cyan]pb run {output_file}[/cyan]")
3253
+ console.print(
3254
+ f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Override data source[/dim]"
3255
+ )
3256
+
3257
+
3258
+ @cli.command()
3259
+ @click.argument("validation_script", type=click.Path(exists=True))
3260
+ @click.option("--data", type=str, help="Optional data source to override script's data loading")
3261
+ @click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
3262
+ @click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
3263
+ @click.option(
3264
+ "--show-extract", is_flag=True, help="Show extract of failing rows if validation fails"
3265
+ )
3266
+ @click.option(
3267
+ "--write-extract",
3268
+ type=str,
3269
+ help="Save failing rows to folders (one CSV per step). Provide base name for folder.",
3270
+ )
3271
+ @click.option(
3272
+ "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
3273
+ )
3274
+ @click.option(
3275
+ "--fail-on",
3276
+ type=click.Choice(["critical", "error", "warning", "any"], case_sensitive=False),
3277
+ help="Exit with non-zero code when validation reaches this threshold level",
3278
+ )
3279
+ def run(
3280
+ validation_script: str,
3281
+ data: str | None,
3282
+ output_html: str | None,
3283
+ output_json: str | None,
3284
+ show_extract: bool,
3285
+ write_extract: str | None,
3286
+ limit: int,
3287
+ fail_on: str | None,
3288
+ ):
3289
+ """
3290
+ Run a Pointblank validation script.
3291
+
3292
+ VALIDATION_SCRIPT should be a Python file that defines validation logic.
3293
+ The script should load its own data and create validation objects.
3294
+
3295
+ If --data is provided, it will be available as a 'cli_data' variable in the script,
3296
+ allowing you to optionally override your script's data loading.
3297
+
3298
+ DATA can be:
3299
+
3300
+ \b
3301
+ - CSV file path (e.g., data.csv)
3302
+ - Parquet file path or pattern (e.g., data.parquet, data/*.parquet)
3303
+ - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
3304
+ - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
3305
+ - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
3306
+
3307
+ Examples:
3308
+
3309
+ \b
3310
+ pb run validation_script.py
3311
+ pb run validation_script.py --data data.csv
3312
+ pb run validation_script.py --data small_table --output-html report.html
3313
+ pb run validation_script.py --show-extract --fail-on error
3314
+ pb run validation_script.py --write-extract extracts_folder --fail-on critical
3315
+ """
3316
+ try:
3317
+ # Load optional data override if provided
3318
+ cli_data = None
3319
+ if data:
3320
+ with console.status(f"[bold green]Loading data from {data}..."):
3321
+ cli_data = _load_data_source(data)
3322
+ console.print(f"[green]✓[/green] Loaded data override: {data}")
3323
+
3324
+ # Execute the validation script
3325
+ with console.status("[bold green]Running validation script..."):
3326
+ # Read and execute the validation script
3327
+ script_content = Path(validation_script).read_text()
3328
+
3329
+ # Create a namespace with pointblank and optional CLI data
3330
+ namespace = {
3331
+ "pb": pb,
3332
+ "pointblank": pb,
3333
+ "cli_data": cli_data, # Available if --data was provided
3334
+ "__name__": "__main__",
3335
+ "__file__": str(Path(validation_script).resolve()),
3336
+ }
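+ # Since __name__ is set to "__main__", any `if __name__ == "__main__":`
+ # block in the script runs exactly as it would under `python script.py`.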
3337
+
3338
+ # Execute the script
3339
+ try:
3340
+ exec(script_content, namespace)
3341
+ except Exception as e:
3342
+ console.print(f"[red]Error executing validation script:[/red] {e}")
3343
+ sys.exit(1)
3344
+
3345
+ # Look for validation objects in the namespace
3346
+ validations = []
3347
+
3348
+ # Look for the 'validation' variable specifically first
3349
+ if "validation" in namespace:
3350
+ validations.append(namespace["validation"])
3351
+
3352
+ # Also look for any other validation objects
3353
+ for key, value in namespace.items():
3354
+ if (
3355
+ key != "validation"
3356
+ and hasattr(value, "interrogate")
3357
+ and hasattr(value, "validation_info")
3358
+ ):
3359
+ validations.append(value)
3360
+ # Also check if it's a Validate object that has been interrogated
3361
+ elif key != "validation" and str(type(value)).find("Validate") != -1:
3362
+ validations.append(value)
3363
+
3364
+ if not validations:
3365
+ raise ValueError(
3366
+ "No validation objects found in script. "
3367
+ "Script should create Validate objects and call .interrogate() on them."
3368
+ )
3369
+
3370
+ console.print(f"[green]✓[/green] Found {len(validations)} validation object(s)")
3371
+
3372
+ # Process each validation
3373
+ overall_failed = False
3374
+ overall_critical = False
3375
+ overall_error = False
3376
+ overall_warning = False
3377
+
3378
+ for i, validation in enumerate(validations, 1):
3379
+ if len(validations) > 1:
3380
+ console.print(f"\n[bold cyan]Validation {i}:[/bold cyan]")
3381
+
3382
+ # Display summary
3383
+ _display_validation_summary(validation)
3384
+
3385
+ # Check failure status
3386
+ validation_failed = False
3387
+ has_critical = False
3388
+ has_error = False
3389
+ has_warning = False
3390
+
3391
+ if hasattr(validation, "validation_info") and validation.validation_info:
3392
+ for step_info in validation.validation_info:
3393
+ if step_info.critical:
3394
+ has_critical = True
3395
+ overall_critical = True
3396
+ if step_info.error:
3397
+ has_error = True
3398
+ overall_error = True
3399
+ if step_info.warning:
3400
+ has_warning = True
3401
+ overall_warning = True
3402
+ if step_info.n_failed > 0:
3403
+ validation_failed = True
3404
+ overall_failed = True
3405
+
3406
+ # Handle extract functionality for failed validations
3407
+ failed_steps = []
3408
+ if (
3409
+ validation_failed
3410
+ and hasattr(validation, "validation_info")
3411
+ and validation.validation_info
3412
+ ):
3413
+ for j, step_info in enumerate(validation.validation_info, 1):
3414
+ if step_info.n_failed > 0:
3415
+ failed_steps.append((j, step_info))
3416
+
3417
+ if validation_failed and failed_steps and (show_extract or write_extract):
3418
+ console.print()
3419
+
3420
+ if show_extract:
3421
+ extract_title = "Extract of failing rows from validation steps"
3422
+ if len(validations) > 1:
3423
+ extract_title += f" (Validation {i})"
3424
+ console.print(f"[yellow]{extract_title}:[/yellow]")
3425
+
3426
+ for step_num, step_info in failed_steps:
3427
+ try:
3428
+ failing_rows = validation.get_data_extracts(i=step_num, frame=True)
3429
+
3430
+ if failing_rows is not None and len(failing_rows) > 0:
3431
+ console.print(
3432
+ f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3433
+ )
3434
+
3435
+ # Limit the number of rows shown
3436
+ if len(failing_rows) > limit:
3437
+ display_rows = failing_rows.head(limit)
3438
+ console.print(
3439
+ f"[dim]Showing first {limit} of {len(failing_rows)} failing rows[/dim]"
3440
+ )
3441
+ else:
3442
+ display_rows = failing_rows
3443
+ console.print(
3444
+ f"[dim]Showing all {len(failing_rows)} failing rows[/dim]"
3445
+ )
3446
+
3447
+ # Create a preview table using pointblank's preview function
3448
+ preview_table = pb.preview(
3449
+ data=display_rows,
3450
+ n_head=min(limit, len(display_rows)),
3451
+ n_tail=0,
3452
+ limit=limit,
3453
+ show_row_numbers=True,
3454
+ )
3455
+
3456
+ # Display using our Rich table function
3457
+ _rich_print_gt_table(preview_table, show_summary=False)
3458
+ else:
3459
+ console.print(
3460
+ f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3461
+ )
3462
+ console.print("[yellow]No failing rows could be extracted[/yellow]")
3463
+ except Exception as e:
3464
+ console.print(
3465
+ f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3466
+ )
3467
+ console.print(f"[yellow]Could not extract failing rows: {e}[/yellow]")
3468
+
3469
+ if write_extract:
3470
+ try:
3471
+ folder_name = write_extract
3472
+
3473
+ # Add validation number if multiple validations
3474
+ if len(validations) > 1:
3475
+ folder_name = f"{folder_name}_validation_{i}"
3476
+
3477
+ # Create the output folder
3478
+ output_folder = Path(folder_name)
3479
+ output_folder.mkdir(parents=True, exist_ok=True)
3480
+
3481
+ saved_files = []
3482
+
3483
+ # Save each failing step to its own CSV file
3484
+ for step_num, step_info in failed_steps:
3485
+ try:
3486
+ failing_rows = validation.get_data_extracts(i=step_num, frame=True)
3487
+ if failing_rows is not None and len(failing_rows) > 0:
3488
+ # Create safe filename from assertion type
3489
+ safe_assertion_type = (
3490
+ step_info.assertion_type.replace(" ", "_")
3491
+ .replace("/", "_")
3492
+ .replace("\\", "_")
3493
+ .replace(":", "_")
3494
+ .replace("<", "_")
3495
+ .replace(">", "_")
3496
+ .replace("|", "_")
3497
+ .replace("?", "_")
3498
+ .replace("*", "_")
3499
+ .replace('"', "_")
3500
+ )
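+ # (path separators and Windows-reserved characters are replaced so the
+ # generated filename stays portable across filesystems)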
3501
+
3502
+ filename = f"step_{step_num:02d}_{safe_assertion_type}.csv"
3503
+ filepath = output_folder / filename
3504
+
3505
+ # Limit the output if needed
3506
+ save_rows = failing_rows
3507
+ if hasattr(failing_rows, "head") and len(failing_rows) > limit:
3508
+ save_rows = failing_rows.head(limit)
3509
+
3510
+ # Save to CSV
3511
+ if hasattr(save_rows, "write_csv"):
3512
+ # Polars
3513
+ save_rows.write_csv(str(filepath))
3514
+ elif hasattr(save_rows, "to_csv"):
3515
+ # Pandas
3516
+ save_rows.to_csv(str(filepath), index=False)
3517
+ else:
3518
+ # Try converting to pandas as fallback
3519
+ import pandas as pd
3520
+
3521
+ pd_data = pd.DataFrame(save_rows)
3522
+ pd_data.to_csv(str(filepath), index=False)
3523
+
3524
+ saved_files.append((filename, len(failing_rows)))
3525
+
3526
+ except Exception as e:
3527
+ console.print(
3528
+ f"[yellow]Warning: Could not save failing rows from step {step_num}: {e}[/yellow]"
3529
+ )
3530
+
3531
+ if saved_files:
3532
+ console.print(
3533
+ f"[green]✓[/green] Failing rows saved to folder: {output_folder}"
3534
+ )
3535
+ for filename, row_count in saved_files:
3536
+ console.print(f"[dim] - {filename}: {row_count} rows[/dim]")
3537
+ else:
3538
+ console.print(
3539
+ "[yellow]No failing rows could be extracted to save[/yellow]"
3540
+ )
3541
+
3542
+ except Exception as e:
3543
+ console.print(
3544
+ f"[yellow]Warning: Could not save failing rows to CSV: {e}[/yellow]"
3545
+ )
3546
+
3547
+ # Save HTML and JSON outputs (combine multiple validations if needed)
3548
+ if output_html:
3549
+ try:
3550
+ if len(validations) == 1:
3551
+ # Single validation - save directly
3552
+ html_content = validations[0]._repr_html_()
3553
+ Path(output_html).write_text(html_content, encoding="utf-8")
3554
+ else:
3555
+ # Multiple validations - combine them
3556
+ html_parts = []
3557
+ html_parts.append("<html><body>")
3558
+ html_parts.append("<h1>Pointblank Validation Report</h1>")
3559
+
3560
+ for i, validation in enumerate(validations, 1):
3561
+ html_parts.append(f"<h2>Validation {i}</h2>")
3562
+ html_parts.append(validation._repr_html_())
3563
+
3564
+ html_parts.append("</body></html>")
3565
+ html_content = "\n".join(html_parts)
3566
+ Path(output_html).write_text(html_content, encoding="utf-8")
3567
+
3568
+ console.print(f"[green]✓[/green] HTML report saved to: {output_html}")
3569
+ except Exception as e:
3570
+ console.print(f"[yellow]Warning: Could not save HTML report: {e}[/yellow]")
3571
+
3572
+ if output_json:
3573
+ try:
3574
+ if len(validations) == 1:
3575
+ # Single validation - save directly
3576
+ json_report = validations[0].get_json_report()
3577
+ Path(output_json).write_text(json_report, encoding="utf-8")
3578
+ else:
3579
+ # Multiple validations - combine them
3580
+ import json
3581
+
3582
+ combined_report = {"validations": []}
3583
+
3584
+ for i, validation in enumerate(validations, 1):
3585
+ validation_json = json.loads(validation.get_json_report())
3586
+ validation_json["validation_id"] = i
3587
+ combined_report["validations"].append(validation_json)
3588
+
3589
+ Path(output_json).write_text(
3590
+ json.dumps(combined_report, indent=2), encoding="utf-8"
3591
+ )
3592
+
3593
+ console.print(f"[green]✓[/green] JSON summary saved to: {output_json}")
3594
+ except Exception as e:
3595
+ console.print(f"[yellow]Warning: Could not save JSON report: {e}[/yellow]")
3596
+
3597
+ # Check if we should fail based on threshold
3598
+ if fail_on:
3599
+ should_exit = False
3600
+ exit_reason = ""
3601
+
3602
+ if fail_on.lower() == "critical" and overall_critical:
3603
+ should_exit = True
3604
+ exit_reason = "critical validation failures"
3605
+ elif fail_on.lower() == "error" and (overall_critical or overall_error):
3606
+ should_exit = True
3607
+ exit_reason = "error or critical validation failures"
3608
+ elif fail_on.lower() == "warning" and (
3609
+ overall_critical or overall_error or overall_warning
3610
+ ):
3611
+ should_exit = True
3612
+ exit_reason = "warning, error, or critical validation failures"
3613
+ elif fail_on.lower() == "any" and overall_failed:
3614
+ should_exit = True
3615
+ exit_reason = "validation failures"
3616
+
3617
+ if should_exit:
3618
+ console.print(f"[red]Exiting with error due to {exit_reason}[/red]")
3619
+ sys.exit(1)
3620
+
3621
+ except Exception as e:
3622
+ console.print(f"[red]Error:[/red] {e}")
3623
+ sys.exit(1)
3624
+
3625
+
3626
+ def _format_missing_percentage(value: float) -> str:
3627
+ """Format missing value percentages for display.
3628
+
3629
+ Args:
3630
+ value: The percentage value (0-100)
3631
+
3632
+ Returns:
3633
+ Formatted string with proper percentage display
3634
+ """
3635
+ if value == 0.0:
3636
+ return "[green]●[/green]" # Large green circle for no missing values
3637
+ elif value == 100.0:
3638
+ return "[red]●[/red]" # Large red circle for completely missing values
3639
+ elif 0 < value < 1.0:
3640
+ return "<1%" # Less than 1%
3641
+ elif 99.0 < value < 100.0:
3642
+ return ">99%" # More than 99%
3643
+ else:
3644
+ return f"{int(round(value))}%" # Round to nearest integer with % sign