pointblank 0.11.1__py3-none-any.whl → 0.11.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/cli.py CHANGED
@@ -1,5 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import copy
4
+ import os
5
+ import shutil
3
6
  import sys
4
7
  from pathlib import Path
5
8
  from typing import Any
@@ -31,6 +34,8 @@ class OrderedGroup(click.Group):
31
34
  "validate",
32
35
  "run",
33
36
  "make-template",
37
+ # Data Manipulation
38
+ "pl",
34
39
  # Utilities
35
40
  "datasets",
36
41
  "requirements",
@@ -90,6 +95,15 @@ def _load_data_source(data_source: str) -> Any:
90
95
  return _process_data(data_source)
91
96
 
92
97
 
98
+ def _is_piped_data_source(data_source: str) -> bool:
99
+ """Check if the data source is from a piped pb command."""
100
+ return (
101
+ data_source
102
+ and ("pb_pipe_" in data_source)
103
+ and (data_source.startswith("/var/folders/") or data_source.startswith("/tmp/"))
104
+ )
105
+
106
+
93
107
  def _format_cell_value(
94
108
  value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
95
109
  ) -> str:
@@ -274,7 +288,7 @@ def _format_dtype_compact(dtype_str: str) -> str:
274
288
  elif "str" in dtype_str:
275
289
  return "str"
276
290
 
277
- # Unknown or complex types - truncate if too long
291
+ # Unknown or complex types: truncate if too long
278
292
  elif len(dtype_str) > 8:
279
293
  return dtype_str[:8] + "…"
280
294
  else:
@@ -395,7 +409,7 @@ def _rich_print_scan_table(
395
409
  # Clean up HTML formatting from the raw data
396
410
  str_val = str(value)
397
411
 
398
- # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
412
+ # Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
399
413
  if "<br>" in str_val:
400
414
  str_val = str_val.split("<br>")[0].strip()
401
415
  # For unique values, we want just the integer part
@@ -414,14 +428,14 @@ def _rich_print_scan_table(
414
428
  # Clean up extra whitespace
415
429
  str_val = re.sub(r"\s+", " ", str_val).strip()
416
430
 
417
- # Handle values like "2<.01" - extract the first number
431
+ # Handle values like "2<.01": extract the first number
418
432
  if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
419
433
  # Extract number before the < symbol
420
434
  before_lt = str_val.split("<")[0].strip()
421
435
  if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
422
436
  str_val = before_lt
423
437
 
424
- # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
438
+ # Handle boolean unique values like "T0.62F0.38": extract the more readable format
425
439
  if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
426
440
  # Extract T and F values
427
441
  t_match = re.search(r"T(\d+\.\d+)", str_val)
@@ -451,7 +465,7 @@ def _rich_print_scan_table(
451
465
  # Simple integers under 10000
452
466
  return str(int(num_val))
453
467
  elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
454
- # Likely dates in YYYYMMDD format - format as date-like
468
+ # Likely dates in YYYYMMDD format: format as date-like
455
469
  int_val = int(num_val)
456
470
  if 19000101 <= int_val <= 29991231: # Reasonable date range
457
471
  str_date = str(int_val)
@@ -463,29 +477,29 @@ def _rich_print_scan_table(
463
477
  # Otherwise treat as large number
464
478
  return f"{num_val / 1000000:.1f}M"
465
479
  elif abs(num_val) >= 1000000:
466
- # Large numbers - use scientific notation or M/k notation
480
+ # Large numbers: use scientific notation or M/k notation
467
481
 
468
482
  if abs(num_val) >= 1000000000:
469
483
  return f"{num_val:.1e}"
470
484
  else:
471
485
  return f"{num_val / 1000000:.1f}M"
472
486
  elif abs(num_val) >= 10000:
473
- # Numbers >= 10k - use compact notation
487
+ # Numbers >= 10k: use compact notation
474
488
  return f"{num_val / 1000:.1f}k"
475
489
  elif abs(num_val) >= 100:
476
- # Numbers 100-9999 - show with minimal decimals
490
+ # Numbers 100-9999: show with minimal decimals
477
491
  return f"{num_val:.1f}"
478
492
  elif abs(num_val) >= 10:
479
- # Numbers 10-99 - show with one decimal
493
+ # Numbers 10-99: show with one decimal
480
494
  return f"{num_val:.1f}"
481
495
  elif abs(num_val) >= 1:
482
- # Numbers 1-9 - show with two decimals
496
+ # Numbers 1-9: show with two decimals
483
497
  return f"{num_val:.2f}"
484
498
  elif abs(num_val) >= 0.01:
485
- # Small numbers - show with appropriate precision
499
+ # Small numbers: show with appropriate precision
486
500
  return f"{num_val:.2f}"
487
501
  else:
488
- # Very small numbers - use scientific notation
502
+ # Very small numbers: use scientific notation
489
503
 
490
504
  return f"{num_val:.1e}"
491
505
 
@@ -493,7 +507,7 @@ def _rich_print_scan_table(
493
507
  # Not a number, handle as string
494
508
  pass
495
509
 
496
- # Handle date/datetime strings - show abbreviated format
510
+ # Handle date/datetime strings: show abbreviated format
497
511
  if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
498
512
  # Likely a date/datetime, show abbreviated
499
513
  if len(str_val) > max_width:
@@ -557,9 +571,12 @@ def _rich_print_gt_table(
557
571
  gt_table: The GT table object to display
558
572
  preview_info: Optional dict with preview context info:
559
573
  - total_rows: Total rows in the dataset
574
+ - total_columns: Total columns in the dataset
560
575
  - head_rows: Number of head rows shown
561
576
  - tail_rows: Number of tail rows shown
562
577
  - is_complete: Whether the entire dataset is shown
578
+ - source_type: Type of data source (e.g., "External source: worldcities_new.csv")
579
+ - table_type: Type of table (e.g., "polars")
563
580
  show_summary: Whether to show the row count summary at the bottom
564
581
  """
565
582
  try:
@@ -592,6 +609,12 @@ def _rich_print_gt_table(
592
609
  table_type = preview_info["table_type"]
593
610
  table_title = f"Data Preview / {source_type} / {table_type}"
594
611
 
612
+ # Add dimensions subtitle in gray if available
613
+ total_rows = preview_info.get("total_rows")
614
+ total_columns = preview_info.get("total_columns")
615
+ if total_rows is not None and total_columns is not None:
616
+ table_title += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
617
+
595
618
  rich_table = Table(
596
619
  title=table_title,
597
620
  show_header=True,
@@ -933,14 +956,19 @@ def _rich_print_gt_table(
933
956
 
934
957
 
935
958
  def _display_validation_summary(validation: Any) -> None:
936
- """Display a validation summary in a Rich table format."""
959
+ """Display a validation summary in a compact Rich table format."""
937
960
  try:
938
961
  # Try to get the summary from the validation report
939
962
  if hasattr(validation, "validation_info") and validation.validation_info is not None:
940
963
  # Use the validation_info to create a summary
941
964
  info = validation.validation_info
942
965
  n_steps = len(info)
943
- n_passed = sum(1 for step in info if step.all_passed)
966
+
967
+ # Count steps based on their threshold status
968
+ n_passed = sum(
969
+ 1 for step in info if not step.warning and not step.error and not step.critical
970
+ )
971
+ n_all_passed = sum(1 for step in info if step.all_passed)
944
972
  n_failed = n_steps - n_passed
945
973
 
946
974
  # Calculate severity counts
@@ -950,64 +978,213 @@ def _display_validation_summary(validation: Any) -> None:
950
978
 
951
979
  all_passed = n_failed == 0
952
980
 
953
- # Determine highest severity
981
+ # Determine highest severity and its color
954
982
  if n_critical > 0:
955
983
  highest_severity = "critical"
984
+ severity_color = "red"
956
985
  elif n_error > 0:
957
986
  highest_severity = "error"
987
+ severity_color = "yellow"
958
988
  elif n_warning > 0:
959
989
  highest_severity = "warning"
960
- elif n_failed > 0:
961
- highest_severity = "some failing"
962
- else:
990
+ severity_color = "bright_black" # gray
991
+ elif n_all_passed == n_steps:
992
+ # All steps passed AND all steps had 100% pass rate
963
993
  highest_severity = "all passed"
994
+ severity_color = "bold green"
995
+ else:
996
+ # Steps passed (no threshold exceedances) but some had failing test units
997
+ highest_severity = "passed"
998
+ severity_color = "green"
999
+
1000
+ # Create compact summary header
1001
+ # Format: Steps: 6 / P: 3 (3 AP) / W: 3 / E: 0 / C: 0 / warning
1002
+ summary_header = (
1003
+ f"Steps: {n_steps} / P: {n_passed} ({n_all_passed} AP) / "
1004
+ f"W: {n_warning} / E: {n_error} / C: {n_critical} / "
1005
+ f"[{severity_color}]{highest_severity}[/{severity_color}]"
1006
+ )
964
1007
 
965
- # Create a summary table
966
- table = Table(title="Validation Summary", show_header=True, header_style="bold magenta")
967
- table.add_column("Metric", style="cyan", no_wrap=True)
968
- table.add_column("Value", style="green")
969
-
970
- # Add summary statistics
971
- table.add_row("Total Steps", str(n_steps))
972
- table.add_row("Passing Steps", str(n_passed))
973
- table.add_row("Failing Steps", str(n_failed))
974
- table.add_row("Warning Steps", str(n_warning))
975
- table.add_row("Error Steps", str(n_error))
976
- table.add_row("Critical Steps", str(n_critical))
977
- table.add_row("All Passed", str(all_passed))
978
- table.add_row("Highest Severity", highest_severity)
979
-
980
- console.print(table)
1008
+ # Print the report title and summary
1009
+ console.print()
1010
+ console.print("[blue]Validation Report[/blue]")
1011
+ console.print(f"[white]{summary_header}[/white]")
981
1012
 
982
1013
  # Display step details
983
1014
  if n_steps > 0:
1015
+ from rich.box import SIMPLE_HEAD
1016
+
984
1017
  steps_table = Table(
985
- title="Validation Steps", show_header=True, header_style="bold cyan"
1018
+ show_header=True,
1019
+ header_style="bold cyan",
1020
+ box=SIMPLE_HEAD,
986
1021
  )
987
- steps_table.add_column("Step", style="dim")
988
- steps_table.add_column("Type", style="white")
1022
+ steps_table.add_column("", style="dim")
1023
+ steps_table.add_column("Step", style="white")
989
1024
  steps_table.add_column("Column", style="cyan")
990
- steps_table.add_column("Status", style="white")
991
- steps_table.add_column("Passed/Total", style="green")
1025
+ steps_table.add_column("Values", style="yellow")
1026
+ steps_table.add_column("Units", style="blue")
1027
+ steps_table.add_column("Pass", style="green")
1028
+ steps_table.add_column("Fail", style="red")
1029
+ steps_table.add_column("W", style="bright_black")
1030
+ steps_table.add_column("E", style="yellow")
1031
+ steps_table.add_column("C", style="red")
1032
+ steps_table.add_column("Ext", style="blue", justify="center")
1033
+
1034
def format_units(n: int | None) -> str:
    """Format a count with K, M, B abbreviations for values of 10,000 and above.

    Args:
        n: The number to format, or None when no count is available.

    Returns:
        "—" for None, the plain number below 10,000, otherwise an abbreviated
        string such as "10K", "1.5M" or "2.5B".
    """
    if n is None:
        return "—"
    if n < 10000:
        return str(n)

    # Try K (no decimals) then M (one decimal). If the rounded text reaches
    # 1000 of a unit, promote to the next unit so boundary values render as
    # "1.0M" / "1.0B" instead of "1000K" / "1000.0M".
    for divisor, suffix, decimals in ((1000, "K", 0), (1000000, "M", 1)):
        text = f"{n / divisor:.{decimals}f}"
        if float(text) < 1000:
            return f"{text}{suffix}"

    return f"{n / 1000000000:.1f}B"
1046
+
1047
def format_pass_fail(passed: int, total: int) -> str:
    """Render a unit count as "<abbreviated count>/<fraction of total>".

    Returns the placeholder "—/—" when either argument is None or when
    total is zero (no fraction can be computed).
    """
    if passed is None or total is None or total == 0:
        return "—/—"

    share = passed / total

    # Exact endpoints get fixed labels; everything else falls through to the
    # threshold checks below.
    share_text = {0.0: "0.00", 1.0: "1.00"}.get(share)
    if share_text is None:
        if share < 0.005:
            # Would otherwise round down to 0.00
            share_text = "<0.01"
        elif share > 0.995:
            # Would otherwise round up to 1.00
            share_text = ">0.99"
        else:
            share_text = f"{share:.2f}"

    # The absolute count is abbreviated by the sibling helper
    return f"{format_units(passed)}/{share_text}"
992
1071
 
993
1072
  for step in info:
994
- status_icon = "✓" if step.all_passed else "✗"
995
- status_color = "green" if step.all_passed else "red"
1073
+ # Extract values information for the Values column
1074
+ values_str = "" # Default to em dash if no values
1075
+
1076
+ # Handle different validation types
1077
+ if step.assertion_type == "col_schema_match":
1078
+ values_str = "—" # Schema is too complex to display inline
1079
+ elif step.assertion_type == "col_vals_between":
1080
+ # For between validations, try to get left and right bounds
1081
+ if (
1082
+ hasattr(step, "left")
1083
+ and hasattr(step, "right")
1084
+ and step.left is not None
1085
+ and step.right is not None
1086
+ ):
1087
+ values_str = f"[{step.left}, {step.right}]"
1088
+ elif hasattr(step, "values") and step.values is not None:
1089
+ if isinstance(step.values, (list, tuple)) and len(step.values) >= 2:
1090
+ values_str = f"[{step.values[0]}, {step.values[1]}]"
1091
+ else:
1092
+ values_str = str(step.values)
1093
+ elif step.assertion_type in ["row_count_match", "col_count_match"]:
1094
+ # For count match validations, extract the 'count' value from the dictionary
1095
+ if hasattr(step, "values") and step.values is not None:
1096
+ if isinstance(step.values, dict) and "count" in step.values:
1097
+ values_str = str(step.values["count"])
1098
+ else:
1099
+ values_str = str(step.values)
1100
+ else:
1101
+ values_str = "—"
1102
+ elif step.assertion_type in ["col_vals_expr", "conjointly"]:
1103
+ values_str = "COLUMN EXPR"
1104
+ elif step.assertion_type == "specially":
1105
+ values_str = "EXPR"
1106
+ elif hasattr(step, "values") and step.values is not None:
1107
+ if isinstance(step.values, (list, tuple)):
1108
+ if len(step.values) <= 3:
1109
+ values_str = ", ".join(str(v) for v in step.values)
1110
+ else:
1111
+ values_str = f"{', '.join(str(v) for v in step.values[:3])}..."
1112
+ else:
1113
+ values_str = str(step.values)
1114
+ elif hasattr(step, "value") and step.value is not None:
1115
+ values_str = str(step.value)
1116
+ elif hasattr(step, "set") and step.set is not None:
1117
+ if isinstance(step.set, (list, tuple)):
1118
+ if len(step.set) <= 3:
1119
+ values_str = ", ".join(str(v) for v in step.set)
1120
+ else:
1121
+ values_str = f"{', '.join(str(v) for v in step.set[:3])}..."
1122
+ else:
1123
+ values_str = str(step.set)
1124
+
1125
+ # Determine threshold status for W, E, C columns
1126
+ # Check if thresholds are set and whether they were exceeded
1127
+
1128
+ # Warning threshold
1129
+ if (
1130
+ hasattr(step, "thresholds")
1131
+ and step.thresholds
1132
+ and hasattr(step.thresholds, "warning")
1133
+ and step.thresholds.warning is not None
1134
+ ):
1135
+ w_status = (
1136
+ "[bright_black]●[/bright_black]"
1137
+ if step.warning
1138
+ else "[bright_black]○[/bright_black]"
1139
+ )
1140
+ else:
1141
+ w_status = "—"
1142
+
1143
+ # Error threshold
1144
+ if (
1145
+ hasattr(step, "thresholds")
1146
+ and step.thresholds
1147
+ and hasattr(step.thresholds, "error")
1148
+ and step.thresholds.error is not None
1149
+ ):
1150
+ e_status = "[yellow]●[/yellow]" if step.error else "[yellow]○[/yellow]"
1151
+ else:
1152
+ e_status = "—"
1153
+
1154
+ # Critical threshold
1155
+ if (
1156
+ hasattr(step, "thresholds")
1157
+ and step.thresholds
1158
+ and hasattr(step.thresholds, "critical")
1159
+ and step.thresholds.critical is not None
1160
+ ):
1161
+ c_status = "[red]●[/red]" if step.critical else "[red]○[/red]"
1162
+ else:
1163
+ c_status = "—"
996
1164
 
997
- severity = ""
998
- if step.critical:
999
- severity = " [red](CRITICAL)[/red]"
1000
- elif step.error:
1001
- severity = " [red](ERROR)[/red]"
1002
- elif step.warning:
1003
- severity = " [yellow](WARNING)[/yellow]"
1165
+ # Extract status, here we check if the step has any extract data
1166
+ if (
1167
+ hasattr(step, "extract")
1168
+ and step.extract is not None
1169
+ and hasattr(step.extract, "__len__")
1170
+ and len(step.extract) > 0
1171
+ ):
1172
+ ext_status = "[blue]✓[/blue]"
1173
+ else:
1174
+ ext_status = "[bright_black]—[/bright_black]"
1004
1175
 
1005
1176
  steps_table.add_row(
1006
1177
  str(step.i),
1007
1178
  step.assertion_type,
1008
1179
  str(step.column) if step.column else "—",
1009
- f"[{status_color}]{status_icon}[/{status_color}]{severity}",
1010
- f"{step.n_passed}/{step.n}",
1180
+ values_str,
1181
+ format_units(step.n),
1182
+ format_pass_fail(step.n_passed, step.n),
1183
+ format_pass_fail(step.n - step.n_passed, step.n),
1184
+ w_status,
1185
+ e_status,
1186
+ c_status,
1187
+ ext_status,
1011
1188
  )
1012
1189
 
1013
1190
  console.print(steps_table)
@@ -1015,18 +1192,32 @@ def _display_validation_summary(validation: Any) -> None:
1015
1192
  # Display status with appropriate color
1016
1193
  if highest_severity == "all passed":
1017
1194
  console.print(
1018
- Panel("[green]✓ All validations passed![/green]", border_style="green")
1195
+ Panel(
1196
+ "[green]✓ All validations passed![/green]",
1197
+ border_style="green",
1198
+ expand=False,
1199
+ )
1019
1200
  )
1020
- elif highest_severity == "some failing":
1201
+ elif highest_severity == "passed":
1021
1202
  console.print(
1022
- Panel("[yellow]⚠ Some validations failed[/yellow]", border_style="yellow")
1203
+ Panel(
1204
+ "[dim green]⚠ Some steps had failing test units[/dim green]",
1205
+ border_style="dim green",
1206
+ expand=False,
1207
+ )
1023
1208
  )
1024
1209
  elif highest_severity in ["warning", "error", "critical"]:
1025
- color = "yellow" if highest_severity == "warning" else "red"
1210
+ if highest_severity == "warning":
1211
+ color = "bright_black" # gray
1212
+ elif highest_severity == "error":
1213
+ color = "yellow"
1214
+ else: # critical
1215
+ color = "red"
1026
1216
  console.print(
1027
1217
  Panel(
1028
1218
  f"[{color}]✗ Validation failed with {highest_severity} severity[/{color}]",
1029
1219
  border_style=color,
1220
+ expand=False,
1030
1221
  )
1031
1222
  )
1032
1223
  else:
@@ -1040,20 +1231,31 @@ def _display_validation_summary(validation: Any) -> None:
1040
1231
 
1041
1232
 
1042
1233
  @click.group(cls=OrderedGroup)
1043
- @click.version_option(version=pb.__version__, prog_name="pb")
1234
+ @click.version_option(pb.__version__, "-v", "--version", prog_name="pb")
1235
+ @click.help_option("-h", "--help")
1044
1236
  def cli():
1045
1237
  """
1046
- Pointblank CLI - Data validation and quality tools for data engineers.
1238
+ Pointblank CLI: Data validation and quality tools for data engineers.
1239
+
1240
+ Use this CLI to validate data quality, explore datasets, and generate comprehensive
1241
+ reports for CSV, Parquet, and database sources. Suitable for data pipelines, ETL
1242
+ validation, and exploratory data analysis from the command line.
1243
+
1244
+ Quick Examples:
1047
1245
 
1048
- Use this CLI to run validation scripts, preview tables, and generate reports
1049
- directly from the command line.
1246
+ \b
1247
+ pb preview data.csv Preview your data
1248
+ pb scan data.csv Generate data profile
1249
+ pb validate data.csv Run basic validation
1250
+
1251
+ Use pb COMMAND --help for detailed help on any command.
1050
1252
  """
1051
1253
  pass
1052
1254
 
1053
1255
 
1054
1256
  @cli.command()
1055
- @click.argument("data_source", type=str)
1056
- def info(data_source: str):
1257
+ @click.argument("data_source", type=str, required=False)
1258
+ def info(data_source: str | None):
1057
1259
  """
1058
1260
  Display information about a data source.
1059
1261
 
@@ -1069,6 +1271,11 @@ def info(data_source: str):
1069
1271
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1070
1272
  """
1071
1273
  try:
1274
+ # Handle missing data_source with concise help
1275
+ if data_source is None:
1276
+ _show_concise_help("info", None)
1277
+ return
1278
+
1072
1279
  with console.status("[bold green]Loading data..."):
1073
1280
  # Load the data source using the centralized function
1074
1281
  data = _load_data_source(data_source)
@@ -1107,21 +1314,21 @@ def info(data_source: str):
1107
1314
 
1108
1315
 
1109
1316
  @cli.command()
1110
- @click.argument("data_source", type=str)
1111
- @click.option("--columns", "-c", help="Comma-separated list of columns to display")
1317
+ @click.argument("data_source", type=str, required=False)
1318
+ @click.option("--columns", help="Comma-separated list of columns to display")
1112
1319
  @click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
1113
1320
  @click.option("--col-first", type=int, help="Show first N columns")
1114
1321
  @click.option("--col-last", type=int, help="Show last N columns")
1115
- @click.option("--head", "-h", default=5, help="Number of rows from the top (default: 5)")
1116
- @click.option("--tail", "-t", default=5, help="Number of rows from the bottom (default: 5)")
1117
- @click.option("--limit", "-l", default=50, help="Maximum total rows to display (default: 50)")
1322
+ @click.option("--head", default=5, help="Number of rows from the top (default: 5)")
1323
+ @click.option("--tail", default=5, help="Number of rows from the bottom (default: 5)")
1324
+ @click.option("--limit", default=50, help="Maximum total rows to display (default: 50)")
1118
1325
  @click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
1119
1326
  @click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
1120
1327
  @click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
1121
1328
  @click.option("--no-header", is_flag=True, help="Hide table header")
1122
1329
  @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
1123
1330
  def preview(
1124
- data_source: str,
1331
+ data_source: str | None,
1125
1332
  columns: str | None,
1126
1333
  col_range: str | None,
1127
1334
  col_first: int | None,
@@ -1146,6 +1353,7 @@ def preview(
1146
1353
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1147
1354
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1148
1355
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1356
+ - Piped data from pb pl command
1149
1357
 
1150
1358
  COLUMN SELECTION OPTIONS:
1151
1359
 
@@ -1160,11 +1368,52 @@ def preview(
1160
1368
  Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
1161
1369
  """
1162
1370
  try:
1371
+ import sys
1372
+
1373
+ # Handle piped input
1374
+ if data_source is None:
1375
+ if not sys.stdin.isatty():
1376
+ # Data is being piped in - read the file path from stdin
1377
+ piped_input = sys.stdin.read().strip()
1378
+ if piped_input:
1379
+ data_source = piped_input
1380
+
1381
+ # Determine the format from the file extension
1382
+ if piped_input.endswith(".parquet"):
1383
+ format_type = "Parquet"
1384
+ elif piped_input.endswith(".csv"):
1385
+ format_type = "CSV"
1386
+ else:
1387
+ format_type = "unknown"
1388
+
1389
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1390
+ else:
1391
+ console.print("[red]Error:[/red] No data provided via pipe")
1392
+ sys.exit(1)
1393
+ else:
1394
+ # Show concise help and exit
1395
+ _show_concise_help("preview", None)
1396
+ return
1397
+
1163
1398
  with console.status("[bold green]Loading data..."):
1164
1399
  # Load the data source using the centralized function
1165
1400
  data = _load_data_source(data_source)
1166
1401
 
1167
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1402
+ # Check if this is a piped data source and create friendly display name
1403
+ is_piped_data = _is_piped_data_source(data_source)
1404
+
1405
+ if is_piped_data:
1406
+ if data_source.endswith(".parquet"):
1407
+ display_source = "Parquet file via `pb pl`"
1408
+ elif data_source.endswith(".csv"):
1409
+ display_source = "CSV file via `pb pl`"
1410
+ else:
1411
+ display_source = "File via `pb pl`"
1412
+ console.print(
1413
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1414
+ )
1415
+ else:
1416
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1168
1417
 
1169
1418
  # Parse columns if provided
1170
1419
  columns_list = None
@@ -1186,7 +1435,7 @@ def preview(
1186
1435
  # If _row_num_ exists in data but not in user selection, add it at beginning
1187
1436
  if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
1188
1437
  columns_list = ["_row_num_"] + columns_list
1189
- except Exception: # pragma: no cover
1438
+ except Exception:
1190
1439
  # If we can't process the data, just use the user's column list as-is
1191
1440
  pass
1192
1441
  elif col_range or col_first or col_last:
@@ -1261,7 +1510,14 @@ def preview(
1261
1510
  total_dataset_columns = pb.get_column_count(processed_data)
1262
1511
 
1263
1512
  # Determine source type and table type for enhanced preview title
1264
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1513
+ if is_piped_data:
1514
+ if data_source.endswith(".parquet"):
1515
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1516
+ elif data_source.endswith(".csv"):
1517
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1518
+ else:
1519
+ source_type = "Polars expression from `pb pl`"
1520
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1265
1521
  source_type = f"Pointblank dataset: {data_source}"
1266
1522
  else:
1267
1523
  source_type = f"External source: {data_source}"
@@ -1311,17 +1567,17 @@ def preview(
1311
1567
 
1312
1568
  _rich_print_gt_table(gt_table, preview_info)
1313
1569
 
1314
- except Exception as e: # pragma: no cover
1570
+ except Exception as e:
1315
1571
  console.print(f"[red]Error:[/red] {e}")
1316
- sys.exit(1) # pragma: no cover
1572
+ sys.exit(1)
1317
1573
 
1318
1574
 
1319
1575
  @cli.command()
1320
- @click.argument("data_source", type=str)
1576
+ @click.argument("data_source", type=str, required=False)
1321
1577
  @click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
1322
1578
  @click.option("--columns", "-c", help="Comma-separated list of columns to scan")
1323
1579
  def scan(
1324
- data_source: str,
1580
+ data_source: str | None,
1325
1581
  output_html: str | None,
1326
1582
  columns: str | None,
1327
1583
  ):
@@ -1344,17 +1600,58 @@ def scan(
1344
1600
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1345
1601
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1346
1602
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1603
+ - Piped data from pb pl command
1347
1604
  """
1348
1605
  try:
1606
+ import sys
1349
1607
  import time
1350
1608
 
1351
1609
  start_time = time.time()
1352
1610
 
1611
+ # Handle piped input
1612
+ if data_source is None:
1613
+ if not sys.stdin.isatty():
1614
+ # Data is being piped in - read the file path from stdin
1615
+ piped_input = sys.stdin.read().strip()
1616
+ if piped_input:
1617
+ data_source = piped_input
1618
+
1619
+ # Determine the format from the file extension
1620
+ if piped_input.endswith(".parquet"):
1621
+ format_type = "Parquet"
1622
+ elif piped_input.endswith(".csv"):
1623
+ format_type = "CSV"
1624
+ else:
1625
+ format_type = "unknown"
1626
+
1627
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1628
+ else:
1629
+ console.print("[red]Error:[/red] No data provided via pipe")
1630
+ sys.exit(1)
1631
+ else:
1632
+ # Show concise help and exit
1633
+ _show_concise_help("scan", None)
1634
+ return
1635
+
1353
1636
  with console.status("[bold green]Loading data..."):
1354
1637
  # Load the data source using the centralized function
1355
1638
  data = _load_data_source(data_source)
1356
1639
 
1357
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1640
+ # Check if this is a piped data source and create friendly display name
1641
+ is_piped_data = _is_piped_data_source(data_source)
1642
+
1643
+ if is_piped_data:
1644
+ if data_source.endswith(".parquet"):
1645
+ display_source = "Parquet file via `pb pl`"
1646
+ elif data_source.endswith(".csv"):
1647
+ display_source = "CSV file via `pb pl`"
1648
+ else:
1649
+ display_source = "File via `pb pl`"
1650
+ console.print(
1651
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1652
+ )
1653
+ else:
1654
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1358
1655
 
1359
1656
  # Parse columns if provided
1360
1657
  columns_list = None
@@ -1367,7 +1664,15 @@ def scan(
1367
1664
  # Data is already processed by _load_data_source
1368
1665
  scan_result = pb.col_summary_tbl(data=data)
1369
1666
 
1370
- if data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1667
+ # Create friendly source type for display
1668
+ if is_piped_data:
1669
+ if data_source.endswith(".parquet"):
1670
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1671
+ elif data_source.endswith(".csv"):
1672
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1673
+ else:
1674
+ source_type = "Polars expression from `pb pl`"
1675
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1371
1676
  source_type = f"Pointblank dataset: {data_source}"
1372
1677
  else:
1373
1678
  source_type = f"External source: {data_source}"
@@ -1399,7 +1704,12 @@ def scan(
1399
1704
  # Display detailed column summary using rich formatting
1400
1705
  try:
1401
1706
  _rich_print_scan_table(
1402
- scan_result, data_source, source_type, table_type, total_rows, total_columns
1707
+ scan_result,
1708
+ display_source if is_piped_data else data_source,
1709
+ source_type,
1710
+ table_type,
1711
+ total_rows,
1712
+ total_columns,
1403
1713
  )
1404
1714
 
1405
1715
  except Exception as e:
@@ -1411,9 +1721,9 @@ def scan(
1411
1721
 
1412
1722
 
1413
1723
  @cli.command()
1414
- @click.argument("data_source", type=str)
1724
+ @click.argument("data_source", type=str, required=False)
1415
1725
  @click.option("--output-html", type=click.Path(), help="Save HTML output to file")
1416
- def missing(data_source: str, output_html: str | None):
1726
+ def missing(data_source: str | None, output_html: str | None):
1417
1727
  """
1418
1728
  Generate a missing values report for a data table.
1419
1729
 
@@ -1425,13 +1735,55 @@ def missing(data_source: str, output_html: str | None):
1425
1735
  - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
1426
1736
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1427
1737
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1738
+ - Piped data from pb pl command
1428
1739
  """
1429
1740
  try:
1741
+ import sys
1742
+
1743
+ # Handle piped input
1744
+ if data_source is None:
1745
+ if not sys.stdin.isatty():
1746
+ # Data is being piped in - read the file path from stdin
1747
+ piped_input = sys.stdin.read().strip()
1748
+ if piped_input:
1749
+ data_source = piped_input
1750
+
1751
+ # Determine the format from the file extension
1752
+ if piped_input.endswith(".parquet"):
1753
+ format_type = "Parquet"
1754
+ elif piped_input.endswith(".csv"):
1755
+ format_type = "CSV"
1756
+ else:
1757
+ format_type = "unknown"
1758
+
1759
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1760
+ else:
1761
+ console.print("[red]Error:[/red] No data provided via pipe")
1762
+ sys.exit(1)
1763
+ else:
1764
+ # Show concise help and exit
1765
+ _show_concise_help("missing", None)
1766
+ return
1767
+
1430
1768
  with console.status("[bold green]Loading data..."):
1431
1769
  # Load the data source using the centralized function
1432
1770
  data = _load_data_source(data_source)
1433
1771
 
1434
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1772
+ # Check if this is a piped data source and create friendly display name
1773
+ is_piped_data = _is_piped_data_source(data_source)
1774
+
1775
+ if is_piped_data:
1776
+ if data_source.endswith(".parquet"):
1777
+ display_source = "Parquet file via `pb pl`"
1778
+ elif data_source.endswith(".csv"):
1779
+ display_source = "CSV file via `pb pl`"
1780
+ else:
1781
+ display_source = "File via `pb pl`"
1782
+ console.print(
1783
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
1784
+ )
1785
+ else:
1786
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1435
1787
 
1436
1788
  # Generate missing values table
1437
1789
  with console.status("[bold green]Analyzing missing values..."):
@@ -1447,7 +1799,38 @@ def missing(data_source: str, output_html: str | None):
1447
1799
  console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
1448
1800
  else:
1449
1801
  # Display in terminal with special missing values formatting
1450
- _rich_print_missing_table(gt_table, original_data)
1802
+ # Create enhanced context info for missing table display
1803
+ missing_info = {}
1804
+ try:
1805
+ # Determine source type and table type for enhanced preview title
1806
+ if is_piped_data:
1807
+ if data_source.endswith(".parquet"):
1808
+ source_type = "Polars expression (serialized to Parquet) from `pb pl`"
1809
+ elif data_source.endswith(".csv"):
1810
+ source_type = "Polars expression (serialized to CSV) from `pb pl`"
1811
+ else:
1812
+ source_type = "Polars expression from `pb pl`"
1813
+ elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
1814
+ source_type = f"Pointblank dataset: {data_source}"
1815
+ else:
1816
+ source_type = f"External source: {data_source}"
1817
+
1818
+ missing_info = {
1819
+ "source_type": source_type,
1820
+ "table_type": _get_tbl_type(original_data),
1821
+ "total_rows": pb.get_row_count(original_data),
1822
+ "total_columns": pb.get_column_count(original_data),
1823
+ }
1824
+ except Exception:
1825
+ # Use defaults if metadata extraction fails
1826
+ missing_info = {
1827
+ "source_type": f"Data source: {data_source}",
1828
+ "table_type": "unknown",
1829
+ "total_rows": None,
1830
+ "total_columns": None,
1831
+ }
1832
+
1833
+ _rich_print_missing_table_enhanced(gt_table, original_data, missing_info)
1451
1834
 
1452
1835
  except Exception as e:
1453
1836
  console.print(f"[red]Error:[/red] {e}")
@@ -1455,10 +1838,11 @@ def missing(data_source: str, output_html: str | None):
1455
1838
 
1456
1839
 
1457
1840
  @cli.command(name="validate")
1458
- @click.argument("data_source", type=str)
1841
+ @click.argument("data_source", type=str, required=False)
1842
+ @click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
1459
1843
  @click.option(
1460
1844
  "--check",
1461
- "checks", # Changed to collect multiple values
1845
+ "checks",
1462
1846
  type=click.Choice(
1463
1847
  [
1464
1848
  "rows-distinct",
@@ -1472,25 +1856,25 @@ def missing(data_source: str, output_html: str | None):
1472
1856
  "col-vals-le",
1473
1857
  ]
1474
1858
  ),
1859
+ metavar="CHECK_TYPE",
1475
1860
  multiple=True, # Allow multiple --check options
1476
1861
  help="Type of validation check to perform. Can be used multiple times for multiple checks.",
1477
1862
  )
1478
- @click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
1479
1863
  @click.option(
1480
1864
  "--column",
1481
- "columns", # Changed to collect multiple values
1865
+ "columns",
1482
1866
  multiple=True, # Allow multiple --column options
1483
1867
  help="Column name or integer position as #N (1-based index) for validation.",
1484
1868
  )
1485
1869
  @click.option(
1486
1870
  "--set",
1487
- "sets", # Changed to collect multiple values
1871
+ "sets",
1488
1872
  multiple=True, # Allow multiple --set options
1489
1873
  help="Comma-separated allowed values for col-vals-in-set checks.",
1490
1874
  )
1491
1875
  @click.option(
1492
1876
  "--value",
1493
- "values", # Changed to collect multiple values
1877
+ "values",
1494
1878
  type=float,
1495
1879
  multiple=True, # Allow multiple --value options
1496
1880
  help="Numeric value for comparison checks.",
@@ -1502,17 +1886,17 @@ def missing(data_source: str, output_html: str | None):
1502
1886
  "--write-extract", type=str, help="Save failing rows to folder. Provide base name for folder."
1503
1887
  )
1504
1888
  @click.option(
1505
- "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
1889
+ "--limit", default=500, help="Maximum number of failing rows to save to CSV (default: 500)"
1506
1890
  )
1507
1891
  @click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
1508
1892
  @click.pass_context
1509
1893
  def validate(
1510
1894
  ctx: click.Context,
1511
- data_source: str,
1512
- checks: tuple[str, ...], # Changed to tuple
1513
- columns: tuple[str, ...], # Changed to tuple
1514
- sets: tuple[str, ...], # Changed to tuple
1515
- values: tuple[float, ...], # Changed to tuple
1895
+ data_source: str | None,
1896
+ checks: tuple[str, ...],
1897
+ columns: tuple[str, ...],
1898
+ sets: tuple[str, ...],
1899
+ values: tuple[float, ...],
1516
1900
  show_extract: bool,
1517
1901
  write_extract: str | None,
1518
1902
  limit: int,
@@ -1534,21 +1918,21 @@ def validate(
1534
1918
  - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
1535
1919
  - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
1536
1920
 
1537
- AVAILABLE CHECKS:
1921
+ AVAILABLE CHECK_TYPES:
1538
1922
 
1539
1923
  Use --list-checks to see all available validation methods with examples.
1540
1924
 
1541
- The default check is 'rows-distinct' which checks for duplicate rows.
1925
+ The default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows.
1542
1926
 
1543
1927
  \b
1544
1928
  - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
1545
1929
  - rows-complete: Check if all rows are complete (no missing values in any column)
1546
1930
  - col-exists: Check if a specific column exists in the dataset (requires --column)
1547
1931
  - col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
1548
- - col-vals-gt: Check if all values in a column are greater than a threshold (requires --column and --value)
1549
- - col-vals-ge: Check if all values in a column are greater than or equal to a threshold (requires --column and --value)
1550
- - col-vals-lt: Check if all values in a column are less than a threshold (requires --column and --value)
1551
- - col-vals-le: Check if all values in a column are less than or equal to a threshold (requires --column and --value)
1932
+ - col-vals-gt: Check if all values in a column are greater than a comparison value (requires --column and --value)
1933
+ - col-vals-ge: Check if all values in a column are greater than or equal to a comparison value (requires --column and --value)
1934
+ - col-vals-lt: Check if all values in a column are less than a comparison value (requires --column and --value)
1935
+ - col-vals-le: Check if all values in a column are less than or equal to a comparison value (requires --column and --value)
1552
1936
  - col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
1553
1937
 
1554
1938
  Examples:
@@ -1571,28 +1955,9 @@ def validate(
1571
1955
  pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
1572
1956
  """
1573
1957
  try:
1574
- # Handle backward compatibility and parameter conversion
1575
1958
  import sys
1576
1959
 
1577
- # Convert parameter tuples to lists, handling default case
1578
- if not checks:
1579
- # No --check options provided, use default
1580
- checks_list = ["rows-distinct"]
1581
- is_using_default_check = True
1582
- else:
1583
- checks_list = list(checks)
1584
- is_using_default_check = False
1585
-
1586
- columns_list = list(columns) if columns else []
1587
- sets_list = list(sets) if sets else []
1588
- values_list = list(values) if values else []
1589
-
1590
- # Map parameters to checks intelligently
1591
- mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
1592
- checks_list, columns_list, sets_list, values_list
1593
- )
1594
-
1595
- # Handle --list-checks option
1960
+ # Handle --list-checks option early (doesn't need data source)
1596
1961
  if list_checks:
1597
1962
  console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
1598
1963
  console.print()
@@ -1616,14 +1981,16 @@ def validate(
1616
1981
  "[bold magenta]Value comparison checks [bright_black](require --column and --value)[/bright_black]:[/bold magenta]"
1617
1982
  )
1618
1983
  console.print(
1619
- " • [bold cyan]col-vals-gt[/bold cyan] Values greater than threshold"
1984
+ " • [bold cyan]col-vals-gt[/bold cyan] Values greater than comparison value"
1620
1985
  )
1621
1986
  console.print(
1622
- " • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to threshold"
1987
+ " • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to comparison value"
1623
1988
  )
1624
- console.print(" • [bold cyan]col-vals-lt[/bold cyan] Values less than threshold")
1625
1989
  console.print(
1626
- " • [bold cyan]col-vals-le[/bold cyan] Values less than or equal to threshold"
1990
+ " • [bold cyan]col-vals-lt[/bold cyan] Values less than comparison value"
1991
+ )
1992
+ console.print(
1993
+ " • [bold cyan]col-vals-le[/bold cyan] Values less than or equal to comparison value"
1627
1994
  )
1628
1995
  console.print()
1629
1996
  console.print(
@@ -1634,19 +2001,65 @@ def validate(
1634
2001
  )
1635
2002
  console.print()
1636
2003
  console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
2004
+ console.print(" [bright_blue]pb validate data.csv --check rows-distinct[/bright_blue]")
1637
2005
  console.print(
1638
- f" [bright_blue]pb validate {data_source} --check rows-distinct[/bright_blue]"
1639
- )
1640
- console.print(
1641
- f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
2006
+ " [bright_blue]pb validate data.csv --check col-vals-not-null --column price[/bright_blue]"
1642
2007
  )
1643
2008
  console.print(
1644
- f" [bright_blue]pb validate {data_source} --check col-vals-gt --column age --value 18[/bright_blue]"
2009
+ " [bright_blue]pb validate data.csv --check col-vals-gt --column age --value 18[/bright_blue]"
1645
2010
  )
1646
2011
  import sys
1647
2012
 
1648
2013
  sys.exit(0)
1649
2014
 
2015
+ # Check if data_source is provided (required for all operations except --list-checks)
2016
+ # or if we have piped input
2017
+ if data_source is None:
2018
+ # Check if we have piped input
2019
+ if not sys.stdin.isatty():
2020
+ # Data is being piped in: read the file path from stdin
2021
+ piped_input = sys.stdin.read().strip()
2022
+ if piped_input:
2023
+ data_source = piped_input
2024
+
2025
+ # Determine the format from the file extension
2026
+ if piped_input.endswith(".parquet"):
2027
+ format_type = "Parquet"
2028
+ elif piped_input.endswith(".csv"):
2029
+ format_type = "CSV"
2030
+ else:
2031
+ format_type = "unknown"
2032
+
2033
+ console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
2034
+ else:
2035
+ console.print("[red]Error:[/red] No data provided via pipe")
2036
+ sys.exit(1)
2037
+ else:
2038
+ # Show concise help and exit
2039
+ _show_concise_help("validate", None)
2040
+ return
2041
+
2042
+ # Handle backward compatibility and parameter conversion
2043
+ import sys
2044
+
2045
+ # Convert parameter tuples to lists, handling default case
2046
+ if not checks:
2047
+ # No --check options provided, use default
2048
+ checks_list = ["rows-distinct"]
2049
+ is_using_default_check = True
2050
+ else:
2051
+ checks_list = list(checks)
2052
+ is_using_default_check = False
2053
+
2054
+ columns_list = list(columns) if columns else []
2055
+ sets_list = list(sets) if sets else []
2056
+ values_list = list(values) if values else []
2057
+
2058
+ # Map parameters to checks intelligently
2059
+ mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
2060
+ checks_list, columns_list, sets_list, values_list
2061
+ )
2062
+
1650
2063
  # Validate required parameters for different check types
1651
2064
  # Check parameters for each check in the list using mapped parameters
1652
2065
  for i, check in enumerate(checks_list):
@@ -1732,7 +2145,25 @@ def validate(
1732
2145
  checks_list, columns_list, sets_list, values_list
1733
2146
  )
1734
2147
 
1735
- console.print(f"[green]✓[/green] Loaded data source: {data_source}")
2148
+ # Check if this is a piped data source and create friendly display name
2149
+ is_piped_data = (
2150
+ data_source
2151
+ and data_source.startswith("/var/folders/")
2152
+ and ("pb_pipe_" in data_source or "/T/" in data_source)
2153
+ )
2154
+
2155
+ if is_piped_data:
2156
+ if data_source.endswith(".parquet"):
2157
+ display_source = "Parquet file via `pb pl`"
2158
+ elif data_source.endswith(".csv"):
2159
+ display_source = "CSV file via `pb pl`"
2160
+ else:
2161
+ display_source = "File via `pb pl`"
2162
+ console.print(
2163
+ f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
2164
+ )
2165
+ else:
2166
+ console.print(f"[green]✓[/green] Loaded data source: {data_source}")
1736
2167
 
1737
2168
  # Build a single validation object with chained checks
1738
2169
  with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
@@ -1791,7 +2222,7 @@ def validate(
1791
2222
 
1792
2223
  # Display results based on whether we have single or multiple checks
1793
2224
  if len(checks_list) == 1:
1794
- # Single check - use current display format
2225
+ # Single check: use current display format
1795
2226
  _display_validation_result(
1796
2227
  validation,
1797
2228
  checks_list,
@@ -1806,7 +2237,7 @@ def validate(
1806
2237
  limit,
1807
2238
  )
1808
2239
  else:
1809
- # Multiple checks - use stacked display format
2240
+ # Multiple checks: use stacked display format
1810
2241
  any_failed = False
1811
2242
  for i in range(len(checks_list)):
1812
2243
  console.print() # Add spacing between results
@@ -1845,7 +2276,7 @@ def validate(
1845
2276
  console.print()
1846
2277
  console.print("[bold magenta]Common validation options:[/bold magenta]")
1847
2278
  console.print(
1848
- " • [bold cyan]--check rows-complete[/bold cyan] Check for rows with missing values"
2279
+ " • [bold cyan]--check rows-complete[/bold cyan] Check for rows with missing values"
1849
2280
  )
1850
2281
  console.print(
1851
2282
  " • [bold cyan]--check col-vals-not-null[/bold cyan] Check for null values in a column [bright_black](requires --column)[/bright_black]"
@@ -1955,81 +2386,284 @@ def requirements():
1955
2386
  console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
1956
2387
 
1957
2388
 
1958
- def _rich_print_scan_table(
1959
- scan_result: Any,
1960
- data_source: str,
1961
- source_type: str,
1962
- table_type: str,
1963
- total_rows: int | None = None,
1964
- total_columns: int | None = None,
2389
def _ellipsize_missing_cell(text: str, limit: int, keep: int) -> str:
    """Return `text` unchanged if it fits in `limit` characters, else its first `keep` plus "…"."""
    return text if len(text) <= limit else text[:keep] + "…"


def _missing_enhanced_row_cells(
    row: dict,
    sector_columns: list[str],
    column_types: dict,
    name_width: int,
    type_width: int,
) -> list[str]:
    """Build the cell strings (name, dtype, one per sector) for one missing-values row."""
    column_name = str(row.get("columns", ""))

    # Over-long names keep `name_width - 3` characters plus the ellipsis, dtypes keep
    # `type_width - 1`; both cut-offs are the ones this display has always used.
    cells = [_ellipsize_missing_cell(column_name, name_width, name_width - 3)]

    if column_name in column_types:
        cells.append(_ellipsize_missing_cell(column_types[column_name], type_width, type_width - 1))
    else:
        # No dtype known for this column (e.g., original data not supplied)
        cells.append("?")

    for sector in sector_columns:
        value = row.get(sector, 0.0)
        if isinstance(value, (int, float)):
            cells.append(_format_missing_percentage(float(value)))
        else:
            cells.append(str(value))

    return cells


def _rich_print_missing_table_enhanced(
    gt_table: Any, original_data: Any = None, missing_info: dict | None = None
) -> None:
    """Print a missing values GT table as a Rich table with a metadata title.

    The title line, an optional "Row Sectors" spanner, the table itself and a symbol legend
    are written to the module-level `console`. If the GT object exposes no underlying data,
    or anything unexpected fails, rendering is delegated to `_rich_print_gt_table()`.

    Args:
        gt_table: The GT table object for missing values. Its data is looked up in the
            `_tbl_data`, `_data` and `data` attributes, in that order.
        original_data: The original data source; only used to look up column dtypes.
        missing_info: Optional dict with the keys `source_type`, `table_type`, `total_rows`
            and `total_columns`. Missing keys (or no dict at all) fall back to generic labels.
    """
    try:
        # Extract the underlying data from the GT table: first non-None attribute wins
        df = None
        for attr in ("_tbl_data", "_data", "data"):
            candidate = getattr(gt_table, attr, None)
            if candidate is not None:
                df = candidate
                break

        if df is None:
            # Nothing to introspect: fall back to the regular table display
            _rich_print_gt_table(gt_table)
            return

        from rich.box import SIMPLE_HEAD

        # Widths shared by the spanner padding and the table columns so they cannot drift
        name_width = 20
        type_width = 10
        sector_width = 5  # Fixed width optimized for percentage values such as "100%"

        # Extract metadata from missing_info or use defaults
        info = missing_info or {}
        source_type = info.get("source_type", "Data source")
        table_type = info.get("table_type", "unknown")
        total_rows = info.get("total_rows")
        total_columns = info.get("total_columns")

        # Create enhanced title matching the scan table format
        title_text = f"Missing Values / {source_type} / {table_type}"

        # Add dimensions subtitle in gray if available
        if total_rows is not None and total_columns is not None:
            title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"

        # Get column names
        columns = []
        try:
            if hasattr(df, "columns"):
                columns = list(df.columns)
            elif hasattr(df, "schema"):
                columns = list(df.schema.names)
        except Exception as e:
            console.print(f"[red]Error getting columns:[/red] {e}")

        if not columns:
            columns = [f"Column {i + 1}" for i in range(10)]  # Fallback

        # Get column types from the original data (best effort: empty dict on failure)
        column_types = {}
        if original_data is not None:
            try:
                if hasattr(original_data, "columns"):
                    original_columns = list(original_data.columns)
                    column_types = _get_column_dtypes(original_data, original_columns)
            except Exception as e:
                console.print(f"[red]Error getting column types:[/red] {e}")

        # Sector columns are the ones with purely numeric names; "columns" holds the names
        sector_columns = [col for col in columns if col != "columns" and col.isdigit()]

        # Print the title first (the Rich table below is created without a title)
        console.print()
        console.print(f"[bold cyan]{title_text}[/bold cyan]")

        # Show the custom spanner header if we have sector columns
        if sector_columns:
            # Blank padding standing in for the "Column" and "Type" headers
            lead = " " * name_width + " " + " " * type_width + " "
            console.print("[dim]" + lead + "Row Sectors" + "[/dim]")
            # Fixed-width horizontal rule below the spanner text
            console.print("[dim]" + lead + "─" * 20 + "[/dim]")

        rich_table = Table(
            show_header=True,
            header_style="bold magenta",
            box=SIMPLE_HEAD,
        )

        # Two separate columns: column name and data type
        rich_table.add_column("Column", style="cyan", no_wrap=True, width=name_width)
        rich_table.add_column("Type", style="yellow", no_wrap=True, width=type_width)

        # Sector columns: all the same width
        for sector in sector_columns:
            rich_table.add_column(
                sector,
                style="cyan",
                justify="center",
                no_wrap=True,
                width=sector_width,
            )

        # Convert data to rows with special formatting
        rows = []
        try:
            # `to_dicts()` / `to_dict("records")` are presumably the Polars / pandas
            # spellings of "list of row dicts"; anything else yields no rows.
            if hasattr(df, "to_dicts"):
                records = df.to_dicts()
            elif hasattr(df, "to_dict"):
                records = df.to_dict("records")
            else:
                records = []

            for i, row in enumerate(records):
                try:
                    rows.append(
                        _missing_enhanced_row_cells(
                            row, sector_columns, column_types, name_width, type_width
                        )
                    )
                except Exception as e:
                    # A bad row is reported and skipped; the remaining rows still render
                    console.print(f"[red]Error processing row {i}:[/red] {e}")

        except Exception as e:
            console.print(f"[red]Error extracting data:[/red] {e}")
            rows = [["Error extracting data", "?", *([""] * len(sector_columns))]]

        # Add rows to Rich table, stopping at the first row Rich rejects
        for row in rows:
            try:
                rich_table.add_row(*row)
            except Exception as e:
                console.print(f"[red]Error adding row:[/red] {e}")
                break

        # Print the Rich table (without title since we already printed it)
        console.print(rich_table)

        footer_text = (
            "[dim]Symbols: [green]●[/green] = no missing vals in sector, "
            "[red]●[/red] = all vals completely missing, "
            "[cyan]x%[/cyan] = percentage missing[/dim]"
        )
        console.print(footer_text)

    except Exception as e:
        console.print(f"[red]Error rendering missing values table:[/red] {e}")
        # Fallback to regular table display
        _rich_print_gt_table(gt_table)
2590
+
2591
+
2592
+ def _rich_print_scan_table(
2593
+ scan_result: Any,
2594
+ data_source: str,
2595
+ source_type: str,
2596
+ table_type: str,
2597
+ total_rows: int | None = None,
2598
+ total_columns: int | None = None,
2599
+ ) -> None:
2600
+ """
2601
+ Display scan results as a Rich table in the terminal with statistical measures.
2602
+
2603
+ Args:
2604
+ scan_result: The GT object from col_summary_tbl()
2605
+ data_source: Name of the data source being scanned
2606
+ source_type: Type of data source (e.g., "Pointblank dataset: small_table")
2607
+ table_type: Type of table (e.g., "polars.LazyFrame")
2608
+ total_rows: Total number of rows in the dataset
2609
+ total_columns: Total number of columns in the dataset
2610
+ """
2611
+ try:
2612
+ import re
2613
+
2614
+ import narwhals as nw
2615
+ from rich.box import SIMPLE_HEAD
2616
+
2617
+ # Extract the underlying DataFrame from the GT object
2618
+ # The GT object has a _tbl_data attribute that contains the DataFrame
2619
+ gt_data = scan_result._tbl_data
2620
+
2621
+ # Convert to Narwhals DataFrame for consistent handling
2622
+ nw_data = nw.from_native(gt_data)
2623
+
2624
+ # Convert to dictionary for easier access
2625
+ data_dict = nw_data.to_dict(as_series=False)
2626
+
2627
+ # Create main scan table with missing data table styling
2628
+ # Create a comprehensive title with data source, source type, and table type
2629
+ title_text = f"Column Summary / {source_type} / {table_type}"
2630
+
2631
+ # Add dimensions subtitle in gray if available
2632
+ if total_rows is not None and total_columns is not None:
2633
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2634
+
2635
+ # Create the scan table
2636
+ scan_table = Table(
2637
+ title=title_text,
2638
+ show_header=True,
2639
+ header_style="bold magenta",
2640
+ box=SIMPLE_HEAD,
2641
+ title_style="bold cyan",
2642
+ title_justify="left",
2643
+ )
2644
+
2645
+ # Add columns with specific styling and appropriate widths
2646
+ scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
2647
+ scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
2648
+ scan_table.add_column(
2649
+ "NA", style="red", width=6, justify="right"
2650
+ ) # Adjusted for better formatting
2651
+ scan_table.add_column(
2652
+ "UQ", style="green", width=8, justify="right"
2653
+ ) # Adjusted for boolean values
2654
+
2655
+ # Add statistical columns if they exist with appropriate widths
2656
+ stat_columns = []
2657
+ column_mapping = {
2658
+ "mean": ("Mean", "blue", 9),
2659
+ "std": ("SD", "blue", 9),
2660
+ "min": ("Min", "yellow", 9),
2661
+ "median": ("Med", "yellow", 9),
2662
+ "max": ("Max", "yellow", 9),
2663
+ "q_1": ("Q₁", "magenta", 8),
2664
+ "q_3": ("Q₃", "magenta", 9),
2665
+ "iqr": ("IQR", "magenta", 8),
2666
+ }
2033
2667
 
2034
2668
  for col_key, (display_name, color, width) in column_mapping.items():
2035
2669
  if col_key in data_dict:
@@ -2070,7 +2704,7 @@ def _rich_print_scan_table(
2070
2704
  # Clean up HTML formatting from the raw data
2071
2705
  str_val = str(value)
2072
2706
 
2073
- # Handle multi-line values with <br> tags FIRST - take the first line (absolute number)
2707
+ # Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
2074
2708
  if "<br>" in str_val:
2075
2709
  str_val = str_val.split("<br>")[0].strip()
2076
2710
  # For unique values, we want just the integer part
@@ -2089,14 +2723,14 @@ def _rich_print_scan_table(
2089
2723
  # Clean up extra whitespace
2090
2724
  str_val = re.sub(r"\s+", " ", str_val).strip()
2091
2725
 
2092
- # Handle values like "2<.01" - extract the first number
2726
+ # Handle values like "2<.01": extract the first number
2093
2727
  if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
2094
2728
  # Extract number before the < symbol
2095
2729
  before_lt = str_val.split("<")[0].strip()
2096
2730
  if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
2097
2731
  str_val = before_lt
2098
2732
 
2099
- # Handle boolean unique values like "T0.62F0.38" - extract the more readable format
2733
+ # Handle boolean unique values like "T0.62F0.38": extract the more readable format
2100
2734
  if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
2101
2735
  # Extract T and F values
2102
2736
  t_match = re.search(r"T(\d+\.\d+)", str_val)
@@ -2126,7 +2760,7 @@ def _rich_print_scan_table(
2126
2760
  # Simple integers under 10000
2127
2761
  return str(int(num_val))
2128
2762
  elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
2129
- # Likely dates in YYYYMMDD format - format as date-like
2763
+ # Likely dates in YYYYMMDD format: format as date-like
2130
2764
  int_val = int(num_val)
2131
2765
  if 19000101 <= int_val <= 29991231: # Reasonable date range
2132
2766
  str_date = str(int_val)
@@ -2138,29 +2772,29 @@ def _rich_print_scan_table(
2138
2772
  # Otherwise treat as large number
2139
2773
  return f"{num_val / 1000000:.1f}M"
2140
2774
  elif abs(num_val) >= 1000000:
2141
- # Large numbers - use scientific notation or M/k notation
2775
+ # Large numbers: use scientific notation or M/k notation
2142
2776
 
2143
2777
  if abs(num_val) >= 1000000000:
2144
2778
  return f"{num_val:.1e}"
2145
2779
  else:
2146
2780
  return f"{num_val / 1000000:.1f}M"
2147
2781
  elif abs(num_val) >= 10000:
2148
- # Numbers >= 10k - use compact notation
2782
+ # Numbers >= 10k: use compact notation
2149
2783
  return f"{num_val / 1000:.1f}k"
2150
2784
  elif abs(num_val) >= 100:
2151
- # Numbers 100-9999 - show with minimal decimals
2785
+ # Numbers 100-9999: show with minimal decimals
2152
2786
  return f"{num_val:.1f}"
2153
2787
  elif abs(num_val) >= 10:
2154
- # Numbers 10-99 - show with one decimal
2788
+ # Numbers 10-99: show with one decimal
2155
2789
  return f"{num_val:.1f}"
2156
2790
  elif abs(num_val) >= 1:
2157
- # Numbers 1-9 - show with two decimals
2791
+ # Numbers 1-9: show with two decimals
2158
2792
  return f"{num_val:.2f}"
2159
2793
  elif abs(num_val) >= 0.01:
2160
- # Small numbers - show with appropriate precision
2794
+ # Small numbers: show with appropriate precision
2161
2795
  return f"{num_val:.2f}"
2162
2796
  else:
2163
- # Very small numbers - use scientific notation
2797
+ # Very small numbers: use scientific notation
2164
2798
 
2165
2799
  return f"{num_val:.1e}"
2166
2800
 
@@ -2168,7 +2802,7 @@ def _rich_print_scan_table(
2168
2802
  # Not a number, handle as string
2169
2803
  pass
2170
2804
 
2171
- # Handle date/datetime strings - show abbreviated format
2805
+ # Handle date/datetime strings: show abbreviated format
2172
2806
  if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
2173
2807
  # Likely a date/datetime, show abbreviated
2174
2808
  if len(str_val) > max_width:
@@ -2244,8 +2878,36 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
2244
2878
  if df is not None:
2245
2879
  from rich.box import SIMPLE_HEAD
2246
2880
 
2247
- # Create the missing values table
2248
- rich_table = Table(show_header=True, header_style="bold magenta", box=SIMPLE_HEAD)
2881
+ # Get metadata for enhanced missing table title
2882
+ total_rows = None
2883
+ total_columns = None
2884
+ source_type = "Data source"
2885
+ table_type = "unknown"
2886
+
2887
+ if original_data is not None:
2888
+ try:
2889
+ total_rows = pb.get_row_count(original_data)
2890
+ total_columns = pb.get_column_count(original_data)
2891
+ table_type = _get_tbl_type(original_data)
2892
+ except Exception:
2893
+ pass
2894
+
2895
+ # Create enhanced title matching the scan table format
2896
+ title_text = f"Missing Values / {source_type} / {table_type}"
2897
+
2898
+ # Add dimensions subtitle in gray if available
2899
+ if total_rows is not None and total_columns is not None:
2900
+ title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
2901
+
2902
+ # Create the missing values table with enhanced title
2903
+ rich_table = Table(
2904
+ title=title_text,
2905
+ show_header=True,
2906
+ header_style="bold magenta",
2907
+ box=SIMPLE_HEAD,
2908
+ title_style="bold cyan",
2909
+ title_justify="left",
2910
+ )
2249
2911
 
2250
2912
  # Get column names
2251
2913
  columns = []
@@ -2377,12 +3039,12 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
2377
3039
  console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
2378
3040
 
2379
3041
  # Print the Rich table (will handle terminal width automatically)
3042
+ console.print()
2380
3043
  console.print(rich_table)
2381
3044
  footer_text = (
2382
- "[dim]Symbols: [green]●[/green] = no missing values, "
2383
- "[red]●[/red] = completely missing, "
2384
- "<1% = less than 1% missing, "
2385
- ">99% = more than 99% missing[/dim]"
3045
+ "[dim]Symbols: [green]●[/green] = no missing vals in sector, "
3046
+ "[red]●[/red] = all vals completely missing, "
3047
+ "[cyan]x%[/cyan] = percentage missing[/dim]"
2386
3048
  )
2387
3049
  console.print(footer_text)
2388
3050
 
@@ -2521,6 +3183,20 @@ def _display_validation_result(
2521
3183
  set_val = sets_list[step_index] if step_index < len(sets_list) else None
2522
3184
  value = values_list[step_index] if step_index < len(values_list) else None
2523
3185
 
3186
+ # Check if this is piped data
3187
+ is_piped_data = _is_piped_data_source(data_source)
3188
+
3189
+ # Create friendly display name for data source
3190
+ if is_piped_data:
3191
+ if data_source.endswith(".parquet"):
3192
+ display_source = "Polars expression (serialized to Parquet) from `pb pl`"
3193
+ elif data_source.endswith(".csv"):
3194
+ display_source = "Polars expression (serialized to CSV) from `pb pl`"
3195
+ else:
3196
+ display_source = "Polars expression from `pb pl`"
3197
+ else:
3198
+ display_source = data_source
3199
+
2524
3200
  # Get validation step info
2525
3201
  step_info = None
2526
3202
  if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
@@ -2528,7 +3204,7 @@ def _display_validation_result(
2528
3204
 
2529
3205
  # Create friendly title for table
2530
3206
  if total_checks == 1:
2531
- # Single check - use original title format
3207
+ # Single check: use original title format
2532
3208
  if check == "rows-distinct":
2533
3209
  table_title = "Validation Result: Rows Distinct"
2534
3210
  elif check == "col-vals-not-null":
@@ -2550,7 +3226,7 @@ def _display_validation_result(
2550
3226
  else:
2551
3227
  table_title = f"Validation Result: {check.replace('-', ' ').title()}"
2552
3228
  else:
2553
- # Multiple checks - add numbering
3229
+ # Multiple checks: add numbering
2554
3230
  if check == "rows-distinct":
2555
3231
  base_title = "Rows Distinct"
2556
3232
  elif check == "col-vals-not-null":
@@ -2587,7 +3263,7 @@ def _display_validation_result(
2587
3263
  result_table.add_column("Value", style="white")
2588
3264
 
2589
3265
  # Add basic info
2590
- result_table.add_row("Data Source", data_source)
3266
+ result_table.add_row("Data Source", display_source)
2591
3267
  result_table.add_row("Check Type", check)
2592
3268
 
2593
3269
  # Add column info for column-specific checks
@@ -2617,7 +3293,7 @@ def _display_validation_result(
2617
3293
  operator = "<"
2618
3294
  elif check == "col-vals-le":
2619
3295
  operator = "<="
2620
- result_table.add_row("Threshold", f"{operator} {value}")
3296
+ result_table.add_row("Comparison Value", f"{operator} {value}")
2621
3297
 
2622
3298
  # Get validation details
2623
3299
  if step_info:
@@ -2728,6 +3404,7 @@ def _display_validation_result(
2728
3404
  Panel(
2729
3405
  success_message,
2730
3406
  border_style="green",
3407
+ expand=False,
2731
3408
  )
2732
3409
  )
2733
3410
  else:
@@ -2757,6 +3434,7 @@ def _display_validation_result(
2757
3434
  Panel(
2758
3435
  failure_message,
2759
3436
  border_style="red",
3437
+ expand=False,
2760
3438
  )
2761
3439
  )
2762
3440
 
@@ -2837,7 +3515,7 @@ def _show_extract_for_multi_check(
2837
3515
  console.print()
2838
3516
  console.print(extract_message)
2839
3517
 
2840
- # Special handling for col-exists check - no rows to show when column doesn't exist
3518
+ # Special handling for col-exists check: no rows to show when column doesn't exist
2841
3519
  if check == "col-exists":
2842
3520
  if show_extract:
2843
3521
  console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
@@ -2848,16 +3526,17 @@ def _show_extract_for_multi_check(
2848
3526
  console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
2849
3527
  else:
2850
3528
  try:
2851
- # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
3529
+ # Get failing rows extract: use step_index + 1 since extracts are 1-indexed
2852
3530
  failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
2853
3531
 
2854
3532
  if failing_rows is not None and len(failing_rows) > 0:
2855
3533
  if show_extract:
2856
- # Limit the number of rows shown
2857
- if len(failing_rows) > limit:
2858
- display_rows = failing_rows.head(limit)
3534
+ # Always limit to 10 rows for display, regardless of limit option
3535
+ display_limit = 10
3536
+ if len(failing_rows) > display_limit:
3537
+ display_rows = failing_rows.head(display_limit)
2859
3538
  console.print(
2860
- f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
3539
+ f"[dim]Showing first {display_limit} of {len(failing_rows)} {row_type}[/dim]"
2861
3540
  )
2862
3541
  else:
2863
3542
  display_rows = failing_rows
@@ -2868,9 +3547,9 @@ def _show_extract_for_multi_check(
2868
3547
 
2869
3548
  preview_table = pb.preview(
2870
3549
  data=display_rows,
2871
- n_head=min(limit, len(display_rows)),
3550
+ n_head=min(display_limit, len(display_rows)),
2872
3551
  n_tail=0,
2873
- limit=limit,
3552
+ limit=display_limit,
2874
3553
  show_row_numbers=True,
2875
3554
  )
2876
3555
 
@@ -2892,7 +3571,7 @@ def _show_extract_for_multi_check(
2892
3571
  filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
2893
3572
  filepath = output_folder / filename
2894
3573
 
2895
- # Limit the output if needed
3574
+ # Use limit option for write_extract
2896
3575
  write_rows = failing_rows
2897
3576
  if len(failing_rows) > limit:
2898
3577
  write_rows = failing_rows.head(limit)
@@ -2946,6 +3625,18 @@ def _show_extract_and_summary(
2946
3625
  """Show extract and summary for a validation step (used for single checks)."""
2947
3626
  step_passed = step_info.n_failed == 0 if step_info else True
2948
3627
 
3628
+ # Get the friendly display name
3629
+ is_piped_data = _is_piped_data_source(data_source)
3630
+ if is_piped_data:
3631
+ if data_source.endswith(".parquet"):
3632
+ display_source = "Polars expression (serialized to Parquet) from `pb pl`"
3633
+ elif data_source.endswith(".csv"):
3634
+ display_source = "Polars expression (serialized to CSV) from `pb pl`"
3635
+ else:
3636
+ display_source = "Polars expression from `pb pl`"
3637
+ else:
3638
+ display_source = data_source
3639
+
2949
3640
  # Show extract if requested and validation failed
2950
3641
  if (show_extract or write_extract) and not step_passed:
2951
3642
  console.print()
@@ -2997,7 +3688,7 @@ def _show_extract_and_summary(
2997
3688
  if show_extract:
2998
3689
  console.print(extract_message)
2999
3690
 
3000
- # Special handling for col-exists check - no rows to show when column doesn't exist
3691
+ # Special handling for col-exists check: no rows to show when column doesn't exist
3001
3692
  if check == "col-exists" and not step_passed:
3002
3693
  if show_extract:
3003
3694
  console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
@@ -3008,16 +3699,17 @@ def _show_extract_and_summary(
3008
3699
  console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
3009
3700
  else:
3010
3701
  try:
3011
- # Get failing rows extract - use step_index + 1 since extracts are 1-indexed
3702
+ # Get failing rows extract: use step_index + 1 since extracts are 1-indexed
3012
3703
  failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
3013
3704
 
3014
3705
  if failing_rows is not None and len(failing_rows) > 0:
3015
3706
  if show_extract:
3016
- # Limit the number of rows shown
3017
- if len(failing_rows) > limit:
3018
- display_rows = failing_rows.head(limit)
3707
+ # Always limit to 10 rows for display, regardless of limit option
3708
+ display_limit = 10
3709
+ if len(failing_rows) > display_limit:
3710
+ display_rows = failing_rows.head(display_limit)
3019
3711
  console.print(
3020
- f"[dim]Showing first {limit} of {len(failing_rows)} {row_type}[/dim]"
3712
+ f"[dim]Showing first {display_limit} of {len(failing_rows)} {row_type}[/dim]"
3021
3713
  )
3022
3714
  else:
3023
3715
  display_rows = failing_rows
@@ -3028,9 +3720,9 @@ def _show_extract_and_summary(
3028
3720
 
3029
3721
  preview_table = pb.preview(
3030
3722
  data=display_rows,
3031
- n_head=min(limit, len(display_rows)),
3723
+ n_head=min(display_limit, len(display_rows)),
3032
3724
  n_tail=0,
3033
- limit=limit,
3725
+ limit=display_limit,
3034
3726
  show_row_numbers=True,
3035
3727
  )
3036
3728
 
@@ -3052,7 +3744,7 @@ def _show_extract_and_summary(
3052
3744
  filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
3053
3745
  filepath = output_folder / filename
3054
3746
 
3055
- # Limit the output if needed
3747
+ # Use limit option for write_extract
3056
3748
  write_rows = failing_rows
3057
3749
  if len(failing_rows) > limit:
3058
3750
  write_rows = failing_rows.head(limit)
@@ -3098,84 +3790,84 @@ def _show_extract_and_summary(
3098
3790
  if step_passed:
3099
3791
  if check == "rows-distinct":
3100
3792
  success_message = (
3101
- f"[green]✓ Validation PASSED: No duplicate rows found in {data_source}[/green]"
3793
+ f"[green]✓ Validation PASSED: No duplicate rows found in {display_source}[/green]"
3102
3794
  )
3103
3795
  elif check == "col-vals-not-null":
3104
- success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {data_source}[/green]"
3796
+ success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {display_source}[/green]"
3105
3797
  elif check == "rows-complete":
3106
- success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {data_source}[/green]"
3798
+ success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {display_source}[/green]"
3107
3799
  elif check == "col-exists":
3108
3800
  success_message = (
3109
- f"[green]✓ Validation PASSED: Column '{column}' exists in {data_source}[/green]"
3801
+ f"[green]✓ Validation PASSED: Column '{column}' exists in {display_source}[/green]"
3110
3802
  )
3111
3803
  elif check == "col-vals-in-set":
3112
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {data_source}[/green]"
3804
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {display_source}[/green]"
3113
3805
  elif check == "col-vals-gt":
3114
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {data_source}[/green]"
3806
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {display_source}[/green]"
3115
3807
  elif check == "col-vals-ge":
3116
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {data_source}[/green]"
3808
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {display_source}[/green]"
3117
3809
  elif check == "col-vals-lt":
3118
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {data_source}[/green]"
3810
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {display_source}[/green]"
3119
3811
  elif check == "col-vals-le":
3120
- success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {data_source}[/green]"
3812
+ success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {display_source}[/green]"
3121
3813
  else:
3122
3814
  success_message = (
3123
- f"[green]✓ Validation PASSED: {check} check passed for {data_source}[/green]"
3815
+ f"[green]✓ Validation PASSED: {check} check passed for {display_source}[/green]"
3124
3816
  )
3125
3817
 
3126
- console.print(Panel(success_message, border_style="green"))
3818
+ console.print(Panel(success_message, border_style="green", expand=False))
3127
3819
  else:
3128
3820
  if step_info:
3129
3821
  if check == "rows-distinct":
3130
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {data_source}[/red]"
3822
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {display_source}[/red]"
3131
3823
  elif check == "col-vals-not-null":
3132
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {data_source}[/red]"
3824
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {display_source}[/red]"
3133
3825
  elif check == "rows-complete":
3134
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {data_source}[/red]"
3826
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {display_source}[/red]"
3135
3827
  elif check == "col-exists":
3136
- failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {data_source}[/red]"
3828
+ failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {display_source}[/red]"
3137
3829
  elif check == "col-vals-in-set":
3138
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {data_source}[/red]"
3830
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {display_source}[/red]"
3139
3831
  elif check == "col-vals-gt":
3140
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {data_source}[/red]"
3832
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {display_source}[/red]"
3141
3833
  elif check == "col-vals-ge":
3142
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {data_source}[/red]"
3834
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {display_source}[/red]"
3143
3835
  elif check == "col-vals-lt":
3144
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {data_source}[/red]"
3836
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {display_source}[/red]"
3145
3837
  elif check == "col-vals-le":
3146
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {data_source}[/red]"
3838
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {display_source}[/red]"
3147
3839
  else:
3148
- failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {data_source}[/red]"
3840
+ failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {display_source}[/red]"
3149
3841
 
3150
3842
  # Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
3151
3843
  if not show_extract and check != "col-exists":
3152
3844
  failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
3153
3845
 
3154
- console.print(Panel(failure_message, border_style="red"))
3846
+ console.print(Panel(failure_message, border_style="red", expand=False))
3155
3847
  else:
3156
3848
  if check == "rows-distinct":
3157
3849
  failure_message = (
3158
- f"[red]✗ Validation FAILED: Duplicate rows found in {data_source}[/red]"
3850
+ f"[red]✗ Validation FAILED: Duplicate rows found in {display_source}[/red]"
3159
3851
  )
3160
3852
  elif check == "rows-complete":
3161
3853
  failure_message = (
3162
- f"[red]✗ Validation FAILED: Incomplete rows found in {data_source}[/red]"
3854
+ f"[red]✗ Validation FAILED: Incomplete rows found in {display_source}[/red]"
3163
3855
  )
3164
3856
  else:
3165
3857
  failure_message = (
3166
- f"[red]✗ Validation FAILED: {check} check failed for {data_source}[/red]"
3858
+ f"[red]✗ Validation FAILED: {check} check failed for {display_source}[/red]"
3167
3859
  )
3168
3860
 
3169
3861
  # Add hint about --show-extract if not already used
3170
3862
  if not show_extract:
3171
3863
  failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
3172
3864
 
3173
- console.print(Panel(failure_message, border_style="red"))
3865
+ console.print(Panel(failure_message, border_style="red", expand=False))
3174
3866
 
3175
3867
 
3176
3868
  @cli.command()
3177
- @click.argument("output_file", type=click.Path())
3178
- def make_template(output_file: str):
3869
+ @click.argument("output_file", type=click.Path(), required=False)
3870
+ def make_template(output_file: str | None):
3179
3871
  """
3180
3872
  Create a validation script template.
3181
3873
 
@@ -3191,11 +3883,19 @@ def make_template(output_file: str):
3191
3883
  pb make-template my_validation.py
3192
3884
  pb make-template validation_template.py
3193
3885
  """
3886
+ # Handle missing output_file with concise help
3887
+ if output_file is None:
3888
+ _show_concise_help("make-template", None)
3889
+ return
3890
+
3194
3891
  example_script = '''"""
3195
3892
  Example Pointblank validation script.
3196
3893
 
3197
3894
  This script demonstrates how to create validation rules for your data.
3198
3895
  Modify the data loading and validation rules below to match your requirements.
3896
+
3897
+ When using 'pb run' with --data option, the CLI will automatically replace
3898
+ the data source in your validation object with the provided data.
3199
3899
  """
3200
3900
 
3201
3901
  import pointblank as pb
@@ -3239,11 +3939,6 @@ validation = (
3239
3939
  # Finalize the validation
3240
3940
  .interrogate()
3241
3941
  )
3242
-
3243
- # The validation object will be automatically used by the CLI
3244
- # You can also access results programmatically:
3245
- # print(f"All passed: {validation.all_passed()}")
3246
- # print(f"Failed steps: {validation.n_failed()}")
3247
3942
  '''
3248
3943
 
3249
3944
  Path(output_file).write_text(example_script)
@@ -3251,13 +3946,17 @@ validation = (
3251
3946
  console.print("\nEdit the template to add your data loading and validation rules, then run:")
3252
3947
  console.print(f"[cyan]pb run {output_file}[/cyan]")
3253
3948
  console.print(
3254
- f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Override data source[/dim]"
3949
+ f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Replace data source automatically[/dim]"
3255
3950
  )
3256
3951
 
3257
3952
 
3258
3953
  @cli.command()
3259
- @click.argument("validation_script", type=click.Path(exists=True))
3260
- @click.option("--data", type=str, help="Optional data source to override script's data loading")
3954
+ @click.argument("validation_script", type=click.Path(exists=True), required=False)
3955
+ @click.option(
3956
+ "--data",
3957
+ type=str,
3958
+ help="Data source to replace in validation objects (single validation scripts only)",
3959
+ )
3261
3960
  @click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
3262
3961
  @click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
3263
3962
  @click.option(
@@ -3269,7 +3968,7 @@ validation = (
3269
3968
  help="Save failing rows to folders (one CSV per step). Provide base name for folder.",
3270
3969
  )
3271
3970
  @click.option(
3272
- "--limit", "-l", default=10, help="Maximum number of failing rows to show/save (default: 10)"
3971
+ "--limit", default=500, help="Maximum number of failing rows to save to CSV (default: 500)"
3273
3972
  )
3274
3973
  @click.option(
3275
3974
  "--fail-on",
@@ -3277,7 +3976,7 @@ validation = (
3277
3976
  help="Exit with non-zero code when validation reaches this threshold level",
3278
3977
  )
3279
3978
  def run(
3280
- validation_script: str,
3979
+ validation_script: str | None,
3281
3980
  data: str | None,
3282
3981
  output_html: str | None,
3283
3982
  output_json: str | None,
@@ -3292,8 +3991,11 @@ def run(
3292
3991
  VALIDATION_SCRIPT should be a Python file that defines validation logic.
3293
3992
  The script should load its own data and create validation objects.
3294
3993
 
3295
- If --data is provided, it will be available as a 'cli_data' variable in the script,
3296
- allowing you to optionally override your script's data loading.
3994
+ If --data is provided, it will automatically replace the data source in your
3995
+ validation objects. This works with scripts containing a single validation.
3996
+ For scripts with multiple validations, use separate script files or remove --data.
3997
+
3998
+ To get started quickly, use 'pb make-template' to create a validation script template.
3297
3999
 
3298
4000
  DATA can be:
3299
4001
 
@@ -3307,6 +4009,7 @@ def run(
3307
4009
  Examples:
3308
4010
 
3309
4011
  \b
4012
+ pb make-template my_validation.py # Create a template first
3310
4013
  pb run validation_script.py
3311
4014
  pb run validation_script.py --data data.csv
3312
4015
  pb run validation_script.py --data small_table --output-html report.html
@@ -3314,6 +4017,11 @@ def run(
3314
4017
  pb run validation_script.py --write-extract extracts_folder --fail-on critical
3315
4018
  """
3316
4019
  try:
4020
+ # Handle missing validation_script with concise help
4021
+ if validation_script is None:
4022
+ _show_concise_help("run", None)
4023
+ return
4024
+
3317
4025
  # Load optional data override if provided
3318
4026
  cli_data = None
3319
4027
  if data:
@@ -3369,19 +4077,85 @@ def run(
3369
4077
 
3370
4078
  console.print(f"[green]✓[/green] Found {len(validations)} validation object(s)")
3371
4079
 
3372
- # Process each validation
3373
- overall_failed = False
3374
- overall_critical = False
3375
- overall_error = False
3376
- overall_warning = False
3377
-
3378
- for i, validation in enumerate(validations, 1):
4080
+ # Implement automatic data replacement for Validate objects if --data was provided
4081
+ if cli_data is not None:
4082
+ # Check if we have multiple validations (this is not supported)
3379
4083
  if len(validations) > 1:
3380
- console.print(f"\n[bold cyan]Validation {i}:[/bold cyan]")
3381
-
3382
- # Display summary
3383
- _display_validation_summary(validation)
3384
-
4084
+ console.print(
4085
+ f"[red]Error: Found {len(validations)} validation objects in the script.[/red]"
4086
+ )
4087
+ console.print(
4088
+ "[yellow]The --data option replaces data in ALL validation objects,[/yellow]"
4089
+ )
4090
+ console.print(
4091
+ "[yellow]which may cause failures if validations expect different schemas.[/yellow]"
4092
+ )
4093
+ console.print("\n[cyan]Options:[/cyan]")
4094
+ console.print(" 1. Split your script into separate files with one validation each")
4095
+ console.print(
4096
+ " 2. Remove the --data option to use each validation's original data"
4097
+ )
4098
+ sys.exit(1)
4099
+
4100
+ console.print(
4101
+ f"[yellow]Replacing data in {len(validations)} validation object(s) with CLI data[/yellow]"
4102
+ )
4103
+
4104
+ for idx, validation in enumerate(validations, 1):
4105
+ # Check if it's a Validate object with data attribute
4106
+ if hasattr(validation, "data") and hasattr(validation, "interrogate"):
4107
+ console.print("[cyan]Updating validation with new data source...[/cyan]")
4108
+
4109
+ # Store the original validation_info as our "plan"
4110
+ original_validation_info = validation.validation_info.copy()
4111
+
4112
+ # Replace the data
4113
+ validation.data = cli_data
4114
+
4115
+ # Re-process the data (same as what happens in __post_init__)
4116
+ from pointblank.validate import _process_data
4117
+
4118
+ validation.data = _process_data(validation.data)
4119
+
4120
+ # Reset validation results but keep the plan
4121
+ validation.validation_info = []
4122
+
4123
+ # Re-add each validation step from the original plan
4124
+ for val_info in original_validation_info:
4125
+ # Create a copy and reset any interrogation results
4126
+ new_val_info = copy.deepcopy(val_info)
4127
+ # Reset interrogation-specific attributes if they exist
4128
+ if hasattr(new_val_info, "n_passed"):
4129
+ new_val_info.n_passed = None
4130
+ if hasattr(new_val_info, "n_failed"):
4131
+ new_val_info.n_failed = None
4132
+ if hasattr(new_val_info, "all_passed"):
4133
+ new_val_info.all_passed = None
4134
+ if hasattr(new_val_info, "warning"):
4135
+ new_val_info.warning = None
4136
+ if hasattr(new_val_info, "error"):
4137
+ new_val_info.error = None
4138
+ if hasattr(new_val_info, "critical"):
4139
+ new_val_info.critical = None
4140
+ validation.validation_info.append(new_val_info)
4141
+
4142
+ # Re-interrogate with the new data
4143
+ console.print("[cyan]Re-interrogating with new data...[/cyan]")
4144
+ validation.interrogate()
4145
+
4146
+ # Process each validation
4147
+ overall_failed = False
4148
+ overall_critical = False
4149
+ overall_error = False
4150
+ overall_warning = False
4151
+
4152
+ for i, validation in enumerate(validations, 1):
4153
+ if len(validations) > 1:
4154
+ console.print(f"\n[bold cyan]Validation {i}:[/bold cyan]")
4155
+
4156
+ # Display summary
4157
+ _display_validation_summary(validation)
4158
+
3385
4159
  # Check failure status
3386
4160
  validation_failed = False
3387
4161
  has_critical = False
@@ -3432,11 +4206,12 @@ def run(
3432
4206
  f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
3433
4207
  )
3434
4208
 
3435
- # Limit the number of rows shown
3436
- if len(failing_rows) > limit:
3437
- display_rows = failing_rows.head(limit)
4209
+ # Always limit to 10 rows for display, regardless of limit option
4210
+ display_limit = 10
4211
+ if len(failing_rows) > display_limit:
4212
+ display_rows = failing_rows.head(display_limit)
3438
4213
  console.print(
3439
- f"[dim]Showing first {limit} of {len(failing_rows)} failing rows[/dim]"
4214
+ f"[dim]Showing first {display_limit} of {len(failing_rows)} failing rows[/dim]"
3440
4215
  )
3441
4216
  else:
3442
4217
  display_rows = failing_rows
@@ -3447,9 +4222,9 @@ def run(
3447
4222
  # Create a preview table using pointblank's preview function
3448
4223
  preview_table = pb.preview(
3449
4224
  data=display_rows,
3450
- n_head=min(limit, len(display_rows)),
4225
+ n_head=min(display_limit, len(display_rows)),
3451
4226
  n_tail=0,
3452
- limit=limit,
4227
+ limit=display_limit,
3453
4228
  show_row_numbers=True,
3454
4229
  )
3455
4230
 
@@ -3502,7 +4277,7 @@ def run(
3502
4277
  filename = f"step_{step_num:02d}_{safe_assertion_type}.csv"
3503
4278
  filepath = output_folder / filename
3504
4279
 
3505
- # Limit the output if needed
4280
+ # Use limit for CSV output
3506
4281
  save_rows = failing_rows
3507
4282
  if hasattr(failing_rows, "head") and len(failing_rows) > limit:
3508
4283
  save_rows = failing_rows.head(limit)
@@ -3521,7 +4296,11 @@ def run(
3521
4296
  pd_data = pd.DataFrame(save_rows)
3522
4297
  pd_data.to_csv(str(filepath), index=False)
3523
4298
 
3524
- saved_files.append((filename, len(failing_rows)))
4299
+ # Record the actual number of rows saved
4300
+ rows_saved = (
4301
+ len(save_rows) if hasattr(save_rows, "__len__") else limit
4302
+ )
4303
+ saved_files.append((filename, rows_saved))
3525
4304
 
3526
4305
  except Exception as e:
3527
4306
  console.print(
@@ -3548,11 +4327,11 @@ def run(
3548
4327
  if output_html:
3549
4328
  try:
3550
4329
  if len(validations) == 1:
3551
- # Single validation - save directly
4330
+ # Single validation: save directly
3552
4331
  html_content = validations[0]._repr_html_()
3553
4332
  Path(output_html).write_text(html_content, encoding="utf-8")
3554
4333
  else:
3555
- # Multiple validations - combine them
4334
+ # Multiple validations: combine them
3556
4335
  html_parts = []
3557
4336
  html_parts.append("<html><body>")
3558
4337
  html_parts.append("<h1>Pointblank Validation Report</h1>")
@@ -3572,11 +4351,11 @@ def run(
3572
4351
  if output_json:
3573
4352
  try:
3574
4353
  if len(validations) == 1:
3575
- # Single validation - save directly
4354
+ # Single validation: save directly
3576
4355
  json_report = validations[0].get_json_report()
3577
4356
  Path(output_json).write_text(json_report, encoding="utf-8")
3578
4357
  else:
3579
- # Multiple validations - combine them
4358
+ # Multiple validations: combine them
3580
4359
  import json
3581
4360
 
3582
4361
  combined_report = {"validations": []}
@@ -3642,3 +4421,768 @@ def _format_missing_percentage(value: float) -> str:
3642
4421
  return ">99%" # More than 99%
3643
4422
  else:
3644
4423
  return f"{int(round(value))}%" # Round to nearest integer with % sign
4424
+
4425
+
4426
# Variable names checked first when locating the result of multi-statement code
_PL_RESULT_NAMES = ("result", "df", "data", "table", "output")

# Names pre-seeded into the execution namespace; never treated as user results
_PL_RESERVED_NAMES = frozenset({"pl", "polars", "__builtins__"})

# Sentinel that distinguishes "no result found" from a query that evaluates to None
_PL_NO_RESULT = object()


def _pl_is_frame(value: Any) -> bool:
    """Return True when `value` duck-types as a DataFrame (`columns`) or LazyFrame (`collect`)."""
    return hasattr(value, "collect") or hasattr(value, "columns")


def _pl_emit_error(pipe: bool, message: str, *hints: str) -> None:
    """Report an error: plain text on stderr when piping, Rich markup on the console otherwise."""
    if pipe:
        # stdout is reserved for the temp-file path consumed by the next command
        print(f"Error: {message}", file=sys.stderr)
        for hint in hints:
            print(hint, file=sys.stderr)
    else:
        console.print(f"[red]Error:[/red] {message}")
        for hint in hints:
            console.print(f"[dim]{hint}[/dim]")


def _pl_report_execution_error(pipe: bool, query_code: str, error: Exception) -> None:
    """Show an execution failure together with the code that triggered it."""
    multi_line = "\n" in query_code.strip()
    title = "Expression(s) provided" if multi_line else "Expression provided"

    if pipe:
        print(f"Error executing Polars expression: {error}", file=sys.stderr)
        print(file=sys.stderr)
        separator = "\n" if multi_line else " "
        print(f"{title}:{separator}{query_code}", file=sys.stderr)
        return

    console.print(f"[red]Error executing Polars expression:[/red] {error}")
    console.print()
    console.print(
        Panel(
            query_code,
            title=title,
            border_style="red",
            expand=False,
            title_align="left",
        )
    )


def _pl_run_query(query_code: str, namespace: dict[str, Any]) -> Any:
    """
    Execute user code in `namespace` and return the value to display.

    The code is parsed once. Statements are executed; a trailing bare expression (if any) is
    evaluated so that its value can serve as the result. Returns `_PL_NO_RESULT` when nothing
    usable was produced.

    Resolution order for multi-statement code:
      1. a variable named 'result', 'df', 'data', 'table' or 'output'
      2. the value of the trailing expression, when it is frame-like
      3. the most recently created frame-like variable
      4. the most recently created variable of any kind
    """
    import ast
    import textwrap

    source = textwrap.dedent(query_code).strip()
    module = ast.parse(source, filename="<pb pl>", mode="exec")

    trailing_expr = None
    if module.body and isinstance(module.body[-1], ast.Expr):
        trailing_expr = module.body.pop()
    has_statements = bool(module.body)

    # NOTE: exec()/eval() run arbitrary code by design; the code comes from the
    # person invoking the CLI. Never feed this function untrusted input.
    if has_statements:
        exec(compile(module, "<pb pl>", "exec"), namespace)

    trailing_value = None
    if trailing_expr is not None:
        trailing_value = eval(
            compile(ast.Expression(body=trailing_expr.value), "<pb pl>", "eval"), namespace
        )
        if not has_statements:
            # A lone expression: its value is the result, whatever its type
            return trailing_value

    # 1. Conventional result names (first name present wins)
    for name in _PL_RESULT_NAMES:
        if name in namespace:
            if namespace[name] is not None:
                return namespace[name]
            break

    # 2. Trailing expression, as promised by the command's help text
    if _pl_is_frame(trailing_value):
        return trailing_value

    user_vars = {
        key: value
        for key, value in namespace.items()
        if key not in _PL_RESERVED_NAMES and not key.startswith("_")
    }

    # 3. Latest frame-like variable (dicts keep insertion order)
    for value in reversed(list(user_vars.values())):
        if _pl_is_frame(value):
            return value

    # 4. Last variable created (heuristic; validated by the caller)
    if user_vars:
        last_value = list(user_vars.values())[-1]
        if last_value is not None:
            return last_value

    return _PL_NO_RESULT


@cli.command()
@click.argument("polars_expression", type=str, required=False)
@click.option("--edit", "-e", is_flag=True, help="Open editor for multi-line input")
@click.option("--file", "-f", type=click.Path(exists=True), help="Read query from file")
@click.option(
    "--editor", help="Editor to use for --edit mode (overrides $EDITOR and auto-detection)"
)
@click.option(
    "--output-format",
    "-o",
    type=click.Choice(["preview", "scan", "missing", "info"]),
    default="preview",
    help="Output format for the result",
)
@click.option("--preview-head", default=5, help="Number of head rows for preview")
@click.option("--preview-tail", default=5, help="Number of tail rows for preview")
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
@click.option(
    "--pipe", is_flag=True, help="Output data in a format suitable for piping to other pb commands"
)
@click.option(
    "--pipe-format",
    type=click.Choice(["parquet", "csv"]),
    default="parquet",
    help="Format for piped output (default: parquet)",
)
def pl(
    polars_expression: str | None,
    edit: bool,
    file: str | None,
    editor: str | None,
    output_format: str,
    preview_head: int,
    preview_tail: int,
    output_html: str | None,
    pipe: bool,
    pipe_format: str,
):
    """
    Execute Polars expressions and display results.

    Execute Polars DataFrame operations from the command line and display
    the results using Pointblank's visualization tools.

    POLARS_EXPRESSION should be a valid Polars expression that returns a DataFrame.
    The 'pl' module is automatically imported and available.

    Examples:

    \b
    # Direct expression
    pb pl "pl.read_csv('data.csv')"
    pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
    pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"

    \b
    # Multi-line with editor (supports multiple statements)
    pb pl --edit

    \b
    # Multi-statement code example in editor:
    # csv = pl.read_csv('data.csv')
    # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)

    \b
    # Multi-line with a specific editor
    pb pl --edit --editor nano
    pb pl --edit --editor code
    pb pl --edit --editor micro

    \b
    # From file
    pb pl --file query.py

    \b
    # Piping to other pb commands
    pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)" --pipe | pb validate --check rows-distinct
    pb pl --edit --pipe | pb preview --head 10
    pb pl --edit --pipe | pb scan --output-html report.html
    pb pl --edit --pipe | pb missing --output-html missing_report.html

    Use --output-format to change how results are displayed:

    \b
    pb pl "pl.read_csv('data.csv')" --output-format scan
    pb pl "pl.read_csv('data.csv')" --output-format missing
    pb pl "pl.read_csv('data.csv')" --output-format info

    Note: For multi-statement code, assign your final result to a variable like
    'result', 'df', 'data', or ensure it's the last expression.
    """
    try:
        # Check if Polars is available
        if not _is_lib_present("polars"):
            if pipe:
                print("Error: Polars is not installed", file=sys.stderr)
                print("Install it with: pip install polars", file=sys.stderr)
            else:
                console.print("[red]Error:[/red] Polars is not installed")
                console.print("\nThe 'pb pl' command requires Polars to be installed.")
                console.print("Install it with: [cyan]pip install polars[/cyan]")
                console.print(
                    "\nTo check all dependency status, run: [cyan]pb requirements[/cyan]"
                )
            sys.exit(1)

        import polars as pl

        # Determine the source of the query: --file, --edit, argument, then stdin
        query_code = None

        if file:
            query_code = Path(file).read_text(encoding="utf-8")
        elif edit:
            chosen_editor = editor or _get_best_editor()

            # When piping, keep stdout clean and send the notice to stderr
            if pipe:
                print(f"Using editor: {chosen_editor}", file=sys.stderr)
            else:
                console.print(f"[dim]Using editor: {chosen_editor}[/dim]")

            if chosen_editor == "code":
                # VS Code needs `--wait` handling, which click.edit() does not provide
                query_code = _edit_with_vscode()
            else:
                query_code = click.edit(
                    "# Enter your Polars query here\n"
                    "# Example:\n"
                    "# pl.read_csv('data.csv').select(['name', 'age'])\n"
                    "# pl.read_csv('data.csv').filter(pl.col('age') > 25)\n"
                    "# \n"
                    "# The result should be a Polars DataFrame or LazyFrame\n"
                    "\n",
                    editor=chosen_editor,
                )

            if query_code is None:
                if pipe:
                    print("No query entered", file=sys.stderr)
                else:
                    console.print("[yellow]No query entered[/yellow]")
                sys.exit(1)
        elif polars_expression:
            query_code = polars_expression
        elif not sys.stdin.isatty():
            # Query text is being piped in
            query_code = sys.stdin.read().strip()
        else:
            # No input provided and stdin is a terminal: show concise help
            _show_concise_help("pl", None)
            return

        if not query_code or not query_code.strip():
            _pl_emit_error(pipe, "Empty query")
            sys.exit(1)

        # Execute the query
        with console.status("[bold green]Executing Polars expression..."):
            namespace = {
                "pl": pl,
                "polars": pl,
                "__builtins__": __builtins__,
            }

            try:
                result = _pl_run_query(query_code, namespace)
            except Exception as e:
                _pl_report_execution_error(pipe, query_code, e)
                sys.exit(1)

            if result is _PL_NO_RESULT:
                _pl_emit_error(
                    pipe,
                    "Could not find result variable",
                    "Assign your final result to a variable like 'result', 'df', or 'data'",
                    "Or ensure your last line returns a DataFrame",
                )
                sys.exit(1)

            if not _pl_is_frame(result):
                _pl_emit_error(
                    pipe,
                    "Expression must return a Polars DataFrame or LazyFrame",
                    f"Got: {type(result)}",
                )
                sys.exit(1)

        # Only print success message when not piping (so it doesn't interfere with pipe output)
        if not pipe:
            console.print("[green]✓[/green] Polars expression executed successfully")

        # Process output
        if pipe:
            _handle_pl_pipe(result, pipe_format)
        elif output_format == "preview":
            _handle_pl_preview(result, preview_head, preview_tail, output_html)
        elif output_format == "scan":
            _handle_pl_scan(result, query_code, output_html)
        elif output_format == "missing":
            _handle_pl_missing(result, query_code, output_html)
        elif output_format == "info":
            _handle_pl_info(result, query_code, output_html)

    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception and passes through
        _pl_emit_error(pipe, str(e))
        sys.exit(1)
4742
+
4743
+
4744
def _handle_pl_preview(result: Any, head: int, tail: int, output_html: str | None) -> None:
    """
    Render a head/tail preview of a Polars result.

    With `output_html` set, the preview table is written to that file as raw HTML.
    Otherwise it is printed to the terminal, enriched with row/column metadata when
    that metadata can be obtained. Any failure is reported and exits with status 1.
    """
    try:
        preview = pb.preview(
            data=result,
            n_head=head,
            n_tail=tail,
            show_row_numbers=True,
        )

        if output_html:
            Path(output_html).write_text(preview.as_raw_html(), encoding="utf-8")
            console.print(f"[green]✓[/green] HTML saved to: {output_html}")
            return

        try:
            n_rows = pb.get_row_count(result)
            n_cols = pb.get_column_count(result)
            kind = _get_tbl_type(result)

            _rich_print_gt_table(
                preview,
                {
                    "total_rows": n_rows,
                    "total_columns": n_cols,
                    "head_rows": head,
                    "tail_rows": tail,
                    "is_complete": n_rows <= (head + tail),
                    "source_type": "Polars expression",
                    "table_type": kind,
                },
            )
        except Exception:
            # Metadata is optional: fall back to the plain table
            _rich_print_gt_table(preview)

    except Exception as err:
        console.print(f"[red]Error creating preview:[/red] {err}")
        sys.exit(1)
4784
+
4785
+
4786
def _handle_pl_scan(result: Any, expression: str, output_html: str | None) -> None:
    """
    Produce a column-summary scan of a Polars result.

    The scan is saved as raw HTML when `output_html` is given; otherwise it is printed
    to the terminal with `expression` shown as the data source. A failure while printing
    is reported as a warning; a failure while building the scan exits with status 1.
    """
    try:
        summary = pb.col_summary_tbl(data=result)

        if output_html:
            Path(output_html).write_text(summary.as_raw_html(), encoding="utf-8")
            console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")
            return

        try:
            n_rows = pb.get_row_count(result)
            n_cols = pb.get_column_count(result)
            kind = _get_tbl_type(result)

            _rich_print_scan_table(summary, expression, "Polars expression", kind, n_rows, n_cols)
        except Exception as display_err:
            console.print(f"[yellow]Could not display scan summary: {display_err}[/yellow]")

    except Exception as err:
        console.print(f"[red]Error creating scan:[/red] {err}")
        sys.exit(1)
4816
+
4817
+
4818
def _handle_pl_missing(result: Any, expression: str, output_html: str | None) -> None:
    """
    Produce a missing-values report for a Polars result.

    The report is saved as raw HTML when `output_html` is given, otherwise printed to
    the terminal. `expression` is accepted for signature parity with the other
    `_handle_pl_*` helpers and is not used. Any failure exits with status 1.
    """
    try:
        report = pb.missing_vals_tbl(data=result)

        if not output_html:
            _rich_print_missing_table(report, result)
            return

        Path(output_html).write_text(report.as_raw_html(), encoding="utf-8")
        console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")

    except Exception as err:
        console.print(f"[red]Error creating missing values report:[/red] {err}")
        sys.exit(1)
4833
+
4834
+
4835
def _handle_pl_info(result: Any, expression: str, output_html: str | None) -> None:
    """
    Show basic information (table type, dimensions, column dtypes) for a Polars result.

    With `output_html` set, a small standalone HTML page is written to that path.
    Otherwise a summary table plus the first ten column dtypes are printed.

    All user-influenced text (the expression, column names, dtype strings) is escaped
    before being embedded: HTML-escaped for the page, markup-escaped for Rich, so that
    characters such as `<`, `>` or `[...]` are shown literally.

    Any failure is reported and exits with status 1.
    """
    try:
        # Get basic info
        tbl_type = _get_tbl_type(result)
        row_count = pb.get_row_count(result)
        col_count = pb.get_column_count(result)

        # Get column names and types
        if hasattr(result, "columns"):
            columns = list(result.columns)
        elif hasattr(result, "schema"):
            columns = list(result.schema.names)
        else:
            columns = []

        dtypes_dict = _get_column_dtypes(result, columns)

        if output_html:
            from html import escape as html_escape

            # TODO: Implement an improved version of this in the Python API and then
            # use that here
            column_items = "".join(
                f"<li>{html_escape(str(col))}: {html_escape(str(dtypes_dict.get(col, '?')))}</li>"
                for col in columns
            )
            html_content = f"""
<html><body>
<h2>Polars Expression Info</h2>
<p><strong>Expression:</strong> {html_escape(expression)}</p>
<p><strong>Table Type:</strong> {html_escape(str(tbl_type))}</p>
<p><strong>Rows:</strong> {row_count:,}</p>
<p><strong>Columns:</strong> {col_count:,}</p>
<h3>Column Details</h3>
<ul>
{column_items}
</ul>
</body></html>
"""
            Path(output_html).write_text(html_content, encoding="utf-8")
            console.print(f"[green]✓[/green] HTML info saved to: {output_html}")
        else:
            # Display info table
            from rich.box import SIMPLE_HEAD
            from rich.markup import escape as rich_escape

            info_table = Table(
                title="Polars Expression Info",
                show_header=True,
                header_style="bold magenta",
                box=SIMPLE_HEAD,
                title_style="bold cyan",
                title_justify="left",
            )
            info_table.add_column("Property", style="cyan", no_wrap=True)
            info_table.add_column("Value", style="green")

            # Escape so that e.g. `df[cols]` is not parsed as a Rich style tag
            info_table.add_row("Expression", rich_escape(expression))
            # Capitalize "polars" to "Polars" for consistency with pb info command
            display_tbl_type = (
                tbl_type.replace("polars", "Polars") if "polars" in tbl_type.lower() else tbl_type
            )
            info_table.add_row("Table Type", display_tbl_type)
            info_table.add_row("Rows", f"{row_count:,}")
            info_table.add_row("Columns", f"{col_count:,}")

            console.print()
            console.print(info_table)

            # Show column details
            if columns:
                console.print("\n[bold cyan]Column Details:[/bold cyan]")
                for col in columns[:10]:  # Show first 10 columns
                    dtype = dtypes_dict.get(col, "?")
                    console.print(
                        f" • {rich_escape(str(col))}: [yellow]{rich_escape(str(dtype))}[/yellow]"
                    )

                if len(columns) > 10:
                    console.print(f" ... and {len(columns) - 10} more columns")

    except Exception as e:
        console.print(f"[red]Error creating info:[/red] {e}")
        sys.exit(1)
4912
+
4913
+
4914
+ def _handle_pl_pipe(result: Any, pipe_format: str) -> None:
4915
+ """Handle piped output from Polars results."""
4916
+ try:
4917
+ import sys
4918
+ import tempfile
4919
+
4920
+ # Create a temporary file to store the data
4921
+ with tempfile.NamedTemporaryFile(
4922
+ mode="w", suffix=f".{pipe_format}", prefix="pb_pipe_", delete=False
4923
+ ) as temp_file:
4924
+ temp_path = temp_file.name
4925
+
4926
+ # Write the data to the temporary file
4927
+ if pipe_format == "parquet":
4928
+ if hasattr(result, "write_parquet"):
4929
+ # Polars
4930
+ result.write_parquet(temp_path)
4931
+ elif hasattr(result, "to_parquet"):
4932
+ # Pandas
4933
+ result.to_parquet(temp_path)
4934
+ else:
4935
+ # Convert to pandas and write
4936
+ import pandas as pd
4937
+
4938
+ pd_result = pd.DataFrame(result)
4939
+ pd_result.to_parquet(temp_path)
4940
+ else: # CSV
4941
+ if hasattr(result, "write_csv"):
4942
+ # Polars
4943
+ result.write_csv(temp_path)
4944
+ elif hasattr(result, "to_csv"):
4945
+ # Pandas
4946
+ result.to_csv(temp_path, index=False)
4947
+ else:
4948
+ # Convert to pandas and write
4949
+ import pandas as pd
4950
+
4951
+ pd_result = pd.DataFrame(result)
4952
+ pd_result.to_csv(temp_path, index=False)
4953
+
4954
+ # Output the temporary file path to stdout for the next command
4955
+ print(temp_path)
4956
+
4957
+ except Exception as e:
4958
+ print(f"[red]Error creating pipe output:[/red] {e}", file=sys.stderr)
4959
+ sys.exit(1)
4960
+
4961
+
4962
+ def _get_best_editor() -> str:
4963
+ """Detect the best available editor on the system."""
4964
+
4965
+ # Check environment variable first
4966
+ if "EDITOR" in os.environ:
4967
+ return os.environ["EDITOR"]
4968
+
4969
+ # Check for common editors in order of preference
4970
+ editors = [
4971
+ "code", # VS Code
4972
+ "micro", # Modern terminal editor
4973
+ "nano", # User-friendly terminal editor
4974
+ "vim", # Vim
4975
+ "vi", # Vi (fallback)
4976
+ ]
4977
+
4978
+ for editor in editors:
4979
+ if shutil.which(editor):
4980
+ return editor
4981
+
4982
+ # Ultimate fallback
4983
+ return "nano"
4984
+
4985
+
4986
def _edit_with_vscode() -> str | None:
    """
    Edit a Polars query in VS Code and return the code the user wrote.

    A temporary `.py` file pre-filled with instructions is opened with `code --wait`;
    the call blocks until the file is closed in VS Code (at most 300 seconds). Comment
    lines, blank lines and lines starting with `import polars` are then removed, because
    `pl` is already available when the query is executed.

    Returns
    -------
    str | None
        The remaining code joined by newlines, or `None` when nothing is left, VS Code
        timed out, or the `code` executable could not be found. The temporary file is
        always deleted.
    """
    import subprocess
    import tempfile

    template = (
        "import polars as pl\n"
        "\n"
        "# Enter your Polars query here\n"
        "# Examples:\n"
        "# \n"
        "# Single expression:\n"
        "# pl.read_csv('data.csv').select(['name', 'age'])\n"
        "# \n"
        "# Multiple statements:\n"
        "# csv = pl.read_csv('data.csv')\n"
        "# result = csv.select(['name', 'age']).filter(pl.col('age') > 25)\n"
        "# \n"
        "# For multi-statement code, assign your final result to a variable\n"
        "# like 'result', 'df', 'data', or just ensure it's the last line\n"
        "# \n"
        "# Save and then close this file in VS Code to execute the query\n"
        "\n"
    )

    # Create a temporary Python file (explicit encoding so write and read agree)
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", prefix="pb_pl_", delete=False, encoding="utf-8"
    ) as f:
        f.write(template)
        temp_file = f.name

    try:
        # Resolve through PATH (and PATHEXT on Windows, where the launcher is
        # presumably `code.cmd`); fall back to the bare name if lookup fails
        code_executable = shutil.which("code") or "code"

        # Open in VS Code and wait for it to close
        result = subprocess.run(
            [code_executable, "--wait", temp_file], capture_output=True, text=True, timeout=300
        )

        if result.returncode != 0:
            console.print(f"[yellow]VS Code exited with code {result.returncode}[/yellow]")

        # Read the edited content
        content = Path(temp_file).read_text(encoding="utf-8")

        # Remove comments, empty lines, and import statements for cleaner execution
        lines = []
        for line in content.split("\n"):
            stripped = line.strip()
            if stripped and not stripped.startswith(("#", "import polars")):
                lines.append(line)

        return "\n".join(lines) if lines else None

    except subprocess.TimeoutExpired:
        console.print("[red]Timeout:[/red] VS Code took too long to respond")
        return None
    except FileNotFoundError:
        console.print("[red]Error:[/red] VS Code not found in PATH")
        return None
    finally:
        # Clean up
        Path(temp_file).unlink(missing_ok=True)
5051
+
5052
+
5053
# Shared lines of the concise help screens
_HELP_USAGE = "[bold yellow]Usage:[/bold yellow]"
_HELP_KEY_OPTIONS = "[bold yellow]Key Options:[/bold yellow]"

# Concise help text per command; `None` stands for an empty line
_CONCISE_HELP = {
    "info": (
        "[bold cyan]pb info[/bold cyan] - Display information about a data source",
        None,
        _HELP_USAGE,
        " pb info data.csv",
        " pb info small_table",
        None,
        "[dim]Shows table type, dimensions, column names, and data types[/dim]",
        None,
        "[dim]Use [bold]pb info --help[/bold] for complete options and examples[/dim]",
    ),
    "preview": (
        "[bold cyan]pb preview[/bold cyan] - Preview a data table showing head and tail rows",
        None,
        _HELP_USAGE,
        " pb preview data.csv",
        " pb preview data.parquet --head 10 --tail 5",
        None,
        _HELP_KEY_OPTIONS,
        " --head N Number of rows from the top (default: 5)",
        " --tail N Number of rows from the bottom (default: 5)",
        " --columns LIST Comma-separated list of columns to display",
        " --output-html Save HTML output to file",
        None,
        "[dim]Use [bold]pb preview --help[/bold] for complete options and examples[/dim]",
    ),
    "scan": (
        "[bold cyan]pb scan[/bold cyan] - Generate a comprehensive data profile report",
        None,
        _HELP_USAGE,
        " pb scan data.csv",
        " pb scan data.parquet --output-html report.html",
        None,
        _HELP_KEY_OPTIONS,
        " --output-html Save HTML scan report to file",
        " --columns LIST Comma-separated list of columns to scan",
        None,
        "[dim]Use [bold]pb scan --help[/bold] for complete options and examples[/dim]",
    ),
    "missing": (
        "[bold cyan]pb missing[/bold cyan] - Generate a missing values report",
        None,
        _HELP_USAGE,
        " pb missing data.csv",
        " pb missing data.parquet --output-html missing_report.html",
        None,
        _HELP_KEY_OPTIONS,
        " --output-html Save HTML output to file",
        None,
        "[dim]Use [bold]pb missing --help[/bold] for complete options and examples[/dim]",
    ),
    "validate": (
        "[bold cyan]pb validate[/bold cyan] - Perform data validation checks",
        None,
        _HELP_USAGE,
        " pb validate data.csv",
        " pb validate data.csv --check col-vals-not-null --column email",
        None,
        _HELP_KEY_OPTIONS,
        " --check TYPE Validation check type (default: rows-distinct)",
        " --column COL Column name for column-specific checks",
        " --show-extract Show failing rows if validation fails",
        " --list-checks List all available validation checks",
        None,
        "[dim]Use [bold]pb validate --help[/bold] for complete options and examples[/dim]",
    ),
    "run": (
        "[bold cyan]pb run[/bold cyan] - Run a Pointblank validation script",
        None,
        _HELP_USAGE,
        " pb run validation_script.py",
        " pb run validation_script.py --data data.csv",
        None,
        _HELP_KEY_OPTIONS,
        " --data SOURCE Replace data source in validation objects",
        " --output-html Save HTML validation report to file",
        " --show-extract Show failing rows if validation fails",
        " --fail-on LEVEL Exit with error on critical/error/warning/any",
        None,
        "[dim]Use [bold]pb run --help[/bold] for complete options and examples[/dim]",
    ),
    "make-template": (
        "[bold cyan]pb make-template[/bold cyan] - Create a validation script template",
        None,
        _HELP_USAGE,
        " pb make-template my_validation.py",
        " pb make-template validation_template.py",
        None,
        "[dim]Creates a sample Python script with validation examples[/dim]",
        "[dim]Edit the template and run with [bold]pb run[/bold][/dim]",
        None,
        "[dim]Use [bold]pb make-template --help[/bold] for complete options and examples[/dim]",
    ),
    "pl": (
        "[bold cyan]pb pl[/bold cyan] - Execute Polars expressions and display results",
        None,
        _HELP_USAGE,
        " pb pl \"pl.read_csv('data.csv')\"",
        " pb pl --edit",
        None,
        _HELP_KEY_OPTIONS,
        " --edit Open editor for multi-line input",
        " --file FILE Read query from file",
        " --output-format Output format: preview, scan, missing, info",
        " --pipe Output for piping to other pb commands",
        None,
        "[dim]Use [bold]pb pl --help[/bold] for complete options and examples[/dim]",
    ),
}


def _show_concise_help(command_name: str, ctx: click.Context | None) -> None:
    """
    Show concise help for a command when required arguments are missing.

    Parameters
    ----------
    command_name
        CLI command name (`"info"`, `"preview"`, `"scan"`, `"missing"`, `"validate"`,
        `"run"`, `"make-template"` or `"pl"`). Nothing is printed for any other name.
    ctx
        The active click context, or `None` when the caller has none at hand.

    This function does not return normally: it always terminates with exit status 1,
    through `ctx.exit(1)` when a context is given and `sys.exit(1)` otherwise.
    """
    for line in _CONCISE_HELP.get(command_name, ()):
        if line is None:
            console.print()
        else:
            console.print(line)

    if ctx is not None:
        ctx.exit(1)
    else:
        sys.exit(1)