pointblank 0.11.1__py3-none-any.whl → 0.11.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/cli.py +1842 -298
- {pointblank-0.11.1.dist-info → pointblank-0.11.3.dist-info}/METADATA +60 -15
- {pointblank-0.11.1.dist-info → pointblank-0.11.3.dist-info}/RECORD +7 -7
- {pointblank-0.11.1.dist-info → pointblank-0.11.3.dist-info}/WHEEL +0 -0
- {pointblank-0.11.1.dist-info → pointblank-0.11.3.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.1.dist-info → pointblank-0.11.3.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.1.dist-info → pointblank-0.11.3.dist-info}/top_level.txt +0 -0
pointblank/cli.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import copy
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
3
6
|
import sys
|
|
4
7
|
from pathlib import Path
|
|
5
8
|
from typing import Any
|
|
@@ -31,6 +34,8 @@ class OrderedGroup(click.Group):
|
|
|
31
34
|
"validate",
|
|
32
35
|
"run",
|
|
33
36
|
"make-template",
|
|
37
|
+
# Data Manipulation
|
|
38
|
+
"pl",
|
|
34
39
|
# Utilities
|
|
35
40
|
"datasets",
|
|
36
41
|
"requirements",
|
|
@@ -90,6 +95,15 @@ def _load_data_source(data_source: str) -> Any:
|
|
|
90
95
|
return _process_data(data_source)
|
|
91
96
|
|
|
92
97
|
|
|
98
|
+
def _is_piped_data_source(data_source: str) -> bool:
|
|
99
|
+
"""Check if the data source is from a piped pb command."""
|
|
100
|
+
return (
|
|
101
|
+
data_source
|
|
102
|
+
and ("pb_pipe_" in data_source)
|
|
103
|
+
and (data_source.startswith("/var/folders/") or data_source.startswith("/tmp/"))
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
|
|
93
107
|
def _format_cell_value(
|
|
94
108
|
value: Any, is_row_number: bool = False, max_width: int = 50, num_columns: int = 10
|
|
95
109
|
) -> str:
|
|
@@ -274,7 +288,7 @@ def _format_dtype_compact(dtype_str: str) -> str:
|
|
|
274
288
|
elif "str" in dtype_str:
|
|
275
289
|
return "str"
|
|
276
290
|
|
|
277
|
-
# Unknown or complex types
|
|
291
|
+
# Unknown or complex types: truncate if too long
|
|
278
292
|
elif len(dtype_str) > 8:
|
|
279
293
|
return dtype_str[:8] + "…"
|
|
280
294
|
else:
|
|
@@ -395,7 +409,7 @@ def _rich_print_scan_table(
|
|
|
395
409
|
# Clean up HTML formatting from the raw data
|
|
396
410
|
str_val = str(value)
|
|
397
411
|
|
|
398
|
-
# Handle multi-line values with <br> tags FIRST
|
|
412
|
+
# Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
|
|
399
413
|
if "<br>" in str_val:
|
|
400
414
|
str_val = str_val.split("<br>")[0].strip()
|
|
401
415
|
# For unique values, we want just the integer part
|
|
@@ -414,14 +428,14 @@ def _rich_print_scan_table(
|
|
|
414
428
|
# Clean up extra whitespace
|
|
415
429
|
str_val = re.sub(r"\s+", " ", str_val).strip()
|
|
416
430
|
|
|
417
|
-
# Handle values like "2<.01"
|
|
431
|
+
# Handle values like "2<.01": extract the first number
|
|
418
432
|
if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
|
|
419
433
|
# Extract number before the < symbol
|
|
420
434
|
before_lt = str_val.split("<")[0].strip()
|
|
421
435
|
if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
|
|
422
436
|
str_val = before_lt
|
|
423
437
|
|
|
424
|
-
# Handle boolean unique values like "T0.62F0.38"
|
|
438
|
+
# Handle boolean unique values like "T0.62F0.38": extract the more readable format
|
|
425
439
|
if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
|
|
426
440
|
# Extract T and F values
|
|
427
441
|
t_match = re.search(r"T(\d+\.\d+)", str_val)
|
|
@@ -451,7 +465,7 @@ def _rich_print_scan_table(
|
|
|
451
465
|
# Simple integers under 10000
|
|
452
466
|
return str(int(num_val))
|
|
453
467
|
elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
|
|
454
|
-
# Likely dates in YYYYMMDD format
|
|
468
|
+
# Likely dates in YYYYMMDD format: format as date-like
|
|
455
469
|
int_val = int(num_val)
|
|
456
470
|
if 19000101 <= int_val <= 29991231: # Reasonable date range
|
|
457
471
|
str_date = str(int_val)
|
|
@@ -463,29 +477,29 @@ def _rich_print_scan_table(
|
|
|
463
477
|
# Otherwise treat as large number
|
|
464
478
|
return f"{num_val / 1000000:.1f}M"
|
|
465
479
|
elif abs(num_val) >= 1000000:
|
|
466
|
-
# Large numbers
|
|
480
|
+
# Large numbers: use scientific notation or M/k notation
|
|
467
481
|
|
|
468
482
|
if abs(num_val) >= 1000000000:
|
|
469
483
|
return f"{num_val:.1e}"
|
|
470
484
|
else:
|
|
471
485
|
return f"{num_val / 1000000:.1f}M"
|
|
472
486
|
elif abs(num_val) >= 10000:
|
|
473
|
-
# Numbers >= 10k
|
|
487
|
+
# Numbers >= 10k: use compact notation
|
|
474
488
|
return f"{num_val / 1000:.1f}k"
|
|
475
489
|
elif abs(num_val) >= 100:
|
|
476
|
-
# Numbers 100-9999
|
|
490
|
+
# Numbers 100-9999: show with minimal decimals
|
|
477
491
|
return f"{num_val:.1f}"
|
|
478
492
|
elif abs(num_val) >= 10:
|
|
479
|
-
# Numbers 10-99
|
|
493
|
+
# Numbers 10-99: show with one decimal
|
|
480
494
|
return f"{num_val:.1f}"
|
|
481
495
|
elif abs(num_val) >= 1:
|
|
482
|
-
# Numbers 1-9
|
|
496
|
+
# Numbers 1-9: show with two decimals
|
|
483
497
|
return f"{num_val:.2f}"
|
|
484
498
|
elif abs(num_val) >= 0.01:
|
|
485
|
-
# Small numbers
|
|
499
|
+
# Small numbers: show with appropriate precision
|
|
486
500
|
return f"{num_val:.2f}"
|
|
487
501
|
else:
|
|
488
|
-
# Very small numbers
|
|
502
|
+
# Very small numbers: use scientific notation
|
|
489
503
|
|
|
490
504
|
return f"{num_val:.1e}"
|
|
491
505
|
|
|
@@ -493,7 +507,7 @@ def _rich_print_scan_table(
|
|
|
493
507
|
# Not a number, handle as string
|
|
494
508
|
pass
|
|
495
509
|
|
|
496
|
-
# Handle date/datetime strings
|
|
510
|
+
# Handle date/datetime strings: show abbreviated format
|
|
497
511
|
if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
|
|
498
512
|
# Likely a date/datetime, show abbreviated
|
|
499
513
|
if len(str_val) > max_width:
|
|
@@ -557,9 +571,12 @@ def _rich_print_gt_table(
|
|
|
557
571
|
gt_table: The GT table object to display
|
|
558
572
|
preview_info: Optional dict with preview context info:
|
|
559
573
|
- total_rows: Total rows in the dataset
|
|
574
|
+
- total_columns: Total columns in the dataset
|
|
560
575
|
- head_rows: Number of head rows shown
|
|
561
576
|
- tail_rows: Number of tail rows shown
|
|
562
577
|
- is_complete: Whether the entire dataset is shown
|
|
578
|
+
- source_type: Type of data source (e.g., "External source: worldcities_new.csv")
|
|
579
|
+
- table_type: Type of table (e.g., "polars")
|
|
563
580
|
show_summary: Whether to show the row count summary at the bottom
|
|
564
581
|
"""
|
|
565
582
|
try:
|
|
@@ -592,6 +609,12 @@ def _rich_print_gt_table(
|
|
|
592
609
|
table_type = preview_info["table_type"]
|
|
593
610
|
table_title = f"Data Preview / {source_type} / {table_type}"
|
|
594
611
|
|
|
612
|
+
# Add dimensions subtitle in gray if available
|
|
613
|
+
total_rows = preview_info.get("total_rows")
|
|
614
|
+
total_columns = preview_info.get("total_columns")
|
|
615
|
+
if total_rows is not None and total_columns is not None:
|
|
616
|
+
table_title += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
617
|
+
|
|
595
618
|
rich_table = Table(
|
|
596
619
|
title=table_title,
|
|
597
620
|
show_header=True,
|
|
@@ -933,14 +956,19 @@ def _rich_print_gt_table(
|
|
|
933
956
|
|
|
934
957
|
|
|
935
958
|
def _display_validation_summary(validation: Any) -> None:
|
|
936
|
-
"""Display a validation summary in a Rich table format."""
|
|
959
|
+
"""Display a validation summary in a compact Rich table format."""
|
|
937
960
|
try:
|
|
938
961
|
# Try to get the summary from the validation report
|
|
939
962
|
if hasattr(validation, "validation_info") and validation.validation_info is not None:
|
|
940
963
|
# Use the validation_info to create a summary
|
|
941
964
|
info = validation.validation_info
|
|
942
965
|
n_steps = len(info)
|
|
943
|
-
|
|
966
|
+
|
|
967
|
+
# Count steps based on their threshold status
|
|
968
|
+
n_passed = sum(
|
|
969
|
+
1 for step in info if not step.warning and not step.error and not step.critical
|
|
970
|
+
)
|
|
971
|
+
n_all_passed = sum(1 for step in info if step.all_passed)
|
|
944
972
|
n_failed = n_steps - n_passed
|
|
945
973
|
|
|
946
974
|
# Calculate severity counts
|
|
@@ -950,64 +978,213 @@ def _display_validation_summary(validation: Any) -> None:
|
|
|
950
978
|
|
|
951
979
|
all_passed = n_failed == 0
|
|
952
980
|
|
|
953
|
-
# Determine highest severity
|
|
981
|
+
# Determine highest severity and its color
|
|
954
982
|
if n_critical > 0:
|
|
955
983
|
highest_severity = "critical"
|
|
984
|
+
severity_color = "red"
|
|
956
985
|
elif n_error > 0:
|
|
957
986
|
highest_severity = "error"
|
|
987
|
+
severity_color = "yellow"
|
|
958
988
|
elif n_warning > 0:
|
|
959
989
|
highest_severity = "warning"
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
990
|
+
severity_color = "bright_black" # gray
|
|
991
|
+
elif n_all_passed == n_steps:
|
|
992
|
+
# All steps passed AND all steps had 100% pass rate
|
|
963
993
|
highest_severity = "all passed"
|
|
994
|
+
severity_color = "bold green"
|
|
995
|
+
else:
|
|
996
|
+
# Steps passed (no threshold exceedances) but some had failing test units
|
|
997
|
+
highest_severity = "passed"
|
|
998
|
+
severity_color = "green"
|
|
999
|
+
|
|
1000
|
+
# Create compact summary header
|
|
1001
|
+
# Format: Steps: 6 / P: 3 (3 AP) / W: 3 / E: 0 / C: 0 / warning
|
|
1002
|
+
summary_header = (
|
|
1003
|
+
f"Steps: {n_steps} / P: {n_passed} ({n_all_passed} AP) / "
|
|
1004
|
+
f"W: {n_warning} / E: {n_error} / C: {n_critical} / "
|
|
1005
|
+
f"[{severity_color}]{highest_severity}[/{severity_color}]"
|
|
1006
|
+
)
|
|
964
1007
|
|
|
965
|
-
#
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
# Add summary statistics
|
|
971
|
-
table.add_row("Total Steps", str(n_steps))
|
|
972
|
-
table.add_row("Passing Steps", str(n_passed))
|
|
973
|
-
table.add_row("Failing Steps", str(n_failed))
|
|
974
|
-
table.add_row("Warning Steps", str(n_warning))
|
|
975
|
-
table.add_row("Error Steps", str(n_error))
|
|
976
|
-
table.add_row("Critical Steps", str(n_critical))
|
|
977
|
-
table.add_row("All Passed", str(all_passed))
|
|
978
|
-
table.add_row("Highest Severity", highest_severity)
|
|
979
|
-
|
|
980
|
-
console.print(table)
|
|
1008
|
+
# Print the report title and summary
|
|
1009
|
+
console.print()
|
|
1010
|
+
console.print("[blue]Validation Report[/blue]")
|
|
1011
|
+
console.print(f"[white]{summary_header}[/white]")
|
|
981
1012
|
|
|
982
1013
|
# Display step details
|
|
983
1014
|
if n_steps > 0:
|
|
1015
|
+
from rich.box import SIMPLE_HEAD
|
|
1016
|
+
|
|
984
1017
|
steps_table = Table(
|
|
985
|
-
|
|
1018
|
+
show_header=True,
|
|
1019
|
+
header_style="bold cyan",
|
|
1020
|
+
box=SIMPLE_HEAD,
|
|
986
1021
|
)
|
|
987
|
-
steps_table.add_column("
|
|
988
|
-
steps_table.add_column("
|
|
1022
|
+
steps_table.add_column("", style="dim")
|
|
1023
|
+
steps_table.add_column("Step", style="white")
|
|
989
1024
|
steps_table.add_column("Column", style="cyan")
|
|
990
|
-
steps_table.add_column("
|
|
991
|
-
steps_table.add_column("
|
|
1025
|
+
steps_table.add_column("Values", style="yellow")
|
|
1026
|
+
steps_table.add_column("Units", style="blue")
|
|
1027
|
+
steps_table.add_column("Pass", style="green")
|
|
1028
|
+
steps_table.add_column("Fail", style="red")
|
|
1029
|
+
steps_table.add_column("W", style="bright_black")
|
|
1030
|
+
steps_table.add_column("E", style="yellow")
|
|
1031
|
+
steps_table.add_column("C", style="red")
|
|
1032
|
+
steps_table.add_column("Ext", style="blue", justify="center")
|
|
1033
|
+
|
|
1034
|
+
def format_units(n: int) -> str:
|
|
1035
|
+
"""Format large numbers with K, M, B abbreviations for values above 10,000."""
|
|
1036
|
+
if n is None:
|
|
1037
|
+
return "—"
|
|
1038
|
+
if n >= 1000000000: # Billions
|
|
1039
|
+
return f"{n / 1000000000:.1f}B"
|
|
1040
|
+
elif n >= 1000000: # Millions
|
|
1041
|
+
return f"{n / 1000000:.1f}M"
|
|
1042
|
+
elif n >= 10000: # Use K for 10,000 and above
|
|
1043
|
+
return f"{n / 1000:.0f}K"
|
|
1044
|
+
else:
|
|
1045
|
+
return str(n)
|
|
1046
|
+
|
|
1047
|
+
def format_pass_fail(passed: int, total: int) -> str:
|
|
1048
|
+
"""Format pass/fail counts with abbreviated numbers and fractions."""
|
|
1049
|
+
if passed is None or total is None or total == 0:
|
|
1050
|
+
return "—/—"
|
|
1051
|
+
|
|
1052
|
+
# Calculate fraction
|
|
1053
|
+
fraction = passed / total
|
|
1054
|
+
|
|
1055
|
+
# Format fraction with special handling for very small and very large values
|
|
1056
|
+
if fraction == 0.0:
|
|
1057
|
+
fraction_str = "0.00"
|
|
1058
|
+
elif fraction == 1.0:
|
|
1059
|
+
fraction_str = "1.00"
|
|
1060
|
+
elif fraction < 0.005: # Less than 0.005 rounds to 0.00
|
|
1061
|
+
fraction_str = "<0.01"
|
|
1062
|
+
elif fraction > 0.995: # Greater than 0.995 rounds to 1.00
|
|
1063
|
+
fraction_str = ">0.99"
|
|
1064
|
+
else:
|
|
1065
|
+
fraction_str = f"{fraction:.2f}"
|
|
1066
|
+
|
|
1067
|
+
# Format absolute number with abbreviations
|
|
1068
|
+
absolute_str = format_units(passed)
|
|
1069
|
+
|
|
1070
|
+
return f"{absolute_str}/{fraction_str}"
|
|
992
1071
|
|
|
993
1072
|
for step in info:
|
|
994
|
-
|
|
995
|
-
|
|
1073
|
+
# Extract values information for the Values column
|
|
1074
|
+
values_str = "—" # Default to em dash if no values
|
|
1075
|
+
|
|
1076
|
+
# Handle different validation types
|
|
1077
|
+
if step.assertion_type == "col_schema_match":
|
|
1078
|
+
values_str = "—" # Schema is too complex to display inline
|
|
1079
|
+
elif step.assertion_type == "col_vals_between":
|
|
1080
|
+
# For between validations, try to get left and right bounds
|
|
1081
|
+
if (
|
|
1082
|
+
hasattr(step, "left")
|
|
1083
|
+
and hasattr(step, "right")
|
|
1084
|
+
and step.left is not None
|
|
1085
|
+
and step.right is not None
|
|
1086
|
+
):
|
|
1087
|
+
values_str = f"[{step.left}, {step.right}]"
|
|
1088
|
+
elif hasattr(step, "values") and step.values is not None:
|
|
1089
|
+
if isinstance(step.values, (list, tuple)) and len(step.values) >= 2:
|
|
1090
|
+
values_str = f"[{step.values[0]}, {step.values[1]}]"
|
|
1091
|
+
else:
|
|
1092
|
+
values_str = str(step.values)
|
|
1093
|
+
elif step.assertion_type in ["row_count_match", "col_count_match"]:
|
|
1094
|
+
# For count match validations, extract the 'count' value from the dictionary
|
|
1095
|
+
if hasattr(step, "values") and step.values is not None:
|
|
1096
|
+
if isinstance(step.values, dict) and "count" in step.values:
|
|
1097
|
+
values_str = str(step.values["count"])
|
|
1098
|
+
else:
|
|
1099
|
+
values_str = str(step.values)
|
|
1100
|
+
else:
|
|
1101
|
+
values_str = "—"
|
|
1102
|
+
elif step.assertion_type in ["col_vals_expr", "conjointly"]:
|
|
1103
|
+
values_str = "COLUMN EXPR"
|
|
1104
|
+
elif step.assertion_type == "specially":
|
|
1105
|
+
values_str = "EXPR"
|
|
1106
|
+
elif hasattr(step, "values") and step.values is not None:
|
|
1107
|
+
if isinstance(step.values, (list, tuple)):
|
|
1108
|
+
if len(step.values) <= 3:
|
|
1109
|
+
values_str = ", ".join(str(v) for v in step.values)
|
|
1110
|
+
else:
|
|
1111
|
+
values_str = f"{', '.join(str(v) for v in step.values[:3])}..."
|
|
1112
|
+
else:
|
|
1113
|
+
values_str = str(step.values)
|
|
1114
|
+
elif hasattr(step, "value") and step.value is not None:
|
|
1115
|
+
values_str = str(step.value)
|
|
1116
|
+
elif hasattr(step, "set") and step.set is not None:
|
|
1117
|
+
if isinstance(step.set, (list, tuple)):
|
|
1118
|
+
if len(step.set) <= 3:
|
|
1119
|
+
values_str = ", ".join(str(v) for v in step.set)
|
|
1120
|
+
else:
|
|
1121
|
+
values_str = f"{', '.join(str(v) for v in step.set[:3])}..."
|
|
1122
|
+
else:
|
|
1123
|
+
values_str = str(step.set)
|
|
1124
|
+
|
|
1125
|
+
# Determine threshold status for W, E, C columns
|
|
1126
|
+
# Check if thresholds are set and whether they were exceeded
|
|
1127
|
+
|
|
1128
|
+
# Warning threshold
|
|
1129
|
+
if (
|
|
1130
|
+
hasattr(step, "thresholds")
|
|
1131
|
+
and step.thresholds
|
|
1132
|
+
and hasattr(step.thresholds, "warning")
|
|
1133
|
+
and step.thresholds.warning is not None
|
|
1134
|
+
):
|
|
1135
|
+
w_status = (
|
|
1136
|
+
"[bright_black]●[/bright_black]"
|
|
1137
|
+
if step.warning
|
|
1138
|
+
else "[bright_black]○[/bright_black]"
|
|
1139
|
+
)
|
|
1140
|
+
else:
|
|
1141
|
+
w_status = "—"
|
|
1142
|
+
|
|
1143
|
+
# Error threshold
|
|
1144
|
+
if (
|
|
1145
|
+
hasattr(step, "thresholds")
|
|
1146
|
+
and step.thresholds
|
|
1147
|
+
and hasattr(step.thresholds, "error")
|
|
1148
|
+
and step.thresholds.error is not None
|
|
1149
|
+
):
|
|
1150
|
+
e_status = "[yellow]●[/yellow]" if step.error else "[yellow]○[/yellow]"
|
|
1151
|
+
else:
|
|
1152
|
+
e_status = "—"
|
|
1153
|
+
|
|
1154
|
+
# Critical threshold
|
|
1155
|
+
if (
|
|
1156
|
+
hasattr(step, "thresholds")
|
|
1157
|
+
and step.thresholds
|
|
1158
|
+
and hasattr(step.thresholds, "critical")
|
|
1159
|
+
and step.thresholds.critical is not None
|
|
1160
|
+
):
|
|
1161
|
+
c_status = "[red]●[/red]" if step.critical else "[red]○[/red]"
|
|
1162
|
+
else:
|
|
1163
|
+
c_status = "—"
|
|
996
1164
|
|
|
997
|
-
|
|
998
|
-
if
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1165
|
+
# Extract status, here we check if the step has any extract data
|
|
1166
|
+
if (
|
|
1167
|
+
hasattr(step, "extract")
|
|
1168
|
+
and step.extract is not None
|
|
1169
|
+
and hasattr(step.extract, "__len__")
|
|
1170
|
+
and len(step.extract) > 0
|
|
1171
|
+
):
|
|
1172
|
+
ext_status = "[blue]✓[/blue]"
|
|
1173
|
+
else:
|
|
1174
|
+
ext_status = "[bright_black]—[/bright_black]"
|
|
1004
1175
|
|
|
1005
1176
|
steps_table.add_row(
|
|
1006
1177
|
str(step.i),
|
|
1007
1178
|
step.assertion_type,
|
|
1008
1179
|
str(step.column) if step.column else "—",
|
|
1009
|
-
|
|
1010
|
-
|
|
1180
|
+
values_str,
|
|
1181
|
+
format_units(step.n),
|
|
1182
|
+
format_pass_fail(step.n_passed, step.n),
|
|
1183
|
+
format_pass_fail(step.n - step.n_passed, step.n),
|
|
1184
|
+
w_status,
|
|
1185
|
+
e_status,
|
|
1186
|
+
c_status,
|
|
1187
|
+
ext_status,
|
|
1011
1188
|
)
|
|
1012
1189
|
|
|
1013
1190
|
console.print(steps_table)
|
|
@@ -1015,18 +1192,32 @@ def _display_validation_summary(validation: Any) -> None:
|
|
|
1015
1192
|
# Display status with appropriate color
|
|
1016
1193
|
if highest_severity == "all passed":
|
|
1017
1194
|
console.print(
|
|
1018
|
-
Panel(
|
|
1195
|
+
Panel(
|
|
1196
|
+
"[green]✓ All validations passed![/green]",
|
|
1197
|
+
border_style="green",
|
|
1198
|
+
expand=False,
|
|
1199
|
+
)
|
|
1019
1200
|
)
|
|
1020
|
-
elif highest_severity == "
|
|
1201
|
+
elif highest_severity == "passed":
|
|
1021
1202
|
console.print(
|
|
1022
|
-
Panel(
|
|
1203
|
+
Panel(
|
|
1204
|
+
"[dim green]⚠ Some steps had failing test units[/dim green]",
|
|
1205
|
+
border_style="dim green",
|
|
1206
|
+
expand=False,
|
|
1207
|
+
)
|
|
1023
1208
|
)
|
|
1024
1209
|
elif highest_severity in ["warning", "error", "critical"]:
|
|
1025
|
-
|
|
1210
|
+
if highest_severity == "warning":
|
|
1211
|
+
color = "bright_black" # gray
|
|
1212
|
+
elif highest_severity == "error":
|
|
1213
|
+
color = "yellow"
|
|
1214
|
+
else: # critical
|
|
1215
|
+
color = "red"
|
|
1026
1216
|
console.print(
|
|
1027
1217
|
Panel(
|
|
1028
1218
|
f"[{color}]✗ Validation failed with {highest_severity} severity[/{color}]",
|
|
1029
1219
|
border_style=color,
|
|
1220
|
+
expand=False,
|
|
1030
1221
|
)
|
|
1031
1222
|
)
|
|
1032
1223
|
else:
|
|
@@ -1040,20 +1231,31 @@ def _display_validation_summary(validation: Any) -> None:
|
|
|
1040
1231
|
|
|
1041
1232
|
|
|
1042
1233
|
@click.group(cls=OrderedGroup)
|
|
1043
|
-
@click.version_option(
|
|
1234
|
+
@click.version_option(pb.__version__, "-v", "--version", prog_name="pb")
|
|
1235
|
+
@click.help_option("-h", "--help")
|
|
1044
1236
|
def cli():
|
|
1045
1237
|
"""
|
|
1046
|
-
Pointblank CLI
|
|
1238
|
+
Pointblank CLI: Data validation and quality tools for data engineers.
|
|
1239
|
+
|
|
1240
|
+
Use this CLI to validate data quality, explore datasets, and generate comprehensive
|
|
1241
|
+
reports for CSV, Parquet, and database sources. Suitable for data pipelines, ETL
|
|
1242
|
+
validation, and exploratory data analysis from the command line.
|
|
1243
|
+
|
|
1244
|
+
Quick Examples:
|
|
1047
1245
|
|
|
1048
|
-
|
|
1049
|
-
|
|
1246
|
+
\b
|
|
1247
|
+
pb preview data.csv Preview your data
|
|
1248
|
+
pb scan data.csv Generate data profile
|
|
1249
|
+
pb validate data.csv Run basic validation
|
|
1250
|
+
|
|
1251
|
+
Use pb COMMAND --help for detailed help on any command.
|
|
1050
1252
|
"""
|
|
1051
1253
|
pass
|
|
1052
1254
|
|
|
1053
1255
|
|
|
1054
1256
|
@cli.command()
|
|
1055
|
-
@click.argument("data_source", type=str)
|
|
1056
|
-
def info(data_source: str):
|
|
1257
|
+
@click.argument("data_source", type=str, required=False)
|
|
1258
|
+
def info(data_source: str | None):
|
|
1057
1259
|
"""
|
|
1058
1260
|
Display information about a data source.
|
|
1059
1261
|
|
|
@@ -1069,6 +1271,11 @@ def info(data_source: str):
|
|
|
1069
1271
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1070
1272
|
"""
|
|
1071
1273
|
try:
|
|
1274
|
+
# Handle missing data_source with concise help
|
|
1275
|
+
if data_source is None:
|
|
1276
|
+
_show_concise_help("info", None)
|
|
1277
|
+
return
|
|
1278
|
+
|
|
1072
1279
|
with console.status("[bold green]Loading data..."):
|
|
1073
1280
|
# Load the data source using the centralized function
|
|
1074
1281
|
data = _load_data_source(data_source)
|
|
@@ -1107,21 +1314,21 @@ def info(data_source: str):
|
|
|
1107
1314
|
|
|
1108
1315
|
|
|
1109
1316
|
@cli.command()
|
|
1110
|
-
@click.argument("data_source", type=str)
|
|
1111
|
-
@click.option("--columns",
|
|
1317
|
+
@click.argument("data_source", type=str, required=False)
|
|
1318
|
+
@click.option("--columns", help="Comma-separated list of columns to display")
|
|
1112
1319
|
@click.option("--col-range", help="Column range like '1:10' or '5:' or ':15' (1-based indexing)")
|
|
1113
1320
|
@click.option("--col-first", type=int, help="Show first N columns")
|
|
1114
1321
|
@click.option("--col-last", type=int, help="Show last N columns")
|
|
1115
|
-
@click.option("--head",
|
|
1116
|
-
@click.option("--tail",
|
|
1117
|
-
@click.option("--limit",
|
|
1322
|
+
@click.option("--head", default=5, help="Number of rows from the top (default: 5)")
|
|
1323
|
+
@click.option("--tail", default=5, help="Number of rows from the bottom (default: 5)")
|
|
1324
|
+
@click.option("--limit", default=50, help="Maximum total rows to display (default: 50)")
|
|
1118
1325
|
@click.option("--no-row-numbers", is_flag=True, help="Hide row numbers")
|
|
1119
1326
|
@click.option("--max-col-width", default=250, help="Maximum column width in pixels (default: 250)")
|
|
1120
1327
|
@click.option("--min-table-width", default=500, help="Minimum table width in pixels (default: 500)")
|
|
1121
1328
|
@click.option("--no-header", is_flag=True, help="Hide table header")
|
|
1122
1329
|
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
|
|
1123
1330
|
def preview(
|
|
1124
|
-
data_source: str,
|
|
1331
|
+
data_source: str | None,
|
|
1125
1332
|
columns: str | None,
|
|
1126
1333
|
col_range: str | None,
|
|
1127
1334
|
col_first: int | None,
|
|
@@ -1146,6 +1353,7 @@ def preview(
|
|
|
1146
1353
|
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1147
1354
|
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1148
1355
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1356
|
+
- Piped data from pb pl command
|
|
1149
1357
|
|
|
1150
1358
|
COLUMN SELECTION OPTIONS:
|
|
1151
1359
|
|
|
@@ -1160,11 +1368,52 @@ def preview(
|
|
|
1160
1368
|
Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
|
|
1161
1369
|
"""
|
|
1162
1370
|
try:
|
|
1371
|
+
import sys
|
|
1372
|
+
|
|
1373
|
+
# Handle piped input
|
|
1374
|
+
if data_source is None:
|
|
1375
|
+
if not sys.stdin.isatty():
|
|
1376
|
+
# Data is being piped in - read the file path from stdin
|
|
1377
|
+
piped_input = sys.stdin.read().strip()
|
|
1378
|
+
if piped_input:
|
|
1379
|
+
data_source = piped_input
|
|
1380
|
+
|
|
1381
|
+
# Determine the format from the file extension
|
|
1382
|
+
if piped_input.endswith(".parquet"):
|
|
1383
|
+
format_type = "Parquet"
|
|
1384
|
+
elif piped_input.endswith(".csv"):
|
|
1385
|
+
format_type = "CSV"
|
|
1386
|
+
else:
|
|
1387
|
+
format_type = "unknown"
|
|
1388
|
+
|
|
1389
|
+
console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
|
|
1390
|
+
else:
|
|
1391
|
+
console.print("[red]Error:[/red] No data provided via pipe")
|
|
1392
|
+
sys.exit(1)
|
|
1393
|
+
else:
|
|
1394
|
+
# Show concise help and exit
|
|
1395
|
+
_show_concise_help("preview", None)
|
|
1396
|
+
return
|
|
1397
|
+
|
|
1163
1398
|
with console.status("[bold green]Loading data..."):
|
|
1164
1399
|
# Load the data source using the centralized function
|
|
1165
1400
|
data = _load_data_source(data_source)
|
|
1166
1401
|
|
|
1167
|
-
|
|
1402
|
+
# Check if this is a piped data source and create friendly display name
|
|
1403
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
1404
|
+
|
|
1405
|
+
if is_piped_data:
|
|
1406
|
+
if data_source.endswith(".parquet"):
|
|
1407
|
+
display_source = "Parquet file via `pb pl`"
|
|
1408
|
+
elif data_source.endswith(".csv"):
|
|
1409
|
+
display_source = "CSV file via `pb pl`"
|
|
1410
|
+
else:
|
|
1411
|
+
display_source = "File via `pb pl`"
|
|
1412
|
+
console.print(
|
|
1413
|
+
f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
|
|
1414
|
+
)
|
|
1415
|
+
else:
|
|
1416
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1168
1417
|
|
|
1169
1418
|
# Parse columns if provided
|
|
1170
1419
|
columns_list = None
|
|
@@ -1186,7 +1435,7 @@ def preview(
|
|
|
1186
1435
|
# If _row_num_ exists in data but not in user selection, add it at beginning
|
|
1187
1436
|
if all_columns and "_row_num_" in all_columns and "_row_num_" not in columns_list:
|
|
1188
1437
|
columns_list = ["_row_num_"] + columns_list
|
|
1189
|
-
except Exception:
|
|
1438
|
+
except Exception:
|
|
1190
1439
|
# If we can't process the data, just use the user's column list as-is
|
|
1191
1440
|
pass
|
|
1192
1441
|
elif col_range or col_first or col_last:
|
|
@@ -1261,7 +1510,14 @@ def preview(
|
|
|
1261
1510
|
total_dataset_columns = pb.get_column_count(processed_data)
|
|
1262
1511
|
|
|
1263
1512
|
# Determine source type and table type for enhanced preview title
|
|
1264
|
-
if
|
|
1513
|
+
if is_piped_data:
|
|
1514
|
+
if data_source.endswith(".parquet"):
|
|
1515
|
+
source_type = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
1516
|
+
elif data_source.endswith(".csv"):
|
|
1517
|
+
source_type = "Polars expression (serialized to CSV) from `pb pl`"
|
|
1518
|
+
else:
|
|
1519
|
+
source_type = "Polars expression from `pb pl`"
|
|
1520
|
+
elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
1265
1521
|
source_type = f"Pointblank dataset: {data_source}"
|
|
1266
1522
|
else:
|
|
1267
1523
|
source_type = f"External source: {data_source}"
|
|
@@ -1311,17 +1567,17 @@ def preview(
|
|
|
1311
1567
|
|
|
1312
1568
|
_rich_print_gt_table(gt_table, preview_info)
|
|
1313
1569
|
|
|
1314
|
-
except Exception as e:
|
|
1570
|
+
except Exception as e:
|
|
1315
1571
|
console.print(f"[red]Error:[/red] {e}")
|
|
1316
|
-
sys.exit(1)
|
|
1572
|
+
sys.exit(1)
|
|
1317
1573
|
|
|
1318
1574
|
|
|
1319
1575
|
@cli.command()
|
|
1320
|
-
@click.argument("data_source", type=str)
|
|
1576
|
+
@click.argument("data_source", type=str, required=False)
|
|
1321
1577
|
@click.option("--output-html", type=click.Path(), help="Save HTML scan report to file")
|
|
1322
1578
|
@click.option("--columns", "-c", help="Comma-separated list of columns to scan")
|
|
1323
1579
|
def scan(
|
|
1324
|
-
data_source: str,
|
|
1580
|
+
data_source: str | None,
|
|
1325
1581
|
output_html: str | None,
|
|
1326
1582
|
columns: str | None,
|
|
1327
1583
|
):
|
|
@@ -1344,17 +1600,58 @@ def scan(
|
|
|
1344
1600
|
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1345
1601
|
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1346
1602
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1603
|
+
- Piped data from pb pl command
|
|
1347
1604
|
"""
|
|
1348
1605
|
try:
|
|
1606
|
+
import sys
|
|
1349
1607
|
import time
|
|
1350
1608
|
|
|
1351
1609
|
start_time = time.time()
|
|
1352
1610
|
|
|
1611
|
+
# Handle piped input
|
|
1612
|
+
if data_source is None:
|
|
1613
|
+
if not sys.stdin.isatty():
|
|
1614
|
+
# Data is being piped in - read the file path from stdin
|
|
1615
|
+
piped_input = sys.stdin.read().strip()
|
|
1616
|
+
if piped_input:
|
|
1617
|
+
data_source = piped_input
|
|
1618
|
+
|
|
1619
|
+
# Determine the format from the file extension
|
|
1620
|
+
if piped_input.endswith(".parquet"):
|
|
1621
|
+
format_type = "Parquet"
|
|
1622
|
+
elif piped_input.endswith(".csv"):
|
|
1623
|
+
format_type = "CSV"
|
|
1624
|
+
else:
|
|
1625
|
+
format_type = "unknown"
|
|
1626
|
+
|
|
1627
|
+
console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
|
|
1628
|
+
else:
|
|
1629
|
+
console.print("[red]Error:[/red] No data provided via pipe")
|
|
1630
|
+
sys.exit(1)
|
|
1631
|
+
else:
|
|
1632
|
+
# Show concise help and exit
|
|
1633
|
+
_show_concise_help("scan", None)
|
|
1634
|
+
return
|
|
1635
|
+
|
|
1353
1636
|
with console.status("[bold green]Loading data..."):
|
|
1354
1637
|
# Load the data source using the centralized function
|
|
1355
1638
|
data = _load_data_source(data_source)
|
|
1356
1639
|
|
|
1357
|
-
|
|
1640
|
+
# Check if this is a piped data source and create friendly display name
|
|
1641
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
1642
|
+
|
|
1643
|
+
if is_piped_data:
|
|
1644
|
+
if data_source.endswith(".parquet"):
|
|
1645
|
+
display_source = "Parquet file via `pb pl`"
|
|
1646
|
+
elif data_source.endswith(".csv"):
|
|
1647
|
+
display_source = "CSV file via `pb pl`"
|
|
1648
|
+
else:
|
|
1649
|
+
display_source = "File via `pb pl`"
|
|
1650
|
+
console.print(
|
|
1651
|
+
f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
|
|
1652
|
+
)
|
|
1653
|
+
else:
|
|
1654
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1358
1655
|
|
|
1359
1656
|
# Parse columns if provided
|
|
1360
1657
|
columns_list = None
|
|
@@ -1367,7 +1664,15 @@ def scan(
|
|
|
1367
1664
|
# Data is already processed by _load_data_source
|
|
1368
1665
|
scan_result = pb.col_summary_tbl(data=data)
|
|
1369
1666
|
|
|
1370
|
-
|
|
1667
|
+
# Create friendly source type for display
|
|
1668
|
+
if is_piped_data:
|
|
1669
|
+
if data_source.endswith(".parquet"):
|
|
1670
|
+
source_type = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
1671
|
+
elif data_source.endswith(".csv"):
|
|
1672
|
+
source_type = "Polars expression (serialized to CSV) from `pb pl`"
|
|
1673
|
+
else:
|
|
1674
|
+
source_type = "Polars expression from `pb pl`"
|
|
1675
|
+
elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
1371
1676
|
source_type = f"Pointblank dataset: {data_source}"
|
|
1372
1677
|
else:
|
|
1373
1678
|
source_type = f"External source: {data_source}"
|
|
@@ -1399,7 +1704,12 @@ def scan(
|
|
|
1399
1704
|
# Display detailed column summary using rich formatting
|
|
1400
1705
|
try:
|
|
1401
1706
|
_rich_print_scan_table(
|
|
1402
|
-
scan_result,
|
|
1707
|
+
scan_result,
|
|
1708
|
+
display_source if is_piped_data else data_source,
|
|
1709
|
+
source_type,
|
|
1710
|
+
table_type,
|
|
1711
|
+
total_rows,
|
|
1712
|
+
total_columns,
|
|
1403
1713
|
)
|
|
1404
1714
|
|
|
1405
1715
|
except Exception as e:
|
|
@@ -1411,9 +1721,9 @@ def scan(
|
|
|
1411
1721
|
|
|
1412
1722
|
|
|
1413
1723
|
@cli.command()
|
|
1414
|
-
@click.argument("data_source", type=str)
|
|
1724
|
+
@click.argument("data_source", type=str, required=False)
|
|
1415
1725
|
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
|
|
1416
|
-
def missing(data_source: str, output_html: str | None):
|
|
1726
|
+
def missing(data_source: str | None, output_html: str | None):
|
|
1417
1727
|
"""
|
|
1418
1728
|
Generate a missing values report for a data table.
|
|
1419
1729
|
|
|
@@ -1425,13 +1735,55 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1425
1735
|
- GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv)
|
|
1426
1736
|
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1427
1737
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1738
|
+
- Piped data from pb pl command
|
|
1428
1739
|
"""
|
|
1429
1740
|
try:
|
|
1741
|
+
import sys
|
|
1742
|
+
|
|
1743
|
+
# Handle piped input
|
|
1744
|
+
if data_source is None:
|
|
1745
|
+
if not sys.stdin.isatty():
|
|
1746
|
+
# Data is being piped in - read the file path from stdin
|
|
1747
|
+
piped_input = sys.stdin.read().strip()
|
|
1748
|
+
if piped_input:
|
|
1749
|
+
data_source = piped_input
|
|
1750
|
+
|
|
1751
|
+
# Determine the format from the file extension
|
|
1752
|
+
if piped_input.endswith(".parquet"):
|
|
1753
|
+
format_type = "Parquet"
|
|
1754
|
+
elif piped_input.endswith(".csv"):
|
|
1755
|
+
format_type = "CSV"
|
|
1756
|
+
else:
|
|
1757
|
+
format_type = "unknown"
|
|
1758
|
+
|
|
1759
|
+
console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
|
|
1760
|
+
else:
|
|
1761
|
+
console.print("[red]Error:[/red] No data provided via pipe")
|
|
1762
|
+
sys.exit(1)
|
|
1763
|
+
else:
|
|
1764
|
+
# Show concise help and exit
|
|
1765
|
+
_show_concise_help("missing", None)
|
|
1766
|
+
return
|
|
1767
|
+
|
|
1430
1768
|
with console.status("[bold green]Loading data..."):
|
|
1431
1769
|
# Load the data source using the centralized function
|
|
1432
1770
|
data = _load_data_source(data_source)
|
|
1433
1771
|
|
|
1434
|
-
|
|
1772
|
+
# Check if this is a piped data source and create friendly display name
|
|
1773
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
1774
|
+
|
|
1775
|
+
if is_piped_data:
|
|
1776
|
+
if data_source.endswith(".parquet"):
|
|
1777
|
+
display_source = "Parquet file via `pb pl`"
|
|
1778
|
+
elif data_source.endswith(".csv"):
|
|
1779
|
+
display_source = "CSV file via `pb pl`"
|
|
1780
|
+
else:
|
|
1781
|
+
display_source = "File via `pb pl`"
|
|
1782
|
+
console.print(
|
|
1783
|
+
f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
|
|
1784
|
+
)
|
|
1785
|
+
else:
|
|
1786
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1435
1787
|
|
|
1436
1788
|
# Generate missing values table
|
|
1437
1789
|
with console.status("[bold green]Analyzing missing values..."):
|
|
@@ -1447,7 +1799,38 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1447
1799
|
console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
|
|
1448
1800
|
else:
|
|
1449
1801
|
# Display in terminal with special missing values formatting
|
|
1450
|
-
|
|
1802
|
+
# Create enhanced context info for missing table display
|
|
1803
|
+
missing_info = {}
|
|
1804
|
+
try:
|
|
1805
|
+
# Determine source type and table type for enhanced preview title
|
|
1806
|
+
if is_piped_data:
|
|
1807
|
+
if data_source.endswith(".parquet"):
|
|
1808
|
+
source_type = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
1809
|
+
elif data_source.endswith(".csv"):
|
|
1810
|
+
source_type = "Polars expression (serialized to CSV) from `pb pl`"
|
|
1811
|
+
else:
|
|
1812
|
+
source_type = "Polars expression from `pb pl`"
|
|
1813
|
+
elif data_source in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
1814
|
+
source_type = f"Pointblank dataset: {data_source}"
|
|
1815
|
+
else:
|
|
1816
|
+
source_type = f"External source: {data_source}"
|
|
1817
|
+
|
|
1818
|
+
missing_info = {
|
|
1819
|
+
"source_type": source_type,
|
|
1820
|
+
"table_type": _get_tbl_type(original_data),
|
|
1821
|
+
"total_rows": pb.get_row_count(original_data),
|
|
1822
|
+
"total_columns": pb.get_column_count(original_data),
|
|
1823
|
+
}
|
|
1824
|
+
except Exception:
|
|
1825
|
+
# Use defaults if metadata extraction fails
|
|
1826
|
+
missing_info = {
|
|
1827
|
+
"source_type": f"Data source: {data_source}",
|
|
1828
|
+
"table_type": "unknown",
|
|
1829
|
+
"total_rows": None,
|
|
1830
|
+
"total_columns": None,
|
|
1831
|
+
}
|
|
1832
|
+
|
|
1833
|
+
_rich_print_missing_table_enhanced(gt_table, original_data, missing_info)
|
|
1451
1834
|
|
|
1452
1835
|
except Exception as e:
|
|
1453
1836
|
console.print(f"[red]Error:[/red] {e}")
|
|
@@ -1455,10 +1838,11 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1455
1838
|
|
|
1456
1839
|
|
|
1457
1840
|
@cli.command(name="validate")
|
|
1458
|
-
@click.argument("data_source", type=str)
|
|
1841
|
+
@click.argument("data_source", type=str, required=False)
|
|
1842
|
+
@click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
|
|
1459
1843
|
@click.option(
|
|
1460
1844
|
"--check",
|
|
1461
|
-
"checks",
|
|
1845
|
+
"checks",
|
|
1462
1846
|
type=click.Choice(
|
|
1463
1847
|
[
|
|
1464
1848
|
"rows-distinct",
|
|
@@ -1472,25 +1856,25 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1472
1856
|
"col-vals-le",
|
|
1473
1857
|
]
|
|
1474
1858
|
),
|
|
1859
|
+
metavar="CHECK_TYPE",
|
|
1475
1860
|
multiple=True, # Allow multiple --check options
|
|
1476
1861
|
help="Type of validation check to perform. Can be used multiple times for multiple checks.",
|
|
1477
1862
|
)
|
|
1478
|
-
@click.option("--list-checks", is_flag=True, help="List available validation checks and exit")
|
|
1479
1863
|
@click.option(
|
|
1480
1864
|
"--column",
|
|
1481
|
-
"columns",
|
|
1865
|
+
"columns",
|
|
1482
1866
|
multiple=True, # Allow multiple --column options
|
|
1483
1867
|
help="Column name or integer position as #N (1-based index) for validation.",
|
|
1484
1868
|
)
|
|
1485
1869
|
@click.option(
|
|
1486
1870
|
"--set",
|
|
1487
|
-
"sets",
|
|
1871
|
+
"sets",
|
|
1488
1872
|
multiple=True, # Allow multiple --set options
|
|
1489
1873
|
help="Comma-separated allowed values for col-vals-in-set checks.",
|
|
1490
1874
|
)
|
|
1491
1875
|
@click.option(
|
|
1492
1876
|
"--value",
|
|
1493
|
-
"values",
|
|
1877
|
+
"values",
|
|
1494
1878
|
type=float,
|
|
1495
1879
|
multiple=True, # Allow multiple --value options
|
|
1496
1880
|
help="Numeric value for comparison checks.",
|
|
@@ -1502,17 +1886,17 @@ def missing(data_source: str, output_html: str | None):
|
|
|
1502
1886
|
"--write-extract", type=str, help="Save failing rows to folder. Provide base name for folder."
|
|
1503
1887
|
)
|
|
1504
1888
|
@click.option(
|
|
1505
|
-
"--limit",
|
|
1889
|
+
"--limit", default=500, help="Maximum number of failing rows to save to CSV (default: 500)"
|
|
1506
1890
|
)
|
|
1507
1891
|
@click.option("--exit-code", is_flag=True, help="Exit with non-zero code if validation fails")
|
|
1508
1892
|
@click.pass_context
|
|
1509
1893
|
def validate(
|
|
1510
1894
|
ctx: click.Context,
|
|
1511
|
-
data_source: str,
|
|
1512
|
-
checks: tuple[str, ...],
|
|
1513
|
-
columns: tuple[str, ...],
|
|
1514
|
-
sets: tuple[str, ...],
|
|
1515
|
-
values: tuple[float, ...],
|
|
1895
|
+
data_source: str | None,
|
|
1896
|
+
checks: tuple[str, ...],
|
|
1897
|
+
columns: tuple[str, ...],
|
|
1898
|
+
sets: tuple[str, ...],
|
|
1899
|
+
values: tuple[float, ...],
|
|
1516
1900
|
show_extract: bool,
|
|
1517
1901
|
write_extract: str | None,
|
|
1518
1902
|
limit: int,
|
|
@@ -1534,21 +1918,21 @@ def validate(
|
|
|
1534
1918
|
- Database connection string (e.g., duckdb:///path/to/db.ddb::table_name)
|
|
1535
1919
|
- Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales)
|
|
1536
1920
|
|
|
1537
|
-
AVAILABLE
|
|
1921
|
+
AVAILABLE CHECK_TYPES:
|
|
1538
1922
|
|
|
1539
1923
|
Use --list-checks to see all available validation methods with examples.
|
|
1540
1924
|
|
|
1541
|
-
The default
|
|
1925
|
+
The default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows.
|
|
1542
1926
|
|
|
1543
1927
|
\b
|
|
1544
1928
|
- rows-distinct: Check if all rows in the dataset are unique (no duplicates)
|
|
1545
1929
|
- rows-complete: Check if all rows are complete (no missing values in any column)
|
|
1546
1930
|
- col-exists: Check if a specific column exists in the dataset (requires --column)
|
|
1547
1931
|
- col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
|
|
1548
|
-
- col-vals-gt: Check if all values in a column are greater than a
|
|
1549
|
-
- col-vals-ge: Check if all values in a column are greater than or equal to a
|
|
1550
|
-
- col-vals-lt: Check if all values in a column are less than a
|
|
1551
|
-
- col-vals-le: Check if all values in a column are less than or equal to a
|
|
1932
|
+
- col-vals-gt: Check if all values in a column are greater than a comparison value (requires --column and --value)
|
|
1933
|
+
- col-vals-ge: Check if all values in a column are greater than or equal to a comparison value (requires --column and --value)
|
|
1934
|
+
- col-vals-lt: Check if all values in a column are less than a comparison value (requires --column and --value)
|
|
1935
|
+
- col-vals-le: Check if all values in a column are less than or equal to a comparison value (requires --column and --value)
|
|
1552
1936
|
- col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
|
|
1553
1937
|
|
|
1554
1938
|
Examples:
|
|
@@ -1571,28 +1955,9 @@ def validate(
|
|
|
1571
1955
|
pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
|
|
1572
1956
|
"""
|
|
1573
1957
|
try:
|
|
1574
|
-
# Handle backward compatibility and parameter conversion
|
|
1575
1958
|
import sys
|
|
1576
1959
|
|
|
1577
|
-
#
|
|
1578
|
-
if not checks:
|
|
1579
|
-
# No --check options provided, use default
|
|
1580
|
-
checks_list = ["rows-distinct"]
|
|
1581
|
-
is_using_default_check = True
|
|
1582
|
-
else:
|
|
1583
|
-
checks_list = list(checks)
|
|
1584
|
-
is_using_default_check = False
|
|
1585
|
-
|
|
1586
|
-
columns_list = list(columns) if columns else []
|
|
1587
|
-
sets_list = list(sets) if sets else []
|
|
1588
|
-
values_list = list(values) if values else []
|
|
1589
|
-
|
|
1590
|
-
# Map parameters to checks intelligently
|
|
1591
|
-
mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
|
|
1592
|
-
checks_list, columns_list, sets_list, values_list
|
|
1593
|
-
)
|
|
1594
|
-
|
|
1595
|
-
# Handle --list-checks option
|
|
1960
|
+
# Handle --list-checks option early (doesn't need data source)
|
|
1596
1961
|
if list_checks:
|
|
1597
1962
|
console.print("[bold bright_cyan]Available Validation Checks:[/bold bright_cyan]")
|
|
1598
1963
|
console.print()
|
|
@@ -1616,14 +1981,16 @@ def validate(
|
|
|
1616
1981
|
"[bold magenta]Value comparison checks [bright_black](require --column and --value)[/bright_black]:[/bold magenta]"
|
|
1617
1982
|
)
|
|
1618
1983
|
console.print(
|
|
1619
|
-
" • [bold cyan]col-vals-gt[/bold cyan] Values greater than
|
|
1984
|
+
" • [bold cyan]col-vals-gt[/bold cyan] Values greater than comparison value"
|
|
1620
1985
|
)
|
|
1621
1986
|
console.print(
|
|
1622
|
-
" • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to
|
|
1987
|
+
" • [bold cyan]col-vals-ge[/bold cyan] Values greater than or equal to comparison value"
|
|
1623
1988
|
)
|
|
1624
|
-
console.print(" • [bold cyan]col-vals-lt[/bold cyan] Values less than threshold")
|
|
1625
1989
|
console.print(
|
|
1626
|
-
" • [bold cyan]col-vals-
|
|
1990
|
+
" • [bold cyan]col-vals-lt[/bold cyan] Values less than comparison value"
|
|
1991
|
+
)
|
|
1992
|
+
console.print(
|
|
1993
|
+
" • [bold cyan]col-vals-le[/bold cyan] Values less than or equal to comparison value"
|
|
1627
1994
|
)
|
|
1628
1995
|
console.print()
|
|
1629
1996
|
console.print(
|
|
@@ -1634,19 +2001,65 @@ def validate(
|
|
|
1634
2001
|
)
|
|
1635
2002
|
console.print()
|
|
1636
2003
|
console.print("[bold bright_yellow]Examples:[/bold bright_yellow]")
|
|
2004
|
+
console.print(" [bright_blue]pb validate data.csv --check rows-distinct[/bright_blue]")
|
|
1637
2005
|
console.print(
|
|
1638
|
-
|
|
1639
|
-
)
|
|
1640
|
-
console.print(
|
|
1641
|
-
f" [bright_blue]pb validate {data_source} --check col-vals-not-null --column price[/bright_blue]"
|
|
2006
|
+
" [bright_blue]pb validate data.csv --check col-vals-not-null --column price[/bright_blue]"
|
|
1642
2007
|
)
|
|
1643
2008
|
console.print(
|
|
1644
|
-
|
|
2009
|
+
" [bright_blue]pb validate data.csv --check col-vals-gt --column age --value 18[/bright_blue]"
|
|
1645
2010
|
)
|
|
1646
2011
|
import sys
|
|
1647
2012
|
|
|
1648
2013
|
sys.exit(0)
|
|
1649
2014
|
|
|
2015
|
+
# Check if data_source is provided (required for all operations except --list-checks)
|
|
2016
|
+
# or if we have piped input
|
|
2017
|
+
if data_source is None:
|
|
2018
|
+
# Check if we have piped input
|
|
2019
|
+
if not sys.stdin.isatty():
|
|
2020
|
+
# Data is being piped in: read the file path from stdin
|
|
2021
|
+
piped_input = sys.stdin.read().strip()
|
|
2022
|
+
if piped_input:
|
|
2023
|
+
data_source = piped_input
|
|
2024
|
+
|
|
2025
|
+
# Determine the format from the file extension
|
|
2026
|
+
if piped_input.endswith(".parquet"):
|
|
2027
|
+
format_type = "Parquet"
|
|
2028
|
+
elif piped_input.endswith(".csv"):
|
|
2029
|
+
format_type = "CSV"
|
|
2030
|
+
else:
|
|
2031
|
+
format_type = "unknown"
|
|
2032
|
+
|
|
2033
|
+
console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
|
|
2034
|
+
else:
|
|
2035
|
+
console.print("[red]Error:[/red] No data provided via pipe")
|
|
2036
|
+
sys.exit(1)
|
|
2037
|
+
else:
|
|
2038
|
+
# Show concise help and exit
|
|
2039
|
+
_show_concise_help("validate", None)
|
|
2040
|
+
return
|
|
2041
|
+
|
|
2042
|
+
# Handle backward compatibility and parameter conversion
|
|
2043
|
+
import sys
|
|
2044
|
+
|
|
2045
|
+
# Convert parameter tuples to lists, handling default case
|
|
2046
|
+
if not checks:
|
|
2047
|
+
# No --check options provided, use default
|
|
2048
|
+
checks_list = ["rows-distinct"]
|
|
2049
|
+
is_using_default_check = True
|
|
2050
|
+
else:
|
|
2051
|
+
checks_list = list(checks)
|
|
2052
|
+
is_using_default_check = False
|
|
2053
|
+
|
|
2054
|
+
columns_list = list(columns) if columns else []
|
|
2055
|
+
sets_list = list(sets) if sets else []
|
|
2056
|
+
values_list = list(values) if values else []
|
|
2057
|
+
|
|
2058
|
+
# Map parameters to checks intelligently
|
|
2059
|
+
mapped_columns, mapped_sets, mapped_values = _map_parameters_to_checks(
|
|
2060
|
+
checks_list, columns_list, sets_list, values_list
|
|
2061
|
+
)
|
|
2062
|
+
|
|
1650
2063
|
# Validate required parameters for different check types
|
|
1651
2064
|
# Check parameters for each check in the list using mapped parameters
|
|
1652
2065
|
for i, check in enumerate(checks_list):
|
|
@@ -1732,7 +2145,25 @@ def validate(
|
|
|
1732
2145
|
checks_list, columns_list, sets_list, values_list
|
|
1733
2146
|
)
|
|
1734
2147
|
|
|
1735
|
-
|
|
2148
|
+
# Check if this is a piped data source and create friendly display name
|
|
2149
|
+
is_piped_data = (
|
|
2150
|
+
data_source
|
|
2151
|
+
and data_source.startswith("/var/folders/")
|
|
2152
|
+
and ("pb_pipe_" in data_source or "/T/" in data_source)
|
|
2153
|
+
)
|
|
2154
|
+
|
|
2155
|
+
if is_piped_data:
|
|
2156
|
+
if data_source.endswith(".parquet"):
|
|
2157
|
+
display_source = "Parquet file via `pb pl`"
|
|
2158
|
+
elif data_source.endswith(".csv"):
|
|
2159
|
+
display_source = "CSV file via `pb pl`"
|
|
2160
|
+
else:
|
|
2161
|
+
display_source = "File via `pb pl`"
|
|
2162
|
+
console.print(
|
|
2163
|
+
f"[green]✓[/green] Loaded data source: {display_source} ({data_source})"
|
|
2164
|
+
)
|
|
2165
|
+
else:
|
|
2166
|
+
console.print(f"[green]✓[/green] Loaded data source: {data_source}")
|
|
1736
2167
|
|
|
1737
2168
|
# Build a single validation object with chained checks
|
|
1738
2169
|
with console.status(f"[bold green]Running {len(checks_list)} validation check(s)..."):
|
|
@@ -1791,7 +2222,7 @@ def validate(
|
|
|
1791
2222
|
|
|
1792
2223
|
# Display results based on whether we have single or multiple checks
|
|
1793
2224
|
if len(checks_list) == 1:
|
|
1794
|
-
# Single check
|
|
2225
|
+
# Single check: use current display format
|
|
1795
2226
|
_display_validation_result(
|
|
1796
2227
|
validation,
|
|
1797
2228
|
checks_list,
|
|
@@ -1806,7 +2237,7 @@ def validate(
|
|
|
1806
2237
|
limit,
|
|
1807
2238
|
)
|
|
1808
2239
|
else:
|
|
1809
|
-
# Multiple checks
|
|
2240
|
+
# Multiple checks: use stacked display format
|
|
1810
2241
|
any_failed = False
|
|
1811
2242
|
for i in range(len(checks_list)):
|
|
1812
2243
|
console.print() # Add spacing between results
|
|
@@ -1845,7 +2276,7 @@ def validate(
|
|
|
1845
2276
|
console.print()
|
|
1846
2277
|
console.print("[bold magenta]Common validation options:[/bold magenta]")
|
|
1847
2278
|
console.print(
|
|
1848
|
-
" • [bold cyan]--check rows-complete[/bold cyan]
|
|
2279
|
+
" • [bold cyan]--check rows-complete[/bold cyan] Check for rows with missing values"
|
|
1849
2280
|
)
|
|
1850
2281
|
console.print(
|
|
1851
2282
|
" • [bold cyan]--check col-vals-not-null[/bold cyan] Check for null values in a column [bright_black](requires --column)[/bright_black]"
|
|
@@ -1955,81 +2386,284 @@ def requirements():
|
|
|
1955
2386
|
console.print("\n[dim]Install missing packages to enable additional functionality.[/dim]")
|
|
1956
2387
|
|
|
1957
2388
|
|
|
1958
|
-
def
|
|
1959
|
-
|
|
1960
|
-
data_source: str,
|
|
1961
|
-
source_type: str,
|
|
1962
|
-
table_type: str,
|
|
1963
|
-
total_rows: int | None = None,
|
|
1964
|
-
total_columns: int | None = None,
|
|
2389
|
+
def _rich_print_missing_table_enhanced(
|
|
2390
|
+
gt_table: Any, original_data: Any = None, missing_info: dict = None
|
|
1965
2391
|
) -> None:
|
|
1966
|
-
"""
|
|
1967
|
-
Display scan results as a Rich table in the terminal with statistical measures.
|
|
2392
|
+
"""Convert a missing values GT table to Rich table with enhanced formatting and metadata.
|
|
1968
2393
|
|
|
1969
2394
|
Args:
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
table_type: Type of table (e.g., "polars.LazyFrame")
|
|
1974
|
-
total_rows: Total number of rows in the dataset
|
|
1975
|
-
total_columns: Total number of columns in the dataset
|
|
2395
|
+
gt_table: The GT table object for missing values
|
|
2396
|
+
original_data: The original data source to extract column types
|
|
2397
|
+
missing_info: Dict with metadata including source_type, table_type, total_rows, total_columns
|
|
1976
2398
|
"""
|
|
1977
2399
|
try:
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
import narwhals as nw
|
|
1981
|
-
from rich.box import SIMPLE_HEAD
|
|
2400
|
+
# Extract the underlying data from the GT table
|
|
2401
|
+
df = None
|
|
1982
2402
|
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
|
|
2403
|
+
if hasattr(gt_table, "_tbl_data") and gt_table._tbl_data is not None:
|
|
2404
|
+
df = gt_table._tbl_data
|
|
2405
|
+
elif hasattr(gt_table, "_data") and gt_table._data is not None:
|
|
2406
|
+
df = gt_table._data
|
|
2407
|
+
elif hasattr(gt_table, "data") and gt_table.data is not None:
|
|
2408
|
+
df = gt_table.data
|
|
1986
2409
|
|
|
1987
|
-
|
|
1988
|
-
|
|
2410
|
+
if df is not None:
|
|
2411
|
+
from rich.box import SIMPLE_HEAD
|
|
1989
2412
|
|
|
1990
|
-
|
|
1991
|
-
|
|
2413
|
+
# Extract metadata from missing_info or use defaults
|
|
2414
|
+
source_type = "Data source"
|
|
2415
|
+
table_type = "unknown"
|
|
2416
|
+
total_rows = None
|
|
2417
|
+
total_columns = None
|
|
1992
2418
|
|
|
1993
|
-
|
|
1994
|
-
|
|
1995
|
-
|
|
2419
|
+
if missing_info:
|
|
2420
|
+
source_type = missing_info.get("source_type", "Data source")
|
|
2421
|
+
table_type = missing_info.get("table_type", "unknown")
|
|
2422
|
+
total_rows = missing_info.get("total_rows")
|
|
2423
|
+
total_columns = missing_info.get("total_columns")
|
|
1996
2424
|
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
2425
|
+
# Create enhanced title matching the scan table format
|
|
2426
|
+
title_text = f"Missing Values / {source_type} / {table_type}"
|
|
2000
2427
|
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
|
|
2004
|
-
show_header=True,
|
|
2005
|
-
header_style="bold magenta",
|
|
2006
|
-
box=SIMPLE_HEAD,
|
|
2007
|
-
title_style="bold cyan",
|
|
2008
|
-
title_justify="left",
|
|
2009
|
-
)
|
|
2428
|
+
# Add dimensions subtitle in gray if available
|
|
2429
|
+
if total_rows is not None and total_columns is not None:
|
|
2430
|
+
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
2010
2431
|
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
|
|
2017
|
-
|
|
2018
|
-
|
|
2019
|
-
|
|
2432
|
+
# Get column names
|
|
2433
|
+
columns = []
|
|
2434
|
+
try:
|
|
2435
|
+
if hasattr(df, "columns"):
|
|
2436
|
+
columns = list(df.columns)
|
|
2437
|
+
elif hasattr(df, "schema"):
|
|
2438
|
+
columns = list(df.schema.names)
|
|
2439
|
+
except Exception as e:
|
|
2440
|
+
console.print(f"[red]Error getting columns:[/red] {e}")
|
|
2441
|
+
columns = []
|
|
2020
2442
|
|
|
2021
|
-
|
|
2022
|
-
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
|
|
2031
|
-
|
|
2032
|
-
|
|
2443
|
+
if not columns:
|
|
2444
|
+
columns = [f"Column {i + 1}" for i in range(10)] # Fallback
|
|
2445
|
+
|
|
2446
|
+
# Get original data to extract column types
|
|
2447
|
+
column_types = {}
|
|
2448
|
+
if original_data is not None:
|
|
2449
|
+
try:
|
|
2450
|
+
# Get column types from original data
|
|
2451
|
+
if hasattr(original_data, "columns"):
|
|
2452
|
+
original_columns = list(original_data.columns)
|
|
2453
|
+
column_types = _get_column_dtypes(original_data, original_columns)
|
|
2454
|
+
except Exception as e:
|
|
2455
|
+
console.print(f"[red]Error getting column types:[/red] {e}")
|
|
2456
|
+
pass # Use empty dict as fallback
|
|
2457
|
+
|
|
2458
|
+
# Add columns to Rich table with special formatting for missing values table
|
|
2459
|
+
sector_columns = [col for col in columns if col != "columns" and col.isdigit()]
|
|
2460
|
+
|
|
2461
|
+
# Print the title first
|
|
2462
|
+
console.print()
|
|
2463
|
+
console.print(f"[bold cyan]{title_text}[/bold cyan]")
|
|
2464
|
+
|
|
2465
|
+
# Show the custom spanner header if we have sector columns
|
|
2466
|
+
if sector_columns:
|
|
2467
|
+
# Create a custom header line that shows the spanner
|
|
2468
|
+
header_parts = []
|
|
2469
|
+
header_parts.append(" " * 20) # Space for Column header
|
|
2470
|
+
header_parts.append(" " * 10) # Space for Type header
|
|
2471
|
+
|
|
2472
|
+
# Left-align "Row Sectors" with the first numbered column
|
|
2473
|
+
row_sectors_text = "Row Sectors"
|
|
2474
|
+
header_parts.append(row_sectors_text)
|
|
2475
|
+
|
|
2476
|
+
# Print the custom spanner header
|
|
2477
|
+
console.print("[dim]" + " ".join(header_parts) + "[/dim]")
|
|
2478
|
+
|
|
2479
|
+
# Add a horizontal rule below the spanner
|
|
2480
|
+
rule_parts = []
|
|
2481
|
+
rule_parts.append(" " * 20) # Space for Column header
|
|
2482
|
+
rule_parts.append(" " * 10) # Space for Type header
|
|
2483
|
+
|
|
2484
|
+
# Use a fixed width horizontal rule for "Row Sectors"
|
|
2485
|
+
horizontal_rule = "─" * 20
|
|
2486
|
+
rule_parts.append(horizontal_rule)
|
|
2487
|
+
|
|
2488
|
+
# Print the horizontal rule
|
|
2489
|
+
console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
|
|
2490
|
+
|
|
2491
|
+
# Create the missing values table WITHOUT the title (since we printed it above)
|
|
2492
|
+
rich_table = Table(
|
|
2493
|
+
show_header=True,
|
|
2494
|
+
header_style="bold magenta",
|
|
2495
|
+
box=SIMPLE_HEAD,
|
|
2496
|
+
)
|
|
2497
|
+
|
|
2498
|
+
# Two separate columns: Column name (20 chars) and Data type (10 chars)
|
|
2499
|
+
rich_table.add_column("Column", style="cyan", no_wrap=True, width=20)
|
|
2500
|
+
rich_table.add_column("Type", style="yellow", no_wrap=True, width=10)
|
|
2501
|
+
|
|
2502
|
+
# Sector columns: All same width, optimized for "100%" (4 chars + padding)
|
|
2503
|
+
for sector in sector_columns:
|
|
2504
|
+
rich_table.add_column(
|
|
2505
|
+
sector,
|
|
2506
|
+
style="cyan",
|
|
2507
|
+
justify="center",
|
|
2508
|
+
no_wrap=True,
|
|
2509
|
+
width=5, # Fixed width optimized for percentage values
|
|
2510
|
+
)
|
|
2511
|
+
|
|
2512
|
+
# Convert data to rows with special formatting
|
|
2513
|
+
rows = []
|
|
2514
|
+
try:
|
|
2515
|
+
if hasattr(df, "to_dicts"):
|
|
2516
|
+
data_dict = df.to_dicts()
|
|
2517
|
+
elif hasattr(df, "to_dict"):
|
|
2518
|
+
data_dict = df.to_dict("records")
|
|
2519
|
+
else:
|
|
2520
|
+
data_dict = []
|
|
2521
|
+
|
|
2522
|
+
for i, row in enumerate(data_dict):
|
|
2523
|
+
try:
|
|
2524
|
+
# Each row should have: [column_name, data_type, sector1, sector2, ...]
|
|
2525
|
+
column_name = str(row.get("columns", ""))
|
|
2526
|
+
|
|
2527
|
+
# Truncate column name to 20 characters with ellipsis if needed
|
|
2528
|
+
if len(column_name) > 20:
|
|
2529
|
+
truncated_name = column_name[:17] + "…"
|
|
2530
|
+
else:
|
|
2531
|
+
truncated_name = column_name
|
|
2532
|
+
|
|
2533
|
+
# Get data type for this column
|
|
2534
|
+
if column_name in column_types:
|
|
2535
|
+
dtype = column_types[column_name]
|
|
2536
|
+
if len(dtype) > 10:
|
|
2537
|
+
truncated_dtype = dtype[:9] + "…"
|
|
2538
|
+
else:
|
|
2539
|
+
truncated_dtype = dtype
|
|
2540
|
+
else:
|
|
2541
|
+
truncated_dtype = "?"
|
|
2542
|
+
|
|
2543
|
+
# Start building the row with column name and type
|
|
2544
|
+
formatted_row = [truncated_name, truncated_dtype]
|
|
2545
|
+
|
|
2546
|
+
# Add sector values (formatted percentages)
|
|
2547
|
+
for sector in sector_columns:
|
|
2548
|
+
value = row.get(sector, 0.0)
|
|
2549
|
+
if isinstance(value, (int, float)):
|
|
2550
|
+
formatted_row.append(_format_missing_percentage(float(value)))
|
|
2551
|
+
else:
|
|
2552
|
+
formatted_row.append(str(value))
|
|
2553
|
+
|
|
2554
|
+
rows.append(formatted_row)
|
|
2555
|
+
|
|
2556
|
+
except Exception as e:
|
|
2557
|
+
console.print(f"[red]Error processing row {i}:[/red] {e}")
|
|
2558
|
+
continue
|
|
2559
|
+
|
|
2560
|
+
except Exception as e:
|
|
2561
|
+
console.print(f"[red]Error extracting data:[/red] {e}")
|
|
2562
|
+
rows = [["Error extracting data", "?", *["" for _ in sector_columns]]]
|
|
2563
|
+
|
|
2564
|
+
# Add rows to Rich table
|
|
2565
|
+
for row in rows:
|
|
2566
|
+
try:
|
|
2567
|
+
rich_table.add_row(*row)
|
|
2568
|
+
except Exception as e:
|
|
2569
|
+
console.print(f"[red]Error adding row:[/red] {e}")
|
|
2570
|
+
break
|
|
2571
|
+
|
|
2572
|
+
# Print the Rich table (without title since we already printed it)
|
|
2573
|
+
console.print(rich_table)
|
|
2574
|
+
|
|
2575
|
+
footer_text = (
|
|
2576
|
+
"[dim]Symbols: [green]●[/green] = no missing vals in sector, "
|
|
2577
|
+
"[red]●[/red] = all vals completely missing, "
|
|
2578
|
+
"[cyan]x%[/cyan] = percentage missing[/dim]"
|
|
2579
|
+
)
|
|
2580
|
+
console.print(footer_text)
|
|
2581
|
+
|
|
2582
|
+
else:
|
|
2583
|
+
# Fallback to regular table display
|
|
2584
|
+
_rich_print_gt_table(gt_table)
|
|
2585
|
+
|
|
2586
|
+
except Exception as e:
|
|
2587
|
+
console.print(f"[red]Error rendering missing values table:[/red] {e}")
|
|
2588
|
+
# Fallback to regular table display
|
|
2589
|
+
_rich_print_gt_table(gt_table)
|
|
2590
|
+
|
|
2591
|
+
|
|
2592
|
+
def _rich_print_scan_table(
|
|
2593
|
+
scan_result: Any,
|
|
2594
|
+
data_source: str,
|
|
2595
|
+
source_type: str,
|
|
2596
|
+
table_type: str,
|
|
2597
|
+
total_rows: int | None = None,
|
|
2598
|
+
total_columns: int | None = None,
|
|
2599
|
+
) -> None:
|
|
2600
|
+
"""
|
|
2601
|
+
Display scan results as a Rich table in the terminal with statistical measures.
|
|
2602
|
+
|
|
2603
|
+
Args:
|
|
2604
|
+
scan_result: The GT object from col_summary_tbl()
|
|
2605
|
+
data_source: Name of the data source being scanned
|
|
2606
|
+
source_type: Type of data source (e.g., "Pointblank dataset: small_table")
|
|
2607
|
+
table_type: Type of table (e.g., "polars.LazyFrame")
|
|
2608
|
+
total_rows: Total number of rows in the dataset
|
|
2609
|
+
total_columns: Total number of columns in the dataset
|
|
2610
|
+
"""
|
|
2611
|
+
try:
|
|
2612
|
+
import re
|
|
2613
|
+
|
|
2614
|
+
import narwhals as nw
|
|
2615
|
+
from rich.box import SIMPLE_HEAD
|
|
2616
|
+
|
|
2617
|
+
# Extract the underlying DataFrame from the GT object
|
|
2618
|
+
# The GT object has a _tbl_data attribute that contains the DataFrame
|
|
2619
|
+
gt_data = scan_result._tbl_data
|
|
2620
|
+
|
|
2621
|
+
# Convert to Narwhals DataFrame for consistent handling
|
|
2622
|
+
nw_data = nw.from_native(gt_data)
|
|
2623
|
+
|
|
2624
|
+
# Convert to dictionary for easier access
|
|
2625
|
+
data_dict = nw_data.to_dict(as_series=False)
|
|
2626
|
+
|
|
2627
|
+
# Create main scan table with missing data table styling
|
|
2628
|
+
# Create a comprehensive title with data source, source type, and table type
|
|
2629
|
+
title_text = f"Column Summary / {source_type} / {table_type}"
|
|
2630
|
+
|
|
2631
|
+
# Add dimensions subtitle in gray if available
|
|
2632
|
+
if total_rows is not None and total_columns is not None:
|
|
2633
|
+
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
2634
|
+
|
|
2635
|
+
# Create the scan table
|
|
2636
|
+
scan_table = Table(
|
|
2637
|
+
title=title_text,
|
|
2638
|
+
show_header=True,
|
|
2639
|
+
header_style="bold magenta",
|
|
2640
|
+
box=SIMPLE_HEAD,
|
|
2641
|
+
title_style="bold cyan",
|
|
2642
|
+
title_justify="left",
|
|
2643
|
+
)
|
|
2644
|
+
|
|
2645
|
+
# Add columns with specific styling and appropriate widths
|
|
2646
|
+
scan_table.add_column("Column", style="cyan", no_wrap=True, width=20)
|
|
2647
|
+
scan_table.add_column("Type", style="yellow", no_wrap=True, width=10)
|
|
2648
|
+
scan_table.add_column(
|
|
2649
|
+
"NA", style="red", width=6, justify="right"
|
|
2650
|
+
) # Adjusted for better formatting
|
|
2651
|
+
scan_table.add_column(
|
|
2652
|
+
"UQ", style="green", width=8, justify="right"
|
|
2653
|
+
) # Adjusted for boolean values
|
|
2654
|
+
|
|
2655
|
+
# Add statistical columns if they exist with appropriate widths
|
|
2656
|
+
stat_columns = []
|
|
2657
|
+
column_mapping = {
|
|
2658
|
+
"mean": ("Mean", "blue", 9),
|
|
2659
|
+
"std": ("SD", "blue", 9),
|
|
2660
|
+
"min": ("Min", "yellow", 9),
|
|
2661
|
+
"median": ("Med", "yellow", 9),
|
|
2662
|
+
"max": ("Max", "yellow", 9),
|
|
2663
|
+
"q_1": ("Q₁", "magenta", 8),
|
|
2664
|
+
"q_3": ("Q₃", "magenta", 9),
|
|
2665
|
+
"iqr": ("IQR", "magenta", 8),
|
|
2666
|
+
}
|
|
2033
2667
|
|
|
2034
2668
|
for col_key, (display_name, color, width) in column_mapping.items():
|
|
2035
2669
|
if col_key in data_dict:
|
|
@@ -2070,7 +2704,7 @@ def _rich_print_scan_table(
|
|
|
2070
2704
|
# Clean up HTML formatting from the raw data
|
|
2071
2705
|
str_val = str(value)
|
|
2072
2706
|
|
|
2073
|
-
# Handle multi-line values with <br> tags FIRST
|
|
2707
|
+
# Handle multi-line values with <br> tags FIRST: take the first line (absolute number)
|
|
2074
2708
|
if "<br>" in str_val:
|
|
2075
2709
|
str_val = str_val.split("<br>")[0].strip()
|
|
2076
2710
|
# For unique values, we want just the integer part
|
|
@@ -2089,14 +2723,14 @@ def _rich_print_scan_table(
|
|
|
2089
2723
|
# Clean up extra whitespace
|
|
2090
2724
|
str_val = re.sub(r"\s+", " ", str_val).strip()
|
|
2091
2725
|
|
|
2092
|
-
# Handle values like "2<.01"
|
|
2726
|
+
# Handle values like "2<.01": extract the first number
|
|
2093
2727
|
if "<" in str_val and not (str_val.startswith("<") and str_val.endswith(">")):
|
|
2094
2728
|
# Extract number before the < symbol
|
|
2095
2729
|
before_lt = str_val.split("<")[0].strip()
|
|
2096
2730
|
if before_lt and before_lt.replace(".", "").replace("-", "").isdigit():
|
|
2097
2731
|
str_val = before_lt
|
|
2098
2732
|
|
|
2099
|
-
# Handle boolean unique values like "T0.62F0.38"
|
|
2733
|
+
# Handle boolean unique values like "T0.62F0.38": extract the more readable format
|
|
2100
2734
|
if re.match(r"^[TF]\d+\.\d+[TF]\d+\.\d+$", str_val):
|
|
2101
2735
|
# Extract T and F values
|
|
2102
2736
|
t_match = re.search(r"T(\d+\.\d+)", str_val)
|
|
@@ -2126,7 +2760,7 @@ def _rich_print_scan_table(
|
|
|
2126
2760
|
# Simple integers under 10000
|
|
2127
2761
|
return str(int(num_val))
|
|
2128
2762
|
elif abs(num_val) >= 10000000 and abs(num_val) < 100000000:
|
|
2129
|
-
# Likely dates in YYYYMMDD format
|
|
2763
|
+
# Likely dates in YYYYMMDD format: format as date-like
|
|
2130
2764
|
int_val = int(num_val)
|
|
2131
2765
|
if 19000101 <= int_val <= 29991231: # Reasonable date range
|
|
2132
2766
|
str_date = str(int_val)
|
|
@@ -2138,29 +2772,29 @@ def _rich_print_scan_table(
|
|
|
2138
2772
|
# Otherwise treat as large number
|
|
2139
2773
|
return f"{num_val / 1000000:.1f}M"
|
|
2140
2774
|
elif abs(num_val) >= 1000000:
|
|
2141
|
-
# Large numbers
|
|
2775
|
+
# Large numbers: use scientific notation or M/k notation
|
|
2142
2776
|
|
|
2143
2777
|
if abs(num_val) >= 1000000000:
|
|
2144
2778
|
return f"{num_val:.1e}"
|
|
2145
2779
|
else:
|
|
2146
2780
|
return f"{num_val / 1000000:.1f}M"
|
|
2147
2781
|
elif abs(num_val) >= 10000:
|
|
2148
|
-
# Numbers >= 10k
|
|
2782
|
+
# Numbers >= 10k: use compact notation
|
|
2149
2783
|
return f"{num_val / 1000:.1f}k"
|
|
2150
2784
|
elif abs(num_val) >= 100:
|
|
2151
|
-
# Numbers 100-9999
|
|
2785
|
+
# Numbers 100-9999: show with minimal decimals
|
|
2152
2786
|
return f"{num_val:.1f}"
|
|
2153
2787
|
elif abs(num_val) >= 10:
|
|
2154
|
-
# Numbers 10-99
|
|
2788
|
+
# Numbers 10-99: show with one decimal
|
|
2155
2789
|
return f"{num_val:.1f}"
|
|
2156
2790
|
elif abs(num_val) >= 1:
|
|
2157
|
-
# Numbers 1-9
|
|
2791
|
+
# Numbers 1-9: show with two decimals
|
|
2158
2792
|
return f"{num_val:.2f}"
|
|
2159
2793
|
elif abs(num_val) >= 0.01:
|
|
2160
|
-
# Small numbers
|
|
2794
|
+
# Small numbers: show with appropriate precision
|
|
2161
2795
|
return f"{num_val:.2f}"
|
|
2162
2796
|
else:
|
|
2163
|
-
# Very small numbers
|
|
2797
|
+
# Very small numbers: use scientific notation
|
|
2164
2798
|
|
|
2165
2799
|
return f"{num_val:.1e}"
|
|
2166
2800
|
|
|
@@ -2168,7 +2802,7 @@ def _rich_print_scan_table(
|
|
|
2168
2802
|
# Not a number, handle as string
|
|
2169
2803
|
pass
|
|
2170
2804
|
|
|
2171
|
-
# Handle date/datetime strings
|
|
2805
|
+
# Handle date/datetime strings: show abbreviated format
|
|
2172
2806
|
if len(str_val) > 10 and any(char in str_val for char in ["-", "/", ":"]):
|
|
2173
2807
|
# Likely a date/datetime, show abbreviated
|
|
2174
2808
|
if len(str_val) > max_width:
|
|
@@ -2244,8 +2878,36 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
|
|
|
2244
2878
|
if df is not None:
|
|
2245
2879
|
from rich.box import SIMPLE_HEAD
|
|
2246
2880
|
|
|
2247
|
-
#
|
|
2248
|
-
|
|
2881
|
+
# Get metadata for enhanced missing table title
|
|
2882
|
+
total_rows = None
|
|
2883
|
+
total_columns = None
|
|
2884
|
+
source_type = "Data source"
|
|
2885
|
+
table_type = "unknown"
|
|
2886
|
+
|
|
2887
|
+
if original_data is not None:
|
|
2888
|
+
try:
|
|
2889
|
+
total_rows = pb.get_row_count(original_data)
|
|
2890
|
+
total_columns = pb.get_column_count(original_data)
|
|
2891
|
+
table_type = _get_tbl_type(original_data)
|
|
2892
|
+
except Exception:
|
|
2893
|
+
pass
|
|
2894
|
+
|
|
2895
|
+
# Create enhanced title matching the scan table format
|
|
2896
|
+
title_text = f"Missing Values / {source_type} / {table_type}"
|
|
2897
|
+
|
|
2898
|
+
# Add dimensions subtitle in gray if available
|
|
2899
|
+
if total_rows is not None and total_columns is not None:
|
|
2900
|
+
title_text += f"\n[dim]{total_rows:,} rows / {total_columns} columns[/dim]"
|
|
2901
|
+
|
|
2902
|
+
# Create the missing values table with enhanced title
|
|
2903
|
+
rich_table = Table(
|
|
2904
|
+
title=title_text,
|
|
2905
|
+
show_header=True,
|
|
2906
|
+
header_style="bold magenta",
|
|
2907
|
+
box=SIMPLE_HEAD,
|
|
2908
|
+
title_style="bold cyan",
|
|
2909
|
+
title_justify="left",
|
|
2910
|
+
)
|
|
2249
2911
|
|
|
2250
2912
|
# Get column names
|
|
2251
2913
|
columns = []
|
|
@@ -2377,12 +3039,12 @@ def _rich_print_missing_table(gt_table: Any, original_data: Any = None) -> None:
|
|
|
2377
3039
|
console.print("[dim]" + " ".join(rule_parts) + "[/dim]")
|
|
2378
3040
|
|
|
2379
3041
|
# Print the Rich table (will handle terminal width automatically)
|
|
3042
|
+
console.print()
|
|
2380
3043
|
console.print(rich_table)
|
|
2381
3044
|
footer_text = (
|
|
2382
|
-
"[dim]Symbols: [green]●[/green] = no missing
|
|
2383
|
-
"[red]●[/red] = completely missing, "
|
|
2384
|
-
"
|
|
2385
|
-
">99% = more than 99% missing[/dim]"
|
|
3045
|
+
"[dim]Symbols: [green]●[/green] = no missing vals in sector, "
|
|
3046
|
+
"[red]●[/red] = all vals completely missing, "
|
|
3047
|
+
"[cyan]x%[/cyan] = percentage missing[/dim]"
|
|
2386
3048
|
)
|
|
2387
3049
|
console.print(footer_text)
|
|
2388
3050
|
|
|
@@ -2521,6 +3183,20 @@ def _display_validation_result(
|
|
|
2521
3183
|
set_val = sets_list[step_index] if step_index < len(sets_list) else None
|
|
2522
3184
|
value = values_list[step_index] if step_index < len(values_list) else None
|
|
2523
3185
|
|
|
3186
|
+
# Check if this is piped data
|
|
3187
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
3188
|
+
|
|
3189
|
+
# Create friendly display name for data source
|
|
3190
|
+
if is_piped_data:
|
|
3191
|
+
if data_source.endswith(".parquet"):
|
|
3192
|
+
display_source = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
3193
|
+
elif data_source.endswith(".csv"):
|
|
3194
|
+
display_source = "Polars expression (serialized to CSV) from `pb pl`"
|
|
3195
|
+
else:
|
|
3196
|
+
display_source = "Polars expression from `pb pl`"
|
|
3197
|
+
else:
|
|
3198
|
+
display_source = data_source
|
|
3199
|
+
|
|
2524
3200
|
# Get validation step info
|
|
2525
3201
|
step_info = None
|
|
2526
3202
|
if hasattr(validation, "validation_info") and len(validation.validation_info) > step_index:
|
|
@@ -2528,7 +3204,7 @@ def _display_validation_result(
|
|
|
2528
3204
|
|
|
2529
3205
|
# Create friendly title for table
|
|
2530
3206
|
if total_checks == 1:
|
|
2531
|
-
# Single check
|
|
3207
|
+
# Single check: use original title format
|
|
2532
3208
|
if check == "rows-distinct":
|
|
2533
3209
|
table_title = "Validation Result: Rows Distinct"
|
|
2534
3210
|
elif check == "col-vals-not-null":
|
|
@@ -2550,7 +3226,7 @@ def _display_validation_result(
|
|
|
2550
3226
|
else:
|
|
2551
3227
|
table_title = f"Validation Result: {check.replace('-', ' ').title()}"
|
|
2552
3228
|
else:
|
|
2553
|
-
# Multiple checks
|
|
3229
|
+
# Multiple checks: add numbering
|
|
2554
3230
|
if check == "rows-distinct":
|
|
2555
3231
|
base_title = "Rows Distinct"
|
|
2556
3232
|
elif check == "col-vals-not-null":
|
|
@@ -2587,7 +3263,7 @@ def _display_validation_result(
|
|
|
2587
3263
|
result_table.add_column("Value", style="white")
|
|
2588
3264
|
|
|
2589
3265
|
# Add basic info
|
|
2590
|
-
result_table.add_row("Data Source",
|
|
3266
|
+
result_table.add_row("Data Source", display_source)
|
|
2591
3267
|
result_table.add_row("Check Type", check)
|
|
2592
3268
|
|
|
2593
3269
|
# Add column info for column-specific checks
|
|
@@ -2617,7 +3293,7 @@ def _display_validation_result(
|
|
|
2617
3293
|
operator = "<"
|
|
2618
3294
|
elif check == "col-vals-le":
|
|
2619
3295
|
operator = "<="
|
|
2620
|
-
result_table.add_row("
|
|
3296
|
+
result_table.add_row("Comparison Value", f"{operator} {value}")
|
|
2621
3297
|
|
|
2622
3298
|
# Get validation details
|
|
2623
3299
|
if step_info:
|
|
@@ -2728,6 +3404,7 @@ def _display_validation_result(
|
|
|
2728
3404
|
Panel(
|
|
2729
3405
|
success_message,
|
|
2730
3406
|
border_style="green",
|
|
3407
|
+
expand=False,
|
|
2731
3408
|
)
|
|
2732
3409
|
)
|
|
2733
3410
|
else:
|
|
@@ -2757,6 +3434,7 @@ def _display_validation_result(
|
|
|
2757
3434
|
Panel(
|
|
2758
3435
|
failure_message,
|
|
2759
3436
|
border_style="red",
|
|
3437
|
+
expand=False,
|
|
2760
3438
|
)
|
|
2761
3439
|
)
|
|
2762
3440
|
|
|
@@ -2837,7 +3515,7 @@ def _show_extract_for_multi_check(
|
|
|
2837
3515
|
console.print()
|
|
2838
3516
|
console.print(extract_message)
|
|
2839
3517
|
|
|
2840
|
-
# Special handling for col-exists check
|
|
3518
|
+
# Special handling for col-exists check: no rows to show when column doesn't exist
|
|
2841
3519
|
if check == "col-exists":
|
|
2842
3520
|
if show_extract:
|
|
2843
3521
|
console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
|
|
@@ -2848,16 +3526,17 @@ def _show_extract_for_multi_check(
|
|
|
2848
3526
|
console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
|
|
2849
3527
|
else:
|
|
2850
3528
|
try:
|
|
2851
|
-
# Get failing rows extract
|
|
3529
|
+
# Get failing rows extract: use step_index + 1 since extracts are 1-indexed
|
|
2852
3530
|
failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
|
|
2853
3531
|
|
|
2854
3532
|
if failing_rows is not None and len(failing_rows) > 0:
|
|
2855
3533
|
if show_extract:
|
|
2856
|
-
#
|
|
2857
|
-
|
|
2858
|
-
|
|
3534
|
+
# Always limit to 10 rows for display, regardless of limit option
|
|
3535
|
+
display_limit = 10
|
|
3536
|
+
if len(failing_rows) > display_limit:
|
|
3537
|
+
display_rows = failing_rows.head(display_limit)
|
|
2859
3538
|
console.print(
|
|
2860
|
-
f"[dim]Showing first {
|
|
3539
|
+
f"[dim]Showing first {display_limit} of {len(failing_rows)} {row_type}[/dim]"
|
|
2861
3540
|
)
|
|
2862
3541
|
else:
|
|
2863
3542
|
display_rows = failing_rows
|
|
@@ -2868,9 +3547,9 @@ def _show_extract_for_multi_check(
|
|
|
2868
3547
|
|
|
2869
3548
|
preview_table = pb.preview(
|
|
2870
3549
|
data=display_rows,
|
|
2871
|
-
n_head=min(
|
|
3550
|
+
n_head=min(display_limit, len(display_rows)),
|
|
2872
3551
|
n_tail=0,
|
|
2873
|
-
limit=
|
|
3552
|
+
limit=display_limit,
|
|
2874
3553
|
show_row_numbers=True,
|
|
2875
3554
|
)
|
|
2876
3555
|
|
|
@@ -2892,7 +3571,7 @@ def _show_extract_for_multi_check(
|
|
|
2892
3571
|
filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
|
|
2893
3572
|
filepath = output_folder / filename
|
|
2894
3573
|
|
|
2895
|
-
#
|
|
3574
|
+
# Use limit option for write_extract
|
|
2896
3575
|
write_rows = failing_rows
|
|
2897
3576
|
if len(failing_rows) > limit:
|
|
2898
3577
|
write_rows = failing_rows.head(limit)
|
|
@@ -2946,6 +3625,18 @@ def _show_extract_and_summary(
|
|
|
2946
3625
|
"""Show extract and summary for a validation step (used for single checks)."""
|
|
2947
3626
|
step_passed = step_info.n_failed == 0 if step_info else True
|
|
2948
3627
|
|
|
3628
|
+
# Get the friendly display name
|
|
3629
|
+
is_piped_data = _is_piped_data_source(data_source)
|
|
3630
|
+
if is_piped_data:
|
|
3631
|
+
if data_source.endswith(".parquet"):
|
|
3632
|
+
display_source = "Polars expression (serialized to Parquet) from `pb pl`"
|
|
3633
|
+
elif data_source.endswith(".csv"):
|
|
3634
|
+
display_source = "Polars expression (serialized to CSV) from `pb pl`"
|
|
3635
|
+
else:
|
|
3636
|
+
display_source = "Polars expression from `pb pl`"
|
|
3637
|
+
else:
|
|
3638
|
+
display_source = data_source
|
|
3639
|
+
|
|
2949
3640
|
# Show extract if requested and validation failed
|
|
2950
3641
|
if (show_extract or write_extract) and not step_passed:
|
|
2951
3642
|
console.print()
|
|
@@ -2997,7 +3688,7 @@ def _show_extract_and_summary(
|
|
|
2997
3688
|
if show_extract:
|
|
2998
3689
|
console.print(extract_message)
|
|
2999
3690
|
|
|
3000
|
-
# Special handling for col-exists check
|
|
3691
|
+
# Special handling for col-exists check: no rows to show when column doesn't exist
|
|
3001
3692
|
if check == "col-exists" and not step_passed:
|
|
3002
3693
|
if show_extract:
|
|
3003
3694
|
console.print(f"[dim]The column '{column}' was not found in the dataset.[/dim]")
|
|
@@ -3008,16 +3699,17 @@ def _show_extract_and_summary(
|
|
|
3008
3699
|
console.print("[yellow]Cannot save failing rows when column doesn't exist[/yellow]")
|
|
3009
3700
|
else:
|
|
3010
3701
|
try:
|
|
3011
|
-
# Get failing rows extract
|
|
3702
|
+
# Get failing rows extract: use step_index + 1 since extracts are 1-indexed
|
|
3012
3703
|
failing_rows = validation.get_data_extracts(i=step_index + 1, frame=True)
|
|
3013
3704
|
|
|
3014
3705
|
if failing_rows is not None and len(failing_rows) > 0:
|
|
3015
3706
|
if show_extract:
|
|
3016
|
-
#
|
|
3017
|
-
|
|
3018
|
-
|
|
3707
|
+
# Always limit to 10 rows for display, regardless of limit option
|
|
3708
|
+
display_limit = 10
|
|
3709
|
+
if len(failing_rows) > display_limit:
|
|
3710
|
+
display_rows = failing_rows.head(display_limit)
|
|
3019
3711
|
console.print(
|
|
3020
|
-
f"[dim]Showing first {
|
|
3712
|
+
f"[dim]Showing first {display_limit} of {len(failing_rows)} {row_type}[/dim]"
|
|
3021
3713
|
)
|
|
3022
3714
|
else:
|
|
3023
3715
|
display_rows = failing_rows
|
|
@@ -3028,9 +3720,9 @@ def _show_extract_and_summary(
|
|
|
3028
3720
|
|
|
3029
3721
|
preview_table = pb.preview(
|
|
3030
3722
|
data=display_rows,
|
|
3031
|
-
n_head=min(
|
|
3723
|
+
n_head=min(display_limit, len(display_rows)),
|
|
3032
3724
|
n_tail=0,
|
|
3033
|
-
limit=
|
|
3725
|
+
limit=display_limit,
|
|
3034
3726
|
show_row_numbers=True,
|
|
3035
3727
|
)
|
|
3036
3728
|
|
|
@@ -3052,7 +3744,7 @@ def _show_extract_and_summary(
|
|
|
3052
3744
|
filename = f"step_{step_index + 1:02d}_{safe_check_type}.csv"
|
|
3053
3745
|
filepath = output_folder / filename
|
|
3054
3746
|
|
|
3055
|
-
#
|
|
3747
|
+
# Use limit option for write_extract
|
|
3056
3748
|
write_rows = failing_rows
|
|
3057
3749
|
if len(failing_rows) > limit:
|
|
3058
3750
|
write_rows = failing_rows.head(limit)
|
|
@@ -3098,84 +3790,84 @@ def _show_extract_and_summary(
|
|
|
3098
3790
|
if step_passed:
|
|
3099
3791
|
if check == "rows-distinct":
|
|
3100
3792
|
success_message = (
|
|
3101
|
-
f"[green]✓ Validation PASSED: No duplicate rows found in {
|
|
3793
|
+
f"[green]✓ Validation PASSED: No duplicate rows found in {display_source}[/green]"
|
|
3102
3794
|
)
|
|
3103
3795
|
elif check == "col-vals-not-null":
|
|
3104
|
-
success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {
|
|
3796
|
+
success_message = f"[green]✓ Validation PASSED: No null values found in column '{column}' in {display_source}[/green]"
|
|
3105
3797
|
elif check == "rows-complete":
|
|
3106
|
-
success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {
|
|
3798
|
+
success_message = f"[green]✓ Validation PASSED: All rows are complete (no missing values) in {display_source}[/green]"
|
|
3107
3799
|
elif check == "col-exists":
|
|
3108
3800
|
success_message = (
|
|
3109
|
-
f"[green]✓ Validation PASSED: Column '{column}' exists in {
|
|
3801
|
+
f"[green]✓ Validation PASSED: Column '{column}' exists in {display_source}[/green]"
|
|
3110
3802
|
)
|
|
3111
3803
|
elif check == "col-vals-in-set":
|
|
3112
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {
|
|
3804
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are in the allowed set in {display_source}[/green]"
|
|
3113
3805
|
elif check == "col-vals-gt":
|
|
3114
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {
|
|
3806
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are > {value} in {display_source}[/green]"
|
|
3115
3807
|
elif check == "col-vals-ge":
|
|
3116
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {
|
|
3808
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are >= {value} in {display_source}[/green]"
|
|
3117
3809
|
elif check == "col-vals-lt":
|
|
3118
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {
|
|
3810
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are < {value} in {display_source}[/green]"
|
|
3119
3811
|
elif check == "col-vals-le":
|
|
3120
|
-
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {
|
|
3812
|
+
success_message = f"[green]✓ Validation PASSED: All values in column '{column}' are <= {value} in {display_source}[/green]"
|
|
3121
3813
|
else:
|
|
3122
3814
|
success_message = (
|
|
3123
|
-
f"[green]✓ Validation PASSED: {check} check passed for {
|
|
3815
|
+
f"[green]✓ Validation PASSED: {check} check passed for {display_source}[/green]"
|
|
3124
3816
|
)
|
|
3125
3817
|
|
|
3126
|
-
console.print(Panel(success_message, border_style="green"))
|
|
3818
|
+
console.print(Panel(success_message, border_style="green", expand=False))
|
|
3127
3819
|
else:
|
|
3128
3820
|
if step_info:
|
|
3129
3821
|
if check == "rows-distinct":
|
|
3130
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {
|
|
3822
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} duplicate rows found in {display_source}[/red]"
|
|
3131
3823
|
elif check == "col-vals-not-null":
|
|
3132
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {
|
|
3824
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} null values found in column '{column}' in {display_source}[/red]"
|
|
3133
3825
|
elif check == "rows-complete":
|
|
3134
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {
|
|
3826
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} incomplete rows found in {display_source}[/red]"
|
|
3135
3827
|
elif check == "col-exists":
|
|
3136
|
-
failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {
|
|
3828
|
+
failure_message = f"[red]✗ Validation FAILED: Column '{column}' does not exist in {display_source}[/red]"
|
|
3137
3829
|
elif check == "col-vals-in-set":
|
|
3138
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {
|
|
3830
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} invalid values found in column '{column}' in {display_source}[/red]"
|
|
3139
3831
|
elif check == "col-vals-gt":
|
|
3140
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {
|
|
3832
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values <= {value} found in column '{column}' in {display_source}[/red]"
|
|
3141
3833
|
elif check == "col-vals-ge":
|
|
3142
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {
|
|
3834
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values < {value} found in column '{column}' in {display_source}[/red]"
|
|
3143
3835
|
elif check == "col-vals-lt":
|
|
3144
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {
|
|
3836
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values >= {value} found in column '{column}' in {display_source}[/red]"
|
|
3145
3837
|
elif check == "col-vals-le":
|
|
3146
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {
|
|
3838
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} values > {value} found in column '{column}' in {display_source}[/red]"
|
|
3147
3839
|
else:
|
|
3148
|
-
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {
|
|
3840
|
+
failure_message = f"[red]✗ Validation FAILED: {step_info.n_failed:,} failing rows found in {display_source}[/red]"
|
|
3149
3841
|
|
|
3150
3842
|
# Add hint about --show-extract if not already used (except for col-exists which has no rows to show)
|
|
3151
3843
|
if not show_extract and check != "col-exists":
|
|
3152
3844
|
failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
|
|
3153
3845
|
|
|
3154
|
-
console.print(Panel(failure_message, border_style="red"))
|
|
3846
|
+
console.print(Panel(failure_message, border_style="red", expand=False))
|
|
3155
3847
|
else:
|
|
3156
3848
|
if check == "rows-distinct":
|
|
3157
3849
|
failure_message = (
|
|
3158
|
-
f"[red]✗ Validation FAILED: Duplicate rows found in {
|
|
3850
|
+
f"[red]✗ Validation FAILED: Duplicate rows found in {display_source}[/red]"
|
|
3159
3851
|
)
|
|
3160
3852
|
elif check == "rows-complete":
|
|
3161
3853
|
failure_message = (
|
|
3162
|
-
f"[red]✗ Validation FAILED: Incomplete rows found in {
|
|
3854
|
+
f"[red]✗ Validation FAILED: Incomplete rows found in {display_source}[/red]"
|
|
3163
3855
|
)
|
|
3164
3856
|
else:
|
|
3165
3857
|
failure_message = (
|
|
3166
|
-
f"[red]✗ Validation FAILED: {check} check failed for {
|
|
3858
|
+
f"[red]✗ Validation FAILED: {check} check failed for {display_source}[/red]"
|
|
3167
3859
|
)
|
|
3168
3860
|
|
|
3169
3861
|
# Add hint about --show-extract if not already used
|
|
3170
3862
|
if not show_extract:
|
|
3171
3863
|
failure_message += "\n[bright_blue]💡 Tip:[/bright_blue] [cyan]Use --show-extract to see the failing rows[/cyan]"
|
|
3172
3864
|
|
|
3173
|
-
console.print(Panel(failure_message, border_style="red"))
|
|
3865
|
+
console.print(Panel(failure_message, border_style="red", expand=False))
|
|
3174
3866
|
|
|
3175
3867
|
|
|
3176
3868
|
@cli.command()
|
|
3177
|
-
@click.argument("output_file", type=click.Path())
|
|
3178
|
-
def make_template(output_file: str):
|
|
3869
|
+
@click.argument("output_file", type=click.Path(), required=False)
|
|
3870
|
+
def make_template(output_file: str | None):
|
|
3179
3871
|
"""
|
|
3180
3872
|
Create a validation script template.
|
|
3181
3873
|
|
|
@@ -3191,11 +3883,19 @@ def make_template(output_file: str):
|
|
|
3191
3883
|
pb make-template my_validation.py
|
|
3192
3884
|
pb make-template validation_template.py
|
|
3193
3885
|
"""
|
|
3886
|
+
# Handle missing output_file with concise help
|
|
3887
|
+
if output_file is None:
|
|
3888
|
+
_show_concise_help("make-template", None)
|
|
3889
|
+
return
|
|
3890
|
+
|
|
3194
3891
|
example_script = '''"""
|
|
3195
3892
|
Example Pointblank validation script.
|
|
3196
3893
|
|
|
3197
3894
|
This script demonstrates how to create validation rules for your data.
|
|
3198
3895
|
Modify the data loading and validation rules below to match your requirements.
|
|
3896
|
+
|
|
3897
|
+
When using 'pb run' with --data option, the CLI will automatically replace
|
|
3898
|
+
the data source in your validation object with the provided data.
|
|
3199
3899
|
"""
|
|
3200
3900
|
|
|
3201
3901
|
import pointblank as pb
|
|
@@ -3239,11 +3939,6 @@ validation = (
|
|
|
3239
3939
|
# Finalize the validation
|
|
3240
3940
|
.interrogate()
|
|
3241
3941
|
)
|
|
3242
|
-
|
|
3243
|
-
# The validation object will be automatically used by the CLI
|
|
3244
|
-
# You can also access results programmatically:
|
|
3245
|
-
# print(f"All passed: {validation.all_passed()}")
|
|
3246
|
-
# print(f"Failed steps: {validation.n_failed()}")
|
|
3247
3942
|
'''
|
|
3248
3943
|
|
|
3249
3944
|
Path(output_file).write_text(example_script)
|
|
@@ -3251,13 +3946,17 @@ validation = (
|
|
|
3251
3946
|
console.print("\nEdit the template to add your data loading and validation rules, then run:")
|
|
3252
3947
|
console.print(f"[cyan]pb run {output_file}[/cyan]")
|
|
3253
3948
|
console.print(
|
|
3254
|
-
f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]#
|
|
3949
|
+
f"[cyan]pb run {output_file} --data your_data.csv[/cyan] [dim]# Replace data source automatically[/dim]"
|
|
3255
3950
|
)
|
|
3256
3951
|
|
|
3257
3952
|
|
|
3258
3953
|
@cli.command()
|
|
3259
|
-
@click.argument("validation_script", type=click.Path(exists=True))
|
|
3260
|
-
@click.option(
|
|
3954
|
+
@click.argument("validation_script", type=click.Path(exists=True), required=False)
|
|
3955
|
+
@click.option(
|
|
3956
|
+
"--data",
|
|
3957
|
+
type=str,
|
|
3958
|
+
help="Data source to replace in validation objects (single validation scripts only)",
|
|
3959
|
+
)
|
|
3261
3960
|
@click.option("--output-html", type=click.Path(), help="Save HTML validation report to file")
|
|
3262
3961
|
@click.option("--output-json", type=click.Path(), help="Save JSON validation summary to file")
|
|
3263
3962
|
@click.option(
|
|
@@ -3269,7 +3968,7 @@ validation = (
|
|
|
3269
3968
|
help="Save failing rows to folders (one CSV per step). Provide base name for folder.",
|
|
3270
3969
|
)
|
|
3271
3970
|
@click.option(
|
|
3272
|
-
"--limit",
|
|
3971
|
+
"--limit", default=500, help="Maximum number of failing rows to save to CSV (default: 500)"
|
|
3273
3972
|
)
|
|
3274
3973
|
@click.option(
|
|
3275
3974
|
"--fail-on",
|
|
@@ -3277,7 +3976,7 @@ validation = (
|
|
|
3277
3976
|
help="Exit with non-zero code when validation reaches this threshold level",
|
|
3278
3977
|
)
|
|
3279
3978
|
def run(
|
|
3280
|
-
validation_script: str,
|
|
3979
|
+
validation_script: str | None,
|
|
3281
3980
|
data: str | None,
|
|
3282
3981
|
output_html: str | None,
|
|
3283
3982
|
output_json: str | None,
|
|
@@ -3292,8 +3991,11 @@ def run(
|
|
|
3292
3991
|
VALIDATION_SCRIPT should be a Python file that defines validation logic.
|
|
3293
3992
|
The script should load its own data and create validation objects.
|
|
3294
3993
|
|
|
3295
|
-
If --data is provided, it will
|
|
3296
|
-
|
|
3994
|
+
If --data is provided, it will automatically replace the data source in your
|
|
3995
|
+
validation objects. This works with scripts containing a single validation.
|
|
3996
|
+
For scripts with multiple validations, use separate script files or remove --data.
|
|
3997
|
+
|
|
3998
|
+
To get started quickly, use 'pb make-template' to create a validation script template.
|
|
3297
3999
|
|
|
3298
4000
|
DATA can be:
|
|
3299
4001
|
|
|
@@ -3307,6 +4009,7 @@ def run(
|
|
|
3307
4009
|
Examples:
|
|
3308
4010
|
|
|
3309
4011
|
\b
|
|
4012
|
+
pb make-template my_validation.py # Create a template first
|
|
3310
4013
|
pb run validation_script.py
|
|
3311
4014
|
pb run validation_script.py --data data.csv
|
|
3312
4015
|
pb run validation_script.py --data small_table --output-html report.html
|
|
@@ -3314,6 +4017,11 @@ def run(
|
|
|
3314
4017
|
pb run validation_script.py --write-extract extracts_folder --fail-on critical
|
|
3315
4018
|
"""
|
|
3316
4019
|
try:
|
|
4020
|
+
# Handle missing validation_script with concise help
|
|
4021
|
+
if validation_script is None:
|
|
4022
|
+
_show_concise_help("run", None)
|
|
4023
|
+
return
|
|
4024
|
+
|
|
3317
4025
|
# Load optional data override if provided
|
|
3318
4026
|
cli_data = None
|
|
3319
4027
|
if data:
|
|
@@ -3369,19 +4077,85 @@ def run(
|
|
|
3369
4077
|
|
|
3370
4078
|
console.print(f"[green]✓[/green] Found {len(validations)} validation object(s)")
|
|
3371
4079
|
|
|
3372
|
-
#
|
|
3373
|
-
|
|
3374
|
-
|
|
3375
|
-
overall_error = False
|
|
3376
|
-
overall_warning = False
|
|
3377
|
-
|
|
3378
|
-
for i, validation in enumerate(validations, 1):
|
|
4080
|
+
# Implement automatic data replacement for Validate objects if --data was provided
|
|
4081
|
+
if cli_data is not None:
|
|
4082
|
+
# Check if we have multiple validations (this is not supported)
|
|
3379
4083
|
if len(validations) > 1:
|
|
3380
|
-
console.print(
|
|
3381
|
-
|
|
3382
|
-
|
|
3383
|
-
|
|
3384
|
-
|
|
4084
|
+
console.print(
|
|
4085
|
+
f"[red]Error: Found {len(validations)} validation objects in the script.[/red]"
|
|
4086
|
+
)
|
|
4087
|
+
console.print(
|
|
4088
|
+
"[yellow]The --data option replaces data in ALL validation objects,[/yellow]"
|
|
4089
|
+
)
|
|
4090
|
+
console.print(
|
|
4091
|
+
"[yellow]which may cause failures if validations expect different schemas.[/yellow]"
|
|
4092
|
+
)
|
|
4093
|
+
console.print("\n[cyan]Options:[/cyan]")
|
|
4094
|
+
console.print(" 1. Split your script into separate files with one validation each")
|
|
4095
|
+
console.print(
|
|
4096
|
+
" 2. Remove the --data option to use each validation's original data"
|
|
4097
|
+
)
|
|
4098
|
+
sys.exit(1)
|
|
4099
|
+
|
|
4100
|
+
console.print(
|
|
4101
|
+
f"[yellow]Replacing data in {len(validations)} validation object(s) with CLI data[/yellow]"
|
|
4102
|
+
)
|
|
4103
|
+
|
|
4104
|
+
for idx, validation in enumerate(validations, 1):
|
|
4105
|
+
# Check if it's a Validate object with data attribute
|
|
4106
|
+
if hasattr(validation, "data") and hasattr(validation, "interrogate"):
|
|
4107
|
+
console.print("[cyan]Updating validation with new data source...[/cyan]")
|
|
4108
|
+
|
|
4109
|
+
# Store the original validation_info as our "plan"
|
|
4110
|
+
original_validation_info = validation.validation_info.copy()
|
|
4111
|
+
|
|
4112
|
+
# Replace the data
|
|
4113
|
+
validation.data = cli_data
|
|
4114
|
+
|
|
4115
|
+
# Re-process the data (same as what happens in __post_init__)
|
|
4116
|
+
from pointblank.validate import _process_data
|
|
4117
|
+
|
|
4118
|
+
validation.data = _process_data(validation.data)
|
|
4119
|
+
|
|
4120
|
+
# Reset validation results but keep the plan
|
|
4121
|
+
validation.validation_info = []
|
|
4122
|
+
|
|
4123
|
+
# Re-add each validation step from the original plan
|
|
4124
|
+
for val_info in original_validation_info:
|
|
4125
|
+
# Create a copy and reset any interrogation results
|
|
4126
|
+
new_val_info = copy.deepcopy(val_info)
|
|
4127
|
+
# Reset interrogation-specific attributes if they exist
|
|
4128
|
+
if hasattr(new_val_info, "n_passed"):
|
|
4129
|
+
new_val_info.n_passed = None
|
|
4130
|
+
if hasattr(new_val_info, "n_failed"):
|
|
4131
|
+
new_val_info.n_failed = None
|
|
4132
|
+
if hasattr(new_val_info, "all_passed"):
|
|
4133
|
+
new_val_info.all_passed = None
|
|
4134
|
+
if hasattr(new_val_info, "warning"):
|
|
4135
|
+
new_val_info.warning = None
|
|
4136
|
+
if hasattr(new_val_info, "error"):
|
|
4137
|
+
new_val_info.error = None
|
|
4138
|
+
if hasattr(new_val_info, "critical"):
|
|
4139
|
+
new_val_info.critical = None
|
|
4140
|
+
validation.validation_info.append(new_val_info)
|
|
4141
|
+
|
|
4142
|
+
# Re-interrogate with the new data
|
|
4143
|
+
console.print("[cyan]Re-interrogating with new data...[/cyan]")
|
|
4144
|
+
validation.interrogate()
|
|
4145
|
+
|
|
4146
|
+
# Process each validation
|
|
4147
|
+
overall_failed = False
|
|
4148
|
+
overall_critical = False
|
|
4149
|
+
overall_error = False
|
|
4150
|
+
overall_warning = False
|
|
4151
|
+
|
|
4152
|
+
for i, validation in enumerate(validations, 1):
|
|
4153
|
+
if len(validations) > 1:
|
|
4154
|
+
console.print(f"\n[bold cyan]Validation {i}:[/bold cyan]")
|
|
4155
|
+
|
|
4156
|
+
# Display summary
|
|
4157
|
+
_display_validation_summary(validation)
|
|
4158
|
+
|
|
3385
4159
|
# Check failure status
|
|
3386
4160
|
validation_failed = False
|
|
3387
4161
|
has_critical = False
|
|
@@ -3432,11 +4206,12 @@ def run(
|
|
|
3432
4206
|
f"\n[cyan]Step {step_num}:[/cyan] {step_info.assertion_type}"
|
|
3433
4207
|
)
|
|
3434
4208
|
|
|
3435
|
-
#
|
|
3436
|
-
|
|
3437
|
-
|
|
4209
|
+
# Always limit to 10 rows for display, regardless of limit option
|
|
4210
|
+
display_limit = 10
|
|
4211
|
+
if len(failing_rows) > display_limit:
|
|
4212
|
+
display_rows = failing_rows.head(display_limit)
|
|
3438
4213
|
console.print(
|
|
3439
|
-
f"[dim]Showing first {
|
|
4214
|
+
f"[dim]Showing first {display_limit} of {len(failing_rows)} failing rows[/dim]"
|
|
3440
4215
|
)
|
|
3441
4216
|
else:
|
|
3442
4217
|
display_rows = failing_rows
|
|
@@ -3447,9 +4222,9 @@ def run(
|
|
|
3447
4222
|
# Create a preview table using pointblank's preview function
|
|
3448
4223
|
preview_table = pb.preview(
|
|
3449
4224
|
data=display_rows,
|
|
3450
|
-
n_head=min(
|
|
4225
|
+
n_head=min(display_limit, len(display_rows)),
|
|
3451
4226
|
n_tail=0,
|
|
3452
|
-
limit=
|
|
4227
|
+
limit=display_limit,
|
|
3453
4228
|
show_row_numbers=True,
|
|
3454
4229
|
)
|
|
3455
4230
|
|
|
@@ -3502,7 +4277,7 @@ def run(
|
|
|
3502
4277
|
filename = f"step_{step_num:02d}_{safe_assertion_type}.csv"
|
|
3503
4278
|
filepath = output_folder / filename
|
|
3504
4279
|
|
|
3505
|
-
#
|
|
4280
|
+
# Use limit for CSV output
|
|
3506
4281
|
save_rows = failing_rows
|
|
3507
4282
|
if hasattr(failing_rows, "head") and len(failing_rows) > limit:
|
|
3508
4283
|
save_rows = failing_rows.head(limit)
|
|
@@ -3521,7 +4296,11 @@ def run(
|
|
|
3521
4296
|
pd_data = pd.DataFrame(save_rows)
|
|
3522
4297
|
pd_data.to_csv(str(filepath), index=False)
|
|
3523
4298
|
|
|
3524
|
-
|
|
4299
|
+
# Record the actual number of rows saved
|
|
4300
|
+
rows_saved = (
|
|
4301
|
+
len(save_rows) if hasattr(save_rows, "__len__") else limit
|
|
4302
|
+
)
|
|
4303
|
+
saved_files.append((filename, rows_saved))
|
|
3525
4304
|
|
|
3526
4305
|
except Exception as e:
|
|
3527
4306
|
console.print(
|
|
@@ -3548,11 +4327,11 @@ def run(
|
|
|
3548
4327
|
if output_html:
|
|
3549
4328
|
try:
|
|
3550
4329
|
if len(validations) == 1:
|
|
3551
|
-
# Single validation
|
|
4330
|
+
# Single validation: save directly
|
|
3552
4331
|
html_content = validations[0]._repr_html_()
|
|
3553
4332
|
Path(output_html).write_text(html_content, encoding="utf-8")
|
|
3554
4333
|
else:
|
|
3555
|
-
# Multiple validations
|
|
4334
|
+
# Multiple validations: combine them
|
|
3556
4335
|
html_parts = []
|
|
3557
4336
|
html_parts.append("<html><body>")
|
|
3558
4337
|
html_parts.append("<h1>Pointblank Validation Report</h1>")
|
|
@@ -3572,11 +4351,11 @@ def run(
|
|
|
3572
4351
|
if output_json:
|
|
3573
4352
|
try:
|
|
3574
4353
|
if len(validations) == 1:
|
|
3575
|
-
# Single validation
|
|
4354
|
+
# Single validation: save directly
|
|
3576
4355
|
json_report = validations[0].get_json_report()
|
|
3577
4356
|
Path(output_json).write_text(json_report, encoding="utf-8")
|
|
3578
4357
|
else:
|
|
3579
|
-
# Multiple validations
|
|
4358
|
+
# Multiple validations: combine them
|
|
3580
4359
|
import json
|
|
3581
4360
|
|
|
3582
4361
|
combined_report = {"validations": []}
|
|
@@ -3642,3 +4421,768 @@ def _format_missing_percentage(value: float) -> str:
|
|
|
3642
4421
|
return ">99%" # More than 99%
|
|
3643
4422
|
else:
|
|
3644
4423
|
return f"{int(round(value))}%" # Round to nearest integer with % sign
|
|
4424
|
+
|
|
4425
|
+
|
|
4426
|
+
@cli.command()
@click.argument("polars_expression", type=str, required=False)
@click.option("--edit", "-e", is_flag=True, help="Open editor for multi-line input")
@click.option("--file", "-f", type=click.Path(exists=True), help="Read query from file")
@click.option(
    "--editor", help="Editor to use for --edit mode (overrides $EDITOR and auto-detection)"
)
@click.option(
    "--output-format",
    "-o",
    type=click.Choice(["preview", "scan", "missing", "info"]),
    default="preview",
    help="Output format for the result",
)
@click.option("--preview-head", default=5, help="Number of head rows for preview")
@click.option("--preview-tail", default=5, help="Number of tail rows for preview")
@click.option("--output-html", type=click.Path(), help="Save HTML output to file")
@click.option(
    "--pipe", is_flag=True, help="Output data in a format suitable for piping to other pb commands"
)
@click.option(
    "--pipe-format",
    type=click.Choice(["parquet", "csv"]),
    default="parquet",
    help="Format for piped output (default: parquet)",
)
def pl(
    polars_expression: str | None,
    edit: bool,
    file: str | None,
    editor: str | None,
    output_format: str,
    preview_head: int,
    preview_tail: int,
    output_html: str | None,
    pipe: bool,
    pipe_format: str,
):
    """
    Execute Polars expressions and display results.

    Execute Polars DataFrame operations from the command line and display
    the results using Pointblank's visualization tools.

    POLARS_EXPRESSION should be a valid Polars expression that returns a DataFrame.
    The 'pl' module is automatically imported and available.

    Examples:

    \b
    # Direct expression
    pb pl "pl.read_csv('data.csv')"
    pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
    pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"

    # Multi-line with editor (supports multiple statements)
    pb pl --edit

    # Multi-statement code example in editor:
    # csv = pl.read_csv('data.csv')
    # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)

    # Multi-line with a specific editor
    pb pl --edit --editor nano
    pb pl --edit --editor code
    pb pl --edit --editor micro

    # From file
    pb pl --file query.py

    # Piping to other pb commands
    pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)" --pipe | pb validate --check rows-distinct
    pb pl --edit --pipe | pb preview --head 10
    pb pl --edit --pipe | pb scan --output-html report.html
    pb pl --edit --pipe | pb missing --output-html missing_report.html

    Use --output-format to change how results are displayed:

    \b
    pb pl "pl.read_csv('data.csv')" --output-format scan
    pb pl "pl.read_csv('data.csv')" --output-format missing
    pb pl "pl.read_csv('data.csv')" --output-format info

    Note: For multi-statement code, assign your final result to a variable like
    'result', 'df', 'data', or ensure it's the last expression.
    """
    # NOTE(review): the query text is executed with exec()/eval() below. This is
    # by design (the CLI runs the user's own code), but the command must never be
    # fed untrusted input.
    try:
        # Check if Polars is available
        if not _is_lib_present("polars"):
            console.print("[red]Error:[/red] Polars is not installed")
            console.print("\nThe 'pb pl' command requires Polars to be installed.")
            console.print("Install it with: [cyan]pip install polars[/cyan]")
            console.print("\nTo check all dependency status, run: [cyan]pb requirements[/cyan]")
            sys.exit(1)

        import polars as pl

        # Determine the source of the query, in priority order:
        # --file, then --edit, then the positional argument, then stdin.
        query_code = None

        if file:
            # Read from file
            query_code = Path(file).read_text()
        elif edit:
            # Determine which editor to use
            chosen_editor = editor or _get_best_editor()

            # When piping, send editor message to stderr
            if pipe:
                print(f"Using editor: {chosen_editor}", file=sys.stderr)
            else:
                console.print(f"[dim]Using editor: {chosen_editor}[/dim]")

            # Interactive editor with custom editor
            if chosen_editor == "code":
                # Special handling for VS Code
                query_code = _edit_with_vscode()
            else:
                # Use click.edit() for terminal editors
                query_code = click.edit(
                    "# Enter your Polars query here\n"
                    "# Example:\n"
                    "# pl.read_csv('data.csv').select(['name', 'age'])\n"
                    "# pl.read_csv('data.csv').filter(pl.col('age') > 25)\n"
                    "# \n"
                    "# The result should be a Polars DataFrame or LazyFrame\n"
                    "\n",
                    editor=chosen_editor,
                )

            if query_code is None:
                if pipe:
                    print("No query entered", file=sys.stderr)
                else:
                    console.print("[yellow]No query entered[/yellow]")
                sys.exit(1)
        elif polars_expression:
            # Direct argument
            query_code = polars_expression
        else:
            # Try to read from stdin (for piping)
            if not sys.stdin.isatty():
                # Data is being piped in
                query_code = sys.stdin.read().strip()
            else:
                # No input provided and stdin is a terminal - show concise help
                _show_concise_help("pl", None)
                return

        if not query_code or not query_code.strip():
            console.print("[red]Error:[/red] Empty query")
            sys.exit(1)

        # Execute the query
        with console.status("[bold green]Executing Polars expression..."):
            # Execution namespace exposing polars under both common names.
            namespace = {
                "pl": pl,
                "polars": pl,
                "__builtins__": __builtins__,
            }

            try:
                # Check if this is a single expression or multiple statements.
                # NOTE(review): this keyword heuristic can misfire (e.g. "if " or
                # " = " occurring inside a string literal forces the exec() path);
                # harmless in practice since exec() + result lookup still works.
                if "\n" in query_code.strip() or any(
                    keyword in query_code
                    for keyword in [
                        " = ",
                        "import",
                        "for ",
                        "if ",
                        "def ",
                        "class ",
                        "with ",
                        "try:",
                    ]
                ):
                    # Multiple statements - use exec()
                    exec(query_code, namespace)

                    # Look for the result in the namespace
                    # Try common variable names first
                    result = None
                    for var_name in ["result", "df", "data", "table", "output"]:
                        if var_name in namespace:
                            result = namespace[var_name]
                            break

                    # If no common names found, look for any DataFrame/LazyFrame
                    if result is None:
                        for key, value in namespace.items():
                            if (
                                hasattr(value, "collect") or hasattr(value, "columns")
                            ) and not key.startswith("_"):
                                result = value
                                break

                    # If still no result, get the last assigned variable (excluding builtins)
                    if result is None:
                        # Get variables that were added to namespace (excluding our imports)
                        user_vars = {
                            k: v
                            for k, v in namespace.items()
                            if k not in ["pl", "polars", "__builtins__"] and not k.startswith("_")
                        }
                        if user_vars:
                            # Get the last variable (this is a heuristic)
                            last_var = list(user_vars.keys())[-1]
                            result = user_vars[last_var]

                    if result is None:
                        # NOTE(review): these plain print() calls emit rich markup
                        # tags ("[red]...", "[dim]...") literally to stderr, since
                        # print() does not render console markup — confirm intent.
                        if pipe:
                            print(
                                "[red]Error:[/red] Could not find result variable", file=sys.stderr
                            )
                            print(
                                "[dim]Assign your final result to a variable like 'result', 'df', or 'data'[/dim]",
                                file=sys.stderr,
                            )
                            print(
                                "[dim]Or ensure your last line returns a DataFrame[/dim]",
                                file=sys.stderr,
                            )
                        else:
                            console.print("[red]Error:[/red] Could not find result variable")
                            console.print(
                                "[dim]Assign your final result to a variable like 'result', 'df', or 'data'[/dim]"
                            )
                            console.print("[dim]Or ensure your last line returns a DataFrame[/dim]")
                        sys.exit(1)

                else:
                    # Single expression - use eval()
                    result = eval(query_code, namespace)

                # Validate result: anything with .collect (LazyFrame) or .columns
                # (DataFrame) is accepted.
                if not hasattr(result, "collect") and not hasattr(result, "columns"):
                    if pipe:
                        print(
                            "[red]Error:[/red] Expression must return a Polars DataFrame or LazyFrame",
                            file=sys.stderr,
                        )
                        print(f"[dim]Got: {type(result)}[/dim]", file=sys.stderr)
                    else:
                        console.print(
                            "[red]Error:[/red] Expression must return a Polars DataFrame or LazyFrame"
                        )
                        console.print(f"[dim]Got: {type(result)}[/dim]")
                    sys.exit(1)

            except Exception as e:
                # When piping, send errors to stderr so they don't interfere with the pipe
                if pipe:
                    print(f"Error executing Polars expression: {e}", file=sys.stderr)
                    print(file=sys.stderr)

                    # Create a panel with the expression(s) for better readability
                    if "\n" in query_code.strip():
                        # Multi-line expression
                        print(f"Expression(s) provided:\n{query_code}", file=sys.stderr)
                    else:
                        # Single line expression
                        print(f"Expression provided: {query_code}", file=sys.stderr)
                else:
                    # Normal error handling when not piping
                    console.print(f"[red]Error executing Polars expression:[/red] {e}")
                    console.print()

                    # Create a panel with the expression(s) for better readability
                    if "\n" in query_code.strip():
                        # Multi-line expression
                        console.print(
                            Panel(
                                query_code,
                                title="Expression(s) provided",
                                border_style="red",
                                expand=False,
                                title_align="left",
                            )
                        )
                    else:
                        # Single line expression
                        console.print(
                            Panel(
                                query_code,
                                title="Expression provided",
                                border_style="red",
                                expand=False,
                                title_align="left",
                            )
                        )

                sys.exit(1)

        # Only print success message when not piping (so it doesn't interfere with pipe output)
        if not pipe:
            console.print("[green]✓[/green] Polars expression executed successfully")

        # Process output
        if pipe:
            # Output data for piping to other commands
            _handle_pl_pipe(result, pipe_format)
        elif output_format == "preview":
            _handle_pl_preview(result, preview_head, preview_tail, output_html)
        elif output_format == "scan":
            _handle_pl_scan(result, query_code, output_html)
        elif output_format == "missing":
            _handle_pl_missing(result, query_code, output_html)
        elif output_format == "info":
            _handle_pl_info(result, query_code, output_html)
        elif output_format == "validate":
            # NOTE(review): unreachable — "validate" is not in the click.Choice
            # list for --output-format above; confirm whether it should be added.
            console.print("[yellow]Validation output format not yet implemented[/yellow]")
            console.print("Use 'pb validate' with a data file for now")

    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        sys.exit(1)
+
|
|
4743
|
+
|
|
4744
|
+
def _handle_pl_preview(result: Any, head: int, tail: int, output_html: str | None) -> None:
    """Render a head/tail preview of a Polars result, to the terminal or as HTML.

    Builds a GT preview table via ``pb.preview``; writes it to *output_html*
    when given, otherwise prints it to the console with table metadata when
    that metadata can be gathered.
    """
    try:
        # Build the preview table with pointblank's existing helper.
        gt_table = pb.preview(
            data=result,
            n_head=head,
            n_tail=tail,
            show_row_numbers=True,
        )

        if output_html:
            # File output path: dump raw HTML and confirm.
            Path(output_html).write_text(gt_table.as_raw_html(), encoding="utf-8")
            console.print(f"[green]✓[/green] HTML saved to: {output_html}")
            return

        # Terminal path: try to enrich the display with table metadata.
        try:
            n_rows = pb.get_row_count(result)
            n_cols = pb.get_column_count(result)
            meta = {
                "total_rows": n_rows,
                "total_columns": n_cols,
                "head_rows": head,
                "tail_rows": tail,
                "is_complete": n_rows <= (head + tail),
                "source_type": "Polars expression",
                "table_type": _get_tbl_type(result),
            }
        except Exception:
            # Metadata gathering failed: fall back to the bare table.
            _rich_print_gt_table(gt_table)
        else:
            _rich_print_gt_table(gt_table, meta)

    except Exception as e:
        console.print(f"[red]Error creating preview:[/red] {e}")
        sys.exit(1)
|
+
|
|
4786
|
+
def _handle_pl_scan(result: Any, expression: str, output_html: str | None) -> None:
    """Render a column-summary scan of a Polars result, to the terminal or HTML.

    Produces the scan with ``pb.col_summary_tbl``; saves it to *output_html*
    when given, otherwise prints a rich summary table annotated with the
    originating expression and basic table metadata.
    """
    try:
        summary = pb.col_summary_tbl(data=result)

        if output_html:
            # File output path: serialize the GT table and confirm.
            Path(output_html).write_text(summary.as_raw_html(), encoding="utf-8")
            console.print(f"[green]✓[/green] Data scan report saved to: {output_html}")
            return

        # Terminal path: gather metadata, then hand off to the rich printer.
        try:
            _rich_print_scan_table(
                summary,
                expression,
                "Polars expression",
                _get_tbl_type(result),
                pb.get_row_count(result),
                pb.get_column_count(result),
            )
        except Exception as e:
            console.print(f"[yellow]Could not display scan summary: {e}[/yellow]")

    except Exception as e:
        console.print(f"[red]Error creating scan:[/red] {e}")
        sys.exit(1)
|
4816
|
+
|
|
4817
|
+
|
|
4818
|
+
def _handle_pl_missing(result: Any, expression: str, output_html: str | None) -> None:
    """Render a missing-values report for a Polars result.

    Saves the ``pb.missing_vals_tbl`` report to *output_html* when given,
    otherwise prints it to the console. *expression* is accepted for
    signature parity with the other ``_handle_pl_*`` helpers.
    """
    try:
        report = pb.missing_vals_tbl(data=result)

        if output_html:
            # File output path: serialize and confirm.
            Path(output_html).write_text(report.as_raw_html(), encoding="utf-8")
            console.print(f"[green]✓[/green] Missing values report saved to: {output_html}")
        else:
            # Terminal path.
            _rich_print_missing_table(report, result)

    except Exception as e:
        console.print(f"[red]Error creating missing values report:[/red] {e}")
        sys.exit(1)
|
+
|
|
4834
|
+
|
|
4835
|
+
def _handle_pl_info(result: Any, expression: str, output_html: str | None) -> None:
    """Handle info output for Polars results.

    Shows the source expression, table type, row/column counts and per-column
    dtypes — either as a minimal HTML page (when *output_html* is given) or as
    a rich console table.
    """
    try:
        # Get basic info
        tbl_type = _get_tbl_type(result)
        row_count = pb.get_row_count(result)
        col_count = pb.get_column_count(result)

        # Get column names and types
        if hasattr(result, "columns"):
            columns = list(result.columns)
        elif hasattr(result, "schema"):
            # NOTE(review): polars exposes Schema.names() as a method, not an
            # attribute — confirm this branch works for the inputs it can see
            # (DataFrame/LazyFrame both have .columns, so it is rarely reached).
            columns = list(result.schema.names)
        else:
            columns = []

        dtypes_dict = _get_column_dtypes(result, columns)

        if output_html:
            # Create a simple HTML info page
            # TODO: Implement an improved version of this in the Python API and then
            # use that here
            # NOTE(review): expression/column text is interpolated unescaped into
            # the HTML; fine for local reports, but worth escaping if ever shared.
            html_content = f"""
            <html><body>
            <h2>Polars Expression Info</h2>
            <p><strong>Expression:</strong> {expression}</p>
            <p><strong>Table Type:</strong> {tbl_type}</p>
            <p><strong>Rows:</strong> {row_count:,}</p>
            <p><strong>Columns:</strong> {col_count:,}</p>
            <h3>Column Details</h3>
            <ul>
            {"".join(f"<li>{col}: {dtypes_dict.get(col, '?')}</li>" for col in columns)}
            </ul>
            </body></html>
            """
            Path(output_html).write_text(html_content, encoding="utf-8")
            console.print(f"[green]✓[/green] HTML info saved to: {output_html}")
        else:
            # Display info table
            from rich.box import SIMPLE_HEAD

            info_table = Table(
                title="Polars Expression Info",
                show_header=True,
                header_style="bold magenta",
                box=SIMPLE_HEAD,
                title_style="bold cyan",
                title_justify="left",
            )
            info_table.add_column("Property", style="cyan", no_wrap=True)
            info_table.add_column("Value", style="green")

            info_table.add_row("Expression", expression)
            # Capitalize "polars" to "Polars" for consistency with pb info command
            display_tbl_type = (
                tbl_type.replace("polars", "Polars") if "polars" in tbl_type.lower() else tbl_type
            )
            info_table.add_row("Table Type", display_tbl_type)
            info_table.add_row("Rows", f"{row_count:,}")
            info_table.add_row("Columns", f"{col_count:,}")

            console.print()
            console.print(info_table)

            # Show column details
            if columns:
                console.print("\n[bold cyan]Column Details:[/bold cyan]")
                for col in columns[:10]:  # Show first 10 columns
                    dtype = dtypes_dict.get(col, "?")
                    console.print(f"  • {col}: [yellow]{dtype}[/yellow]")

                if len(columns) > 10:
                    console.print(f"  ... and {len(columns) - 10} more columns")

    except Exception as e:
        console.print(f"[red]Error creating info:[/red] {e}")
        sys.exit(1)
4912
|
+
|
|
4913
|
+
|
|
4914
|
+
def _handle_pl_pipe(result: Any, pipe_format: str) -> None:
|
|
4915
|
+
"""Handle piped output from Polars results."""
|
|
4916
|
+
try:
|
|
4917
|
+
import sys
|
|
4918
|
+
import tempfile
|
|
4919
|
+
|
|
4920
|
+
# Create a temporary file to store the data
|
|
4921
|
+
with tempfile.NamedTemporaryFile(
|
|
4922
|
+
mode="w", suffix=f".{pipe_format}", prefix="pb_pipe_", delete=False
|
|
4923
|
+
) as temp_file:
|
|
4924
|
+
temp_path = temp_file.name
|
|
4925
|
+
|
|
4926
|
+
# Write the data to the temporary file
|
|
4927
|
+
if pipe_format == "parquet":
|
|
4928
|
+
if hasattr(result, "write_parquet"):
|
|
4929
|
+
# Polars
|
|
4930
|
+
result.write_parquet(temp_path)
|
|
4931
|
+
elif hasattr(result, "to_parquet"):
|
|
4932
|
+
# Pandas
|
|
4933
|
+
result.to_parquet(temp_path)
|
|
4934
|
+
else:
|
|
4935
|
+
# Convert to pandas and write
|
|
4936
|
+
import pandas as pd
|
|
4937
|
+
|
|
4938
|
+
pd_result = pd.DataFrame(result)
|
|
4939
|
+
pd_result.to_parquet(temp_path)
|
|
4940
|
+
else: # CSV
|
|
4941
|
+
if hasattr(result, "write_csv"):
|
|
4942
|
+
# Polars
|
|
4943
|
+
result.write_csv(temp_path)
|
|
4944
|
+
elif hasattr(result, "to_csv"):
|
|
4945
|
+
# Pandas
|
|
4946
|
+
result.to_csv(temp_path, index=False)
|
|
4947
|
+
else:
|
|
4948
|
+
# Convert to pandas and write
|
|
4949
|
+
import pandas as pd
|
|
4950
|
+
|
|
4951
|
+
pd_result = pd.DataFrame(result)
|
|
4952
|
+
pd_result.to_csv(temp_path, index=False)
|
|
4953
|
+
|
|
4954
|
+
# Output the temporary file path to stdout for the next command
|
|
4955
|
+
print(temp_path)
|
|
4956
|
+
|
|
4957
|
+
except Exception as e:
|
|
4958
|
+
print(f"[red]Error creating pipe output:[/red] {e}", file=sys.stderr)
|
|
4959
|
+
sys.exit(1)
|
|
4960
|
+
|
|
4961
|
+
|
|
4962
|
+
def _get_best_editor() -> str:
|
|
4963
|
+
"""Detect the best available editor on the system."""
|
|
4964
|
+
|
|
4965
|
+
# Check environment variable first
|
|
4966
|
+
if "EDITOR" in os.environ:
|
|
4967
|
+
return os.environ["EDITOR"]
|
|
4968
|
+
|
|
4969
|
+
# Check for common editors in order of preference
|
|
4970
|
+
editors = [
|
|
4971
|
+
"code", # VS Code
|
|
4972
|
+
"micro", # Modern terminal editor
|
|
4973
|
+
"nano", # User-friendly terminal editor
|
|
4974
|
+
"vim", # Vim
|
|
4975
|
+
"vi", # Vi (fallback)
|
|
4976
|
+
]
|
|
4977
|
+
|
|
4978
|
+
for editor in editors:
|
|
4979
|
+
if shutil.which(editor):
|
|
4980
|
+
return editor
|
|
4981
|
+
|
|
4982
|
+
# Ultimate fallback
|
|
4983
|
+
return "nano"
|
|
4984
|
+
|
|
4985
|
+
|
|
4986
|
+
def _edit_with_vscode() -> str | None:
    """Edit Polars query using VS Code.

    Writes a commented template to a temp .py file, opens it in VS Code with
    ``--wait`` (blocking until the tab is closed), then returns the edited
    content with comments, blank lines and polars imports stripped. Returns
    None when nothing usable was entered or VS Code could not be used.
    """
    import subprocess
    import tempfile

    # Create a temporary Python file with instructions for the user.
    # delete=False: the file must survive past this with-block so VS Code
    # can open it; cleanup happens in the finally below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", prefix="pb_pl_", delete=False) as f:
        f.write("import polars as pl\n")
        f.write("\n")
        f.write("# Enter your Polars query here\n")
        f.write("# Examples:\n")
        f.write("# \n")
        f.write("# Single expression:\n")
        f.write("# pl.read_csv('data.csv').select(['name', 'age'])\n")
        f.write("# \n")
        f.write("# Multiple statements:\n")
        f.write("# csv = pl.read_csv('data.csv')\n")
        f.write("# result = csv.select(['name', 'age']).filter(pl.col('age') > 25)\n")
        f.write("# \n")
        f.write("# For multi-statement code, assign your final result to a variable\n")
        f.write("# like 'result', 'df', 'data', or just ensure it's the last line\n")
        f.write("# \n")
        f.write("# Save and then close this file in VS Code to execute the query\n")
        f.write("\n")
        temp_file = f.name

    try:
        # Open in VS Code and wait for it to close (5-minute timeout).
        result = subprocess.run(
            ["code", "--wait", temp_file], capture_output=True, text=True, timeout=300
        )

        if result.returncode != 0:
            console.print(f"[yellow]VS Code exited with code {result.returncode}[/yellow]")

        # Read the edited content
        with open(temp_file, "r") as f:
            content = f.read()

        # Remove comments, empty lines, and import statements for cleaner execution
        # (the `pl` name is injected by the caller's exec/eval namespace).
        lines = []
        for line in content.split("\n"):
            stripped = line.strip()
            # NOTE(review): the second startswith check is redundant — the
            # "import polars" prefix already matches "import polars as pl".
            if (
                stripped
                and not stripped.startswith("#")
                and not stripped.startswith("import polars")
                and not stripped.startswith("import polars as pl")
            ):
                lines.append(line)

        return "\n".join(lines) if lines else None

    except subprocess.TimeoutExpired:
        console.print("[red]Timeout:[/red] VS Code took too long to respond")
        return None
    except subprocess.CalledProcessError as e:
        console.print(f"[red]Error:[/red] Could not open VS Code: {e}")
        return None
    except FileNotFoundError:
        console.print("[red]Error:[/red] VS Code not found in PATH")
        return None
    finally:
        # Clean up the temp file regardless of outcome.
        Path(temp_file).unlink(missing_ok=True)
|
5051
|
+
|
|
5052
|
+
|
|
5053
|
+
def _show_concise_help(command_name: str, ctx: click.Context | None) -> None:
    """Print a short usage summary for ``command_name``, then exit with code 1.

    Shown when a command is invoked without its required arguments: instead of
    a bare error, the user sees the most common invocations and key options,
    plus a pointer to the command's full ``--help`` output.

    Parameters
    ----------
    command_name
        Name of the CLI command (e.g. ``"info"``, ``"preview"``). Unknown
        names print nothing but still exit with status 1.
    ctx
        The active Click context, if any. When provided, ``ctx.exit(1)`` is
        used so Click can run its teardown; otherwise ``sys.exit(1)``.
    """
    # Per-command help text. Each entry is the sequence of lines printed for
    # one command; ``None`` stands for a blank separator line. Keeping the
    # text in a table avoids one long if/elif chain of console.print calls.
    help_text: dict[str, list[str | None]] = {
        "info": [
            "[bold cyan]pb info[/bold cyan] - Display information about a data source",
            None,
            "[bold yellow]Usage:[/bold yellow]",
            " pb info data.csv",
            " pb info small_table",
            None,
            "[dim]Shows table type, dimensions, column names, and data types[/dim]",
            None,
            "[dim]Use [bold]pb info --help[/bold] for complete options and examples[/dim]",
        ],
        "preview": [
            "[bold cyan]pb preview[/bold cyan] - Preview a data table showing head and tail rows",
            None,
            "[bold yellow]Usage:[/bold yellow]",
            " pb preview data.csv",
            " pb preview data.parquet --head 10 --tail 5",
            None,
            "[bold yellow]Key Options:[/bold yellow]",
            " --head N Number of rows from the top (default: 5)",
            " --tail N Number of rows from the bottom (default: 5)",
            " --columns LIST Comma-separated list of columns to display",
            " --output-html Save HTML output to file",
            None,
            "[dim]Use [bold]pb preview --help[/bold] for complete options and examples[/dim]",
        ],
        "scan": [
            "[bold cyan]pb scan[/bold cyan] - Generate a comprehensive data profile report",
            None,
            "[bold yellow]Usage:[/bold yellow]",
            " pb scan data.csv",
            " pb scan data.parquet --output-html report.html",
            None,
            "[bold yellow]Key Options:[/bold yellow]",
            " --output-html Save HTML scan report to file",
            " --columns LIST Comma-separated list of columns to scan",
            None,
            "[dim]Use [bold]pb scan --help[/bold] for complete options and examples[/dim]",
        ],
        "missing": [
            "[bold cyan]pb missing[/bold cyan] - Generate a missing values report",
            None,
            "[bold yellow]Usage:[/bold yellow]",
            " pb missing data.csv",
            " pb missing data.parquet --output-html missing_report.html",
            None,
            "[bold yellow]Key Options:[/bold yellow]",
            " --output-html Save HTML output to file",
            None,
            "[dim]Use [bold]pb missing --help[/bold] for complete options and examples[/dim]",
        ],
        "validate": [
            "[bold cyan]pb validate[/bold cyan] - Perform data validation checks",
            None,
            "[bold yellow]Usage:[/bold yellow]",
            " pb validate data.csv",
            " pb validate data.csv --check col-vals-not-null --column email",
            None,
            "[bold yellow]Key Options:[/bold yellow]",
            " --check TYPE Validation check type (default: rows-distinct)",
            " --column COL Column name for column-specific checks",
            " --show-extract Show failing rows if validation fails",
            " --list-checks List all available validation checks",
            None,
            "[dim]Use [bold]pb validate --help[/bold] for complete options and examples[/dim]",
        ],
        "run": [
            "[bold cyan]pb run[/bold cyan] - Run a Pointblank validation script",
            None,
            "[bold yellow]Usage:[/bold yellow]",
            " pb run validation_script.py",
            " pb run validation_script.py --data data.csv",
            None,
            "[bold yellow]Key Options:[/bold yellow]",
            " --data SOURCE Replace data source in validation objects",
            " --output-html Save HTML validation report to file",
            " --show-extract Show failing rows if validation fails",
            " --fail-on LEVEL Exit with error on critical/error/warning/any",
            None,
            "[dim]Use [bold]pb run --help[/bold] for complete options and examples[/dim]",
        ],
        "make-template": [
            "[bold cyan]pb make-template[/bold cyan] - Create a validation script template",
            None,
            "[bold yellow]Usage:[/bold yellow]",
            " pb make-template my_validation.py",
            " pb make-template validation_template.py",
            None,
            "[dim]Creates a sample Python script with validation examples[/dim]",
            "[dim]Edit the template and run with [bold]pb run[/bold][/dim]",
            None,
            "[dim]Use [bold]pb make-template --help[/bold] for complete options and examples[/dim]",
        ],
        "pl": [
            "[bold cyan]pb pl[/bold cyan] - Execute Polars expressions and display results",
            None,
            "[bold yellow]Usage:[/bold yellow]",
            " pb pl \"pl.read_csv('data.csv')\"",
            " pb pl --edit",
            None,
            "[bold yellow]Key Options:[/bold yellow]",
            " --edit Open editor for multi-line input",
            " --file FILE Read query from file",
            " --output-format Output format: preview, scan, missing, info",
            " --pipe Output for piping to other pb commands",
            None,
            "[dim]Use [bold]pb pl --help[/bold] for complete options and examples[/dim]",
        ],
    }

    for line in help_text.get(command_name, []):
        if line is None:
            console.print()
        else:
            console.print(line)

    # Prefer Click's own exit so context teardown still runs; fall back to a
    # plain process exit when called outside of a Click context.
    if ctx is not None:
        ctx.exit(1)
    else:
        sys.exit(1)